LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* Re: [PATCH net] ibmvnic: Fix IRQ mapping disposal in error path
From: David Miller @ 2020-07-29 22:36 UTC (permalink / raw)
  To: tlfalcon; +Cc: drt, netdev, linuxppc-dev
In-Reply-To: <1596058592-12025-1-git-send-email-tlfalcon@linux.ibm.com>

From: Thomas Falcon <tlfalcon@linux.ibm.com>
Date: Wed, 29 Jul 2020 16:36:32 -0500

> RX queue IRQ mappings are disposed in both the TX IRQ and RX IRQ
> error paths. Fix this and dispose of TX IRQ mappings correctly in
> case of an error.
> 
> Signed-off-by: Thomas Falcon <tlfalcon@linux.ibm.com>

Applied with Fixes: tag added and queued up for -stable.

^ permalink raw reply

* Re: [PATCH net] ibmvnic: Fix IRQ mapping disposal in error path
From: Jakub Kicinski @ 2020-07-29 22:28 UTC (permalink / raw)
  To: Thomas Falcon; +Cc: drt, netdev, linuxppc-dev
In-Reply-To: <1596058592-12025-1-git-send-email-tlfalcon@linux.ibm.com>

On Wed, 29 Jul 2020 16:36:32 -0500 Thomas Falcon wrote:
> RX queue IRQ mappings are disposed in both the TX IRQ and RX IRQ
> error paths. Fix this and dispose of TX IRQ mappings correctly in
> case of an error.
> 
> Signed-off-by: Thomas Falcon <tlfalcon@linux.ibm.com>

Thomas, please remember about Fixes tags (for networking patches, 
at least):

Fixes: ea22d51a7831 ("ibmvnic: simplify and improve driver probe function")

^ permalink raw reply

* Re: [PATCH v2] pseries/drmem: don't cache node id in drmem_lmb struct
From: Laurent Dufour @ 2020-07-29 15:32 UTC (permalink / raw)
  To: Scott Cheloha, linuxppc-dev
  Cc: Nathan Lynch, Michal Suchanek, David Hildenbrand, Rick Lindsley
In-Reply-To: <20200728165339.3031126-1-cheloha@linux.ibm.com>

Hi Scott,

Le 28/07/2020 à 18:53, Scott Cheloha a écrit :
> At memory hot-remove time we can retrieve an LMB's nid from its
> corresponding memory_block.  There is no need to store the nid
> in multiple locations.
> 
> Note that lmb_to_memblock() uses find_memory_block() to get the
> corresponding memory_block.  As find_memory_block() runs in sub-linear
> time this approach is negligibly slower than what we do at present.
> 
> In exchange for this lookup at hot-remove time we no longer need to
> call memory_add_physaddr_to_nid() during drmem_init() for each LMB.
> On powerpc, memory_add_physaddr_to_nid() is a linear search, so this
> spares us an O(n^2) initialization during boot.
> 
> On systems with many LMBs that initialization overhead is palpable and
> disruptive.  For example, on a box with 249854 LMBs we're seeing
> drmem_init() take upwards of 30 seconds to complete:
> 
> [   53.721639] drmem: initializing drmem v2
> [   80.604346] watchdog: BUG: soft lockup - CPU#65 stuck for 23s! [swapper/0:1]
> [   80.604377] Modules linked in:
> [   80.604389] CPU: 65 PID: 1 Comm: swapper/0 Not tainted 5.6.0-rc2+ #4
> [   80.604397] NIP:  c0000000000a4980 LR: c0000000000a4940 CTR: 0000000000000000
> [   80.604407] REGS: c0002dbff8493830 TRAP: 0901   Not tainted  (5.6.0-rc2+)
> [   80.604412] MSR:  8000000002009033 <SF,VEC,EE,ME,IR,DR,RI,LE>  CR: 44000248  XER: 0000000d
> [   80.604431] CFAR: c0000000000a4a38 IRQMASK: 0
> [   80.604431] GPR00: c0000000000a4940 c0002dbff8493ac0 c000000001904400 c0003cfffffede30
> [   80.604431] GPR04: 0000000000000000 c000000000f4095a 000000000000002f 0000000010000000
> [   80.604431] GPR08: c0000bf7ecdb7fb8 c0000bf7ecc2d3c8 0000000000000008 c00c0002fdfb2001
> [   80.604431] GPR12: 0000000000000000 c00000001e8ec200
> [   80.604477] NIP [c0000000000a4980] hot_add_scn_to_nid+0xa0/0x3e0
> [   80.604486] LR [c0000000000a4940] hot_add_scn_to_nid+0x60/0x3e0
> [   80.604492] Call Trace:
> [   80.604498] [c0002dbff8493ac0] [c0000000000a4940] hot_add_scn_to_nid+0x60/0x3e0 (unreliable)
> [   80.604509] [c0002dbff8493b20] [c000000000087c10] memory_add_physaddr_to_nid+0x20/0x60
> [   80.604521] [c0002dbff8493b40] [c0000000010d4880] drmem_init+0x25c/0x2f0
> [   80.604530] [c0002dbff8493c10] [c000000000010154] do_one_initcall+0x64/0x2c0
> [   80.604540] [c0002dbff8493ce0] [c0000000010c4aa0] kernel_init_freeable+0x2d8/0x3a0
> [   80.604550] [c0002dbff8493db0] [c000000000010824] kernel_init+0x2c/0x148
> [   80.604560] [c0002dbff8493e20] [c00000000000b648] ret_from_kernel_thread+0x5c/0x74
> [   80.604567] Instruction dump:
> [   80.604574] 392918e8 e9490000 e90a000a e92a0000 80ea000c 1d080018 3908ffe8 7d094214
> [   80.604586] 7fa94040 419d00dc e9490010 714a0088 <2faa0008> 409e00ac e9490000 7fbe5040
> [   89.047390] drmem: 249854 LMB(s)
> 
> With a patched kernel on the same machine we're no longer seeing the
> soft lockup.  drmem_init() now completes in negligible time, even when
> the LMB count is large.
> 
> Signed-off-by: Scott Cheloha <cheloha@linux.ibm.com>
> ---
>   arch/powerpc/include/asm/drmem.h              | 21 -------------------
>   arch/powerpc/mm/drmem.c                       |  6 +-----
>   .../platforms/pseries/hotplug-memory.c        | 19 ++++++++++-------
>   3 files changed, 13 insertions(+), 33 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h
> index 414d209f45bb..34e4e9b257f5 100644
> --- a/arch/powerpc/include/asm/drmem.h
> +++ b/arch/powerpc/include/asm/drmem.h
> @@ -13,9 +13,6 @@ struct drmem_lmb {
>   	u32     drc_index;
>   	u32     aa_index;
>   	u32     flags;
> -#ifdef CONFIG_MEMORY_HOTPLUG
> -	int	nid;
> -#endif
>   };
> 
>   struct drmem_lmb_info {
> @@ -104,22 +101,4 @@ static inline void invalidate_lmb_associativity_index(struct drmem_lmb *lmb)
>   	lmb->aa_index = 0xffffffff;
>   }
> 
> -#ifdef CONFIG_MEMORY_HOTPLUG
> -static inline void lmb_set_nid(struct drmem_lmb *lmb)
> -{
> -	lmb->nid = memory_add_physaddr_to_nid(lmb->base_addr);
> -}
> -static inline void lmb_clear_nid(struct drmem_lmb *lmb)
> -{
> -	lmb->nid = -1;
> -}
> -#else
> -static inline void lmb_set_nid(struct drmem_lmb *lmb)
> -{
> -}
> -static inline void lmb_clear_nid(struct drmem_lmb *lmb)
> -{
> -}
> -#endif
> -
>   #endif /* _ASM_POWERPC_LMB_H */
> diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c
> index 59327cefbc6a..873fcfc7b875 100644
> --- a/arch/powerpc/mm/drmem.c
> +++ b/arch/powerpc/mm/drmem.c
> @@ -362,10 +362,8 @@ static void __init init_drmem_v1_lmbs(const __be32 *prop)
>   	if (!drmem_info->lmbs)
>   		return;
> 
> -	for_each_drmem_lmb(lmb) {
> +	for_each_drmem_lmb(lmb)
>   		read_drconf_v1_cell(lmb, &prop);
> -		lmb_set_nid(lmb);
> -	}
>   }
> 
>   static void __init init_drmem_v2_lmbs(const __be32 *prop)
> @@ -410,8 +408,6 @@ static void __init init_drmem_v2_lmbs(const __be32 *prop)
> 
>   			lmb->aa_index = dr_cell.aa_index;
>   			lmb->flags = dr_cell.flags;
> -
> -			lmb_set_nid(lmb);
>   		}
>   	}
>   }
> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
> index 5ace2f9a277e..7bf66fdcf916 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> @@ -356,25 +356,29 @@ static int dlpar_add_lmb(struct drmem_lmb *);
> 
>   static int dlpar_remove_lmb(struct drmem_lmb *lmb)
>   {
> +	struct memory_block *mem_block;
>   	unsigned long block_sz;
>   	int rc;
> 
>   	if (!lmb_is_removable(lmb))
>   		return -EINVAL;
> 
> +	mem_block = lmb_to_memblock(lmb);
> +	if (mem_block == NULL)
> +		return -EINVAL;
> +

lmb_to_memblock is 'getting' the memblock's device, so you should call 
put_device(mem_block->device) to reverse that operation.

Something like:

--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -367,7 +367,7 @@ static int dlpar_remove_lmb(struct drmem_lmb *lmb)

  	rc = dlpar_offline_lmb(lmb);
  	if (rc)
-		return rc;
+		goto out;

  	block_sz = pseries_memory_block_size();

@@ -379,7 +379,9 @@ static int dlpar_remove_lmb(struct drmem_lmb *lmb)
  	invalidate_lmb_associativity_index(lmb);
  	lmb->flags &= ~DRCONF_MEM_ASSIGNED;

-	return 0;
+out:
+	put_device(&mem_block->dev);
+	return rc;
  }

  static int dlpar_memory_remove_by_count(u32 lmbs_to_remove)



>   	rc = dlpar_offline_lmb(lmb);
>   	if (rc)
>   		return rc;
> 
>   	block_sz = pseries_memory_block_size();
> 
> -	__remove_memory(lmb->nid, lmb->base_addr, block_sz);
> +	__remove_memory(mem_block->nid, lmb->base_addr, block_sz);
> 
>   	/* Update memory regions for memory remove */
>   	memblock_remove(lmb->base_addr, block_sz);
> 
>   	invalidate_lmb_associativity_index(lmb);
> -	lmb_clear_nid(lmb);
>   	lmb->flags &= ~DRCONF_MEM_ASSIGNED;
> 
>   	return 0;
> @@ -631,7 +635,7 @@ static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index)
>   static int dlpar_add_lmb(struct drmem_lmb *lmb)
>   {
>   	unsigned long block_sz;
> -	int rc;
> +	int nid, rc;
> 
>   	if (lmb->flags & DRCONF_MEM_ASSIGNED)
>   		return -EINVAL;
> @@ -642,11 +646,13 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb)
>   		return rc;
>   	}
> 
> -	lmb_set_nid(lmb);
>   	block_sz = memory_block_size_bytes();
> 
> +	/* Find the node id for this address. */
> +	nid = memory_add_physaddr_to_nid(lmb->base_addr);

This function is calling hot_add_drconf_scn_to_nid() which is walking all the 
LMBs linearly, so we may spend a lot of time here.

I think it be nice to take some breath in the caller's loop in 
dlpar_memory_add_by_count() to avoid soft lockup to be raised there when adding 
a large number of LMBs, isn't it?

Nathan sent a patch to add such a breath in for_each_drmem_lmb_in_range() may be 
it would be nice to add one in dlpar_memory_add_by_count() through that patch.

> +
>   	/* Add the memory */
> -	rc = __add_memory(lmb->nid, lmb->base_addr, block_sz);
> +	rc = __add_memory(nid, lmb->base_addr, block_sz);
>   	if (rc) {
>   		invalidate_lmb_associativity_index(lmb);
>   		return rc;
> @@ -654,9 +660,8 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb)
> 
>   	rc = dlpar_online_lmb(lmb);
>   	if (rc) {
> -		__remove_memory(lmb->nid, lmb->base_addr, block_sz);
> +		__remove_memory(nid, lmb->base_addr, block_sz);
>   		invalidate_lmb_associativity_index(lmb);
> -		lmb_clear_nid(lmb);
>   	} else {
>   		lmb->flags |= DRCONF_MEM_ASSIGNED;
>   	}
> 


^ permalink raw reply

* linux-next: Fixes tag needs some work in the powerpc tree
From: Stephen Rothwell @ 2020-07-29 21:55 UTC (permalink / raw)
  To: Michael Ellerman, PowerPC
  Cc: Athira Rajeev, Linux Next Mailing List, Linux Kernel Mailing List

[-- Attachment #1: Type: text/plain, Size: 390 bytes --]

Hi all,

In commit

  443359aebce0 ("powerpc/perf: Fix MMCRA_BHRB_DISABLE define for binutils < 2.28")

Fixes tag

  Fixes: 9908c826d5ed ("Add Power10 PMU feature to DT CPU features")

has these problem(s):

  - Subject does not match target commit subject
    Just use
	git log -1 --format='Fixes: %h ("%s")'

Just a hint for the future.

-- 
Cheers,
Stephen Rothwell

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

* [PATCH 2/2] spi: mpc512x-psc: Convert to use GPIO descriptors
From: Linus Walleij @ 2020-07-29 21:48 UTC (permalink / raw)
  To: Mark Brown, linux-spi
  Cc: Linus Walleij, Anatolij Gustschin, linuxppc-dev,
	Uwe Kleine-König
In-Reply-To: <20200729214817.478834-1-linus.walleij@linaro.org>

This driver is already relying on the core to provide
valid GPIO numbers in spi->cs_gpio through
of_spi_get_gpio_numbers(), so we can switch to letting
the core use GPIO descriptors instead.

Make sure that the .set_cs() is always called, as some controller
set-up is also needed.

Cc: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Cc: Anatolij Gustschin <agust@denx.de>
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/spi/spi-mpc512x-psc.c | 33 +++++++--------------------------
 1 file changed, 7 insertions(+), 26 deletions(-)

diff --git a/drivers/spi/spi-mpc512x-psc.c b/drivers/spi/spi-mpc512x-psc.c
index 35313a77f977..dd8bba408301 100644
--- a/drivers/spi/spi-mpc512x-psc.c
+++ b/drivers/spi/spi-mpc512x-psc.c
@@ -23,7 +23,6 @@
 #include <linux/clk.h>
 #include <linux/spi/spi.h>
 #include <linux/fsl_devices.h>
-#include <linux/gpio.h>
 #include <asm/mpc52xx_psc.h>
 
 enum {
@@ -99,7 +98,7 @@ static void mpc512x_psc_spi_set_cs(struct spi_device *spi, bool enable)
 	u16 bclkdiv;
 
 	if (!enable) {
-		if (mps->cs_control && gpio_is_valid(spi->cs_gpio))
+		if (mps->cs_control && spi->cs_gpiod)
 			mps->cs_control(spi, (spi->mode & SPI_CS_HIGH) ? 0 : 1);
 		return;
 	}
@@ -134,7 +133,7 @@ static void mpc512x_psc_spi_set_cs(struct spi_device *spi, bool enable)
 	out_be32(psc_addr(mps, ccr), ccr);
 	mps->bits_per_word = cs->bits_per_word;
 
-	if (mps->cs_control && gpio_is_valid(spi->cs_gpio))
+	if (mps->cs_control && spi->cs_gpiod)
 		mps->cs_control(spi, (spi->mode & SPI_CS_HIGH) ? 1 : 0);
 }
 
@@ -358,18 +357,6 @@ static int mpc512x_psc_spi_setup(struct spi_device *spi)
 		if (!cs)
 			return -ENOMEM;
 
-		if (gpio_is_valid(spi->cs_gpio)) {
-			ret = gpio_request(spi->cs_gpio, dev_name(&spi->dev));
-			if (ret) {
-				dev_err(&spi->dev, "can't get CS gpio: %d\n",
-					ret);
-				kfree(cs);
-				return ret;
-			}
-			gpio_direction_output(spi->cs_gpio,
-					spi->mode & SPI_CS_HIGH ? 0 : 1);
-		}
-
 		spi->controller_state = cs;
 	}
 
@@ -381,8 +368,6 @@ static int mpc512x_psc_spi_setup(struct spi_device *spi)
 
 static void mpc512x_psc_spi_cleanup(struct spi_device *spi)
 {
-	if (gpio_is_valid(spi->cs_gpio))
-		gpio_free(spi->cs_gpio);
 	kfree(spi->controller_state);
 }
 
@@ -461,11 +446,6 @@ static irqreturn_t mpc512x_psc_spi_isr(int irq, void *dev_id)
 	return IRQ_NONE;
 }
 
-static void mpc512x_spi_cs_control(struct spi_device *spi, bool onoff)
-{
-	gpio_set_value(spi->cs_gpio, onoff);
-}
-
 static int mpc512x_psc_spi_do_probe(struct device *dev, u32 regaddr,
 					      u32 size, unsigned int irq)
 {
@@ -485,10 +465,8 @@ static int mpc512x_psc_spi_do_probe(struct device *dev, u32 regaddr,
 	mps->type = (int)of_device_get_match_data(dev);
 	mps->irq = irq;
 
-	if (pdata == NULL) {
-		mps->cs_control = mpc512x_spi_cs_control;
-	} else {
-		mps->cs_control = pdata->cs_control;
+	if (pdata) {
+		mps->cs_control = pdata->cs_control;x
 		master->bus_num = pdata->bus_num;
 		master->num_chipselect = pdata->max_chipselect;
 	}
@@ -499,6 +477,9 @@ static int mpc512x_psc_spi_do_probe(struct device *dev, u32 regaddr,
 	master->transfer_one_message = mpc512x_psc_spi_msg_xfer;
 	master->unprepare_transfer_hardware = mpc512x_psc_spi_unprep_xfer_hw;
 	master->set_cs = mpc512x_psc_spi_set_cs;
+	/* This makes sure our custom .set_cs() is always called */
+	master->flags = SPI_MASTER_GPIO_SS;
+	master->use_gpio_descriptors = true;
 	master->cleanup = mpc512x_psc_spi_cleanup;
 	master->dev.of_node = dev->of_node;
 
-- 
2.26.2


^ permalink raw reply related

* [PATCH 1/2] spi: mpc512x-psc: Use the framework .set_cs()
From: Linus Walleij @ 2020-07-29 21:48 UTC (permalink / raw)
  To: Mark Brown, linux-spi
  Cc: Linus Walleij, Anatolij Gustschin, linuxppc-dev,
	Uwe Kleine-König

The mpc512x-psc is rolling its own chip select control code,
but the SPI master framework can handle this. It was also
evaluating the CS status for each transfer but the CS change
should be per-message not per-transfer.

Switch to use the core .set_cs() to control the chip select.

Cc: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Cc: Anatolij Gustschin <agust@denx.de>
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/spi/spi-mpc512x-psc.c | 30 ++++++++----------------------
 1 file changed, 8 insertions(+), 22 deletions(-)

diff --git a/drivers/spi/spi-mpc512x-psc.c b/drivers/spi/spi-mpc512x-psc.c
index ea1b07953d38..35313a77f977 100644
--- a/drivers/spi/spi-mpc512x-psc.c
+++ b/drivers/spi/spi-mpc512x-psc.c
@@ -89,7 +89,7 @@ static int mpc512x_psc_spi_transfer_setup(struct spi_device *spi,
 	return 0;
 }
 
-static void mpc512x_psc_spi_activate_cs(struct spi_device *spi)
+static void mpc512x_psc_spi_set_cs(struct spi_device *spi, bool enable)
 {
 	struct mpc512x_psc_spi_cs *cs = spi->controller_state;
 	struct mpc512x_psc_spi *mps = spi_master_get_devdata(spi->master);
@@ -98,6 +98,12 @@ static void mpc512x_psc_spi_activate_cs(struct spi_device *spi)
 	int speed;
 	u16 bclkdiv;
 
+	if (!enable) {
+		if (mps->cs_control && gpio_is_valid(spi->cs_gpio))
+			mps->cs_control(spi, (spi->mode & SPI_CS_HIGH) ? 0 : 1);
+		return;
+	}
+
 	sicr = in_be32(psc_addr(mps, sicr));
 
 	/* Set clock phase and polarity */
@@ -132,15 +138,6 @@ static void mpc512x_psc_spi_activate_cs(struct spi_device *spi)
 		mps->cs_control(spi, (spi->mode & SPI_CS_HIGH) ? 1 : 0);
 }
 
-static void mpc512x_psc_spi_deactivate_cs(struct spi_device *spi)
-{
-	struct mpc512x_psc_spi *mps = spi_master_get_devdata(spi->master);
-
-	if (mps->cs_control && gpio_is_valid(spi->cs_gpio))
-		mps->cs_control(spi, (spi->mode & SPI_CS_HIGH) ? 0 : 1);
-
-}
-
 /* extract and scale size field in txsz or rxsz */
 #define MPC512x_PSC_FIFO_SZ(sz) ((sz & 0x7ff) << 2);
 
@@ -290,40 +287,28 @@ static int mpc512x_psc_spi_msg_xfer(struct spi_master *master,
 				    struct spi_message *m)
 {
 	struct spi_device *spi;
-	unsigned cs_change;
 	int status;
 	struct spi_transfer *t;
 
 	spi = m->spi;
-	cs_change = 1;
 	status = 0;
 	list_for_each_entry(t, &m->transfers, transfer_list) {
 		status = mpc512x_psc_spi_transfer_setup(spi, t);
 		if (status < 0)
 			break;
 
-		if (cs_change)
-			mpc512x_psc_spi_activate_cs(spi);
-		cs_change = t->cs_change;
-
 		status = mpc512x_psc_spi_transfer_rxtx(spi, t);
 		if (status)
 			break;
 		m->actual_length += t->len;
 
 		spi_transfer_delay_exec(t);
-
-		if (cs_change)
-			mpc512x_psc_spi_deactivate_cs(spi);
 	}
 
 	m->status = status;
 	if (m->complete)
 		m->complete(m->context);
 
-	if (status || !cs_change)
-		mpc512x_psc_spi_deactivate_cs(spi);
-
 	mpc512x_psc_spi_transfer_setup(spi, NULL);
 
 	spi_finalize_current_message(master);
@@ -513,6 +498,7 @@ static int mpc512x_psc_spi_do_probe(struct device *dev, u32 regaddr,
 	master->prepare_transfer_hardware = mpc512x_psc_spi_prep_xfer_hw;
 	master->transfer_one_message = mpc512x_psc_spi_msg_xfer;
 	master->unprepare_transfer_hardware = mpc512x_psc_spi_unprep_xfer_hw;
+	master->set_cs = mpc512x_psc_spi_set_cs;
 	master->cleanup = mpc512x_psc_spi_cleanup;
 	master->dev.of_node = dev->of_node;
 
-- 
2.26.2


^ permalink raw reply related

* [PATCH net] ibmvnic: Fix IRQ mapping disposal in error path
From: Thomas Falcon @ 2020-07-29 21:36 UTC (permalink / raw)
  To: netdev; +Cc: drt, Thomas Falcon, linuxppc-dev

RX queue IRQ mappings are disposed in both the TX IRQ and RX IRQ
error paths. Fix this and dispose of TX IRQ mappings correctly in
case of an error.

Signed-off-by: Thomas Falcon <tlfalcon@linux.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 0fd7eae..5afb3c9 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -3206,7 +3206,7 @@ static int init_sub_crq_irqs(struct ibmvnic_adapter *adapter)
 req_tx_irq_failed:
 	for (j = 0; j < i; j++) {
 		free_irq(adapter->tx_scrq[j]->irq, adapter->tx_scrq[j]);
-		irq_dispose_mapping(adapter->rx_scrq[j]->irq);
+		irq_dispose_mapping(adapter->tx_scrq[j]->irq);
 	}
 	release_sub_crqs(adapter, 1);
 	return rc;
-- 
1.8.3.1


^ permalink raw reply related

* Re: [PATCH] powerpc: fix function annotations to avoid section mismatch warnings with gcc-10
From: Vladis Dronov @ 2020-07-29 19:44 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Aneesh Kumar K . V, linuxppc-dev, linux-kernel, Paul Mackerras
In-Reply-To: <20200729144949.GF17447@gate.crashing.org>

Hello,

----- Original Message -----
> From: "Segher Boessenkool" <segher@kernel.crashing.org>
> To: "Vladis Dronov" <vdronov@redhat.com>
> Cc: linuxppc-dev@lists.ozlabs.org, "Aneesh Kumar K . V" <aneesh.kumar@linux.ibm.com>, linux-kernel@vger.kernel.org,
> "Paul Mackerras" <paulus@samba.org>
> Sent: Wednesday, July 29, 2020 4:49:49 PM
> Subject: Re: [PATCH] powerpc: fix function annotations to avoid section mismatch warnings with gcc-10
> 
> On Wed, Jul 29, 2020 at 03:37:41PM +0200, Vladis Dronov wrote:
> > Certain warnings are emitted for powerpc code when building with a gcc-10
> > toolset:
> > 
> >     WARNING: modpost: vmlinux.o(.text.unlikely+0x377c): Section mismatch in
> >     reference from the function remove_pmd_table() to the function
> >     .meminit.text:split_kernel_mapping()
> >     The function remove_pmd_table() references
> >     the function __meminit split_kernel_mapping().
> >     This is often because remove_pmd_table lacks a __meminit
> >     annotation or the annotation of split_kernel_mapping is wrong.
> > 
> > Add the appropriate __init and __meminit annotations to make modpost not
> > complain. In all the cases there are just a single callsite from another
> > __init or __meminit function:
> > 
> > __meminit remove_pagetable() -> remove_pud_table() -> remove_pmd_table()
> > __init prom_init() -> setup_secure_guest()
> > __init xive_spapr_init() -> xive_spapr_disabled()
> 
> So what changed?  These functions were inlined with older compilers, but
> not anymore?

Yes, exactly. Gcc-10 does not inline them anymore. If this is because of my
build system, this can happen to others also.

The same thing was fixed by Linus in e99332e7b4cd ("gcc-10: mark more functions
__init to avoid section mismatch warnings").

> 
> Segher

Best regards,
Vladis Dronov | Red Hat, Inc. | The Core Kernel | Senior Software Engineer


^ permalink raw reply

* Re: [PATCH v2] selftests: powerpc: Fix online CPU selection
From: Srikar Dronamraju @ 2020-07-29 16:03 UTC (permalink / raw)
  To: Sandipan Das; +Cc: kamalesh, shiganta, nasastry, harish, linuxppc-dev
In-Reply-To: <20200609073733.997643-1-sandipan@linux.ibm.com>

* Sandipan Das <sandipan@linux.ibm.com> [2020-06-09 13:07:33]:

> The size of the CPU affinity mask must be large enough for
> systems with a very large number of CPUs. Otherwise, tests
> which try to determine the first online CPU by calling
> sched_getaffinity() will fail. This makes sure that the size
> of the allocated affinity mask is dependent on the number of
> CPUs as reported by get_nprocs().
> 
> Fixes: 3752e453f6ba ("selftests/powerpc: Add tests of PMU EBBs")
> Reported-by: Shirisha Ganta <shiganta@in.ibm.com>
> Signed-off-by: Sandipan Das <sandipan@linux.ibm.com>
> Reviewed-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
> ---
> Previous versions can be found at:
> v1: https://lore.kernel.org/linuxppc-dev/20200608144212.985144-1-sandipan@linux.ibm.com/
> 
> @@ -88,28 +89,40 @@ void *get_auxv_entry(int type)
> 
>  int pick_online_cpu(void)
>  {
> -	cpu_set_t mask;
> -	int cpu;
> +	int ncpus, cpu = -1;
> +	cpu_set_t *mask;
> +	size_t size;
> +
> +	ncpus = get_nprocs();

Please use get_nprocs_conf or sysconf(_SC_NPROCESSORS_CONF). The manpage
seems to suggest the latter. Not sure how accurate the manpage is.

get_nprocs is returning online cpus and when smt is off, the cpu numbers
would be sparse and hence the result from get_nprocs wouldn't be ideal for
allocating cpumask. However get_nprocs_conf would return the max configured
cpus and would be able to handle it. 

I think this was the same situation hit by Michael Ellerman.

> +	size = CPU_ALLOC_SIZE(ncpus);
> +	mask = CPU_ALLOC(ncpus);
> +	if (!mask) {
> +		perror("malloc");
> +		return -1;
> +	}
> 

-- 
Thanks and Regards
Srikar Dronamraju

^ permalink raw reply

* Re: [PATCH] powerpc/fadump: Fix build error with CONFIG_PRESERVE_FA_DUMP=y
From: Gustavo Romero @ 2020-07-29 15:03 UTC (permalink / raw)
  To: Michael Ellerman, linuxppc-dev
In-Reply-To: <20200727070341.595634-1-mpe@ellerman.id.au>

On 7/27/20 4:03 AM, Michael Ellerman wrote:
> skiroot_defconfig fails:
> 
> arch/powerpc/kernel/fadump.c:48:17: error: ‘cpus_in_fadump’ defined but not used
>     48 | static atomic_t cpus_in_fadump;
> 
> Fix it by moving the definition into the #ifdef where it's used.
> 
> Fixes: ba608c4fa12c ("powerpc/fadump: fix race between pstore write and fadump crash trigger")
> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
> ---
>   arch/powerpc/kernel/fadump.c | 4 +++-
>   1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
> index 1858896d6809..10ebb4bf71ad 100644
> --- a/arch/powerpc/kernel/fadump.c
> +++ b/arch/powerpc/kernel/fadump.c
> @@ -45,10 +45,12 @@ static struct fw_dump fw_dump;
>   static void __init fadump_reserve_crash_area(u64 base);
> 
>   struct kobject *fadump_kobj;
> -static atomic_t cpus_in_fadump;
> 
>   #ifndef CONFIG_PRESERVE_FA_DUMP
> +
> +static atomic_t cpus_in_fadump;
>   static DEFINE_MUTEX(fadump_mutex);
> +
>   struct fadump_mrange_info crash_mrange_info = { "crash", NULL, 0, 0, 0, false };
> 
>   #define RESERVED_RNGS_SZ	16384 /* 16K - 128 entries */
> 

Tested-by: Gustavo Romero <gromero@linux.ibm.com>


Thanks,
Gustavo

^ permalink raw reply

* Re: [PATCH] powerpc: fix function annotations to avoid section mismatch warnings with gcc-10
From: Segher Boessenkool @ 2020-07-29 14:49 UTC (permalink / raw)
  To: Vladis Dronov
  Cc: Aneesh Kumar K . V, linuxppc-dev, linux-kernel, Paul Mackerras
In-Reply-To: <20200729133741.62789-1-vdronov@redhat.com>

On Wed, Jul 29, 2020 at 03:37:41PM +0200, Vladis Dronov wrote:
> Certain warnings are emitted for powerpc code when building with a gcc-10
> toolset:
> 
>     WARNING: modpost: vmlinux.o(.text.unlikely+0x377c): Section mismatch in
>     reference from the function remove_pmd_table() to the function
>     .meminit.text:split_kernel_mapping()
>     The function remove_pmd_table() references
>     the function __meminit split_kernel_mapping().
>     This is often because remove_pmd_table lacks a __meminit
>     annotation or the annotation of split_kernel_mapping is wrong.
> 
> Add the appropriate __init and __meminit annotations to make modpost not
> complain. In all the cases there are just a single callsite from another
> __init or __meminit function:
> 
> __meminit remove_pagetable() -> remove_pud_table() -> remove_pmd_table()
> __init prom_init() -> setup_secure_guest()
> __init xive_spapr_init() -> xive_spapr_disabled()

So what changed?  These functions were inlined with older compilers, but
not anymore?


Segher

^ permalink raw reply

* [PATCH] powerpc: fix function annotations to avoid section mismatch warnings with gcc-10
From: Vladis Dronov @ 2020-07-29 13:37 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Aneesh Kumar K . V, linux-kernel, Paul Mackerras, Vladis Dronov

Certain warnings are emitted for powerpc code when building with a gcc-10
toolset:

    WARNING: modpost: vmlinux.o(.text.unlikely+0x377c): Section mismatch in
    reference from the function remove_pmd_table() to the function
    .meminit.text:split_kernel_mapping()
    The function remove_pmd_table() references
    the function __meminit split_kernel_mapping().
    This is often because remove_pmd_table lacks a __meminit
    annotation or the annotation of split_kernel_mapping is wrong.

Add the appropriate __init and __meminit annotations to make modpost not
complain. In all the cases there are just a single callsite from another
__init or __meminit function:

__meminit remove_pagetable() -> remove_pud_table() -> remove_pmd_table()
__init prom_init() -> setup_secure_guest()
__init xive_spapr_init() -> xive_spapr_disabled()

Signed-off-by: Vladis Dronov <vdronov@redhat.com>
---
 arch/powerpc/kernel/prom_init.c          | 4 ++--
 arch/powerpc/mm/book3s64/radix_pgtable.c | 4 ++--
 arch/powerpc/sysdev/xive/spapr.c         | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 90c604d00b7d..f6ca7f450361 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -3262,7 +3262,7 @@ static int enter_secure_mode(unsigned long kbase, unsigned long fdt)
 /*
  * Call the Ultravisor to transfer us to secure memory if we have an ESM blob.
  */
-static void setup_secure_guest(unsigned long kbase, unsigned long fdt)
+static void __init setup_secure_guest(unsigned long kbase, unsigned long fdt)
 {
 	int ret;
 
@@ -3292,7 +3292,7 @@ static void setup_secure_guest(unsigned long kbase, unsigned long fdt)
 	}
 }
 #else
-static void setup_secure_guest(unsigned long kbase, unsigned long fdt)
+static void __init setup_secure_guest(unsigned long kbase, unsigned long fdt)
 {
 }
 #endif /* CONFIG_PPC_SVM */
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index bb00e0cba119..b868c07110e3 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -800,7 +800,7 @@ static void __meminit split_kernel_mapping(unsigned long addr, unsigned long end
 	pte_clear(&init_mm, addr, pte);
 }
 
-static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
+static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
 			     unsigned long end)
 {
 	unsigned long next;
@@ -825,7 +825,7 @@ static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
 	}
 }
 
-static void remove_pud_table(pud_t *pud_start, unsigned long addr,
+static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr,
 			     unsigned long end)
 {
 	unsigned long next;
diff --git a/arch/powerpc/sysdev/xive/spapr.c b/arch/powerpc/sysdev/xive/spapr.c
index f0551a2be9df..1e3674d7ea7b 100644
--- a/arch/powerpc/sysdev/xive/spapr.c
+++ b/arch/powerpc/sysdev/xive/spapr.c
@@ -768,7 +768,7 @@ static const u8 *get_vec5_feature(unsigned int index)
 	return vec5 + index;
 }
 
-static bool xive_spapr_disabled(void)
+static bool __init xive_spapr_disabled(void)
 {
 	const u8 *vec5_xive;
 
-- 
2.26.2


^ permalink raw reply related

* Re: [patch 01/15] mm/memory.c: avoid access flag update TLB flush for retried page fault
From: Michael Ellerman @ 2020-07-29 13:58 UTC (permalink / raw)
  To: Nicholas Piggin, Linus Torvalds
  Cc: linux-arch, Hillf Danton, mm-commits, Catalin Marinas,
	Hugh Dickins, Josef Bacik, Will Deacon, Linux-MM, Matthew Wilcox,
	Johannes Weiner, Yu Xu, Andrew Morton, linuxppc-dev, Yang Shi,
	Kirill A . Shutemov
In-Reply-To: <1595974242.esf9644sf3.astroid@bobo.none>

Nicholas Piggin <npiggin@gmail.com> writes:
> Excerpts from Linus Torvalds's message of July 29, 2020 5:02 am:
>> On Tue, Jul 28, 2020 at 3:53 AM Nicholas Piggin <npiggin@gmail.com> wrote:
>>>
>>> The quirk is a problem with coprocessor where it's supposed to
>>> invalidate the translation after a fault but it doesn't, so we can get a
>>> read-only TLB stuck after something else does a RO->RW upgrade on the
>>> TLB. Something like that IIRC.  Coprocessors have their own MMU which
>>> lives in the nest not the core, so you need a global TLB flush to
>>> invalidate that thing.
>> 
>> So I assumed, but it does seem confused.
>> 
>> Why? Because if there are stale translations on the co-processor,
>> there's no guarantee that one of the CPU's will have them and take a
>> fault.
>> 
>> So I'm not seeing why a core CPU doing spurious TLB invalidation would
>> follow from "stale TLB in the Nest".
>
> If the nest MMU access faults, it sends an interrupt to the CPU and
> the driver tries to handle the page fault for it (I think that's how
> it works).

Yeah AFAIK. I think they all end up calling copro_handle_mm_fault().

Except for NX/vas, where the model is a fault just causes the request to
be dropped and sent back to userspace to fix things up.

cheers

^ permalink raw reply

* Re: [PATCH v2] selftests: powerpc: Fix online CPU selection
From: Michael Ellerman @ 2020-07-29 13:47 UTC (permalink / raw)
  To: Sandipan Das; +Cc: srikar, kamalesh, shiganta, nasastry, harish, linuxppc-dev
In-Reply-To: <20200609073733.997643-1-sandipan@linux.ibm.com>

Sandipan Das <sandipan@linux.ibm.com> writes:
> The size of the CPU affinity mask must be large enough for
> systems with a very large number of CPUs. Otherwise, tests
> which try to determine the first online CPU by calling
> sched_getaffinity() will fail. This makes sure that the size
> of the allocated affinity mask is dependent on the number of
> CPUs as reported by get_nprocs().
>
> Fixes: 3752e453f6ba ("selftests/powerpc: Add tests of PMU EBBs")
> Reported-by: Shirisha Ganta <shiganta@in.ibm.com>
> Signed-off-by: Sandipan Das <sandipan@linux.ibm.com>
> Reviewed-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
> ---
> Previous versions can be found at:
> v1: https://lore.kernel.org/linuxppc-dev/20200608144212.985144-1-sandipan@linux.ibm.com/
>
> Changes in v2:
> - Added NULL check for the affinity mask as suggested by Kamalesh.
> - Changed "cpu set" to "CPU affinity mask" in the commit message.

This sometimes breaks, eg:

  # ./count_instructions 
  test: count_instructions
  tags: git_version:v5.8-rc2-327-g9a1d992a7eb7
  sched_getaffinity: Invalid argument
  [FAIL] Test FAILED on line 123
  failure: count_instructions


This system has a messed up SMT setup, but the old code was able to cope
with it:

  # ppc64_cpu --info
  Core   0:    0*    1*    2     3     4     5     6     7  
  Core   1:    8     9    10*   11*   12    13    14    15  
  Core   2:   16    17    18    19    20    21    22    23  
  Core   3:   24    25    26    27    28    29    30    31  
  Core   4:   32    33    34    35    36    37    38    39  
  Core   5:   40    41    42    43    44    45    46    47  
  Core   6:   48    49    50    51    52    53    54    55  
  Core   7:   56    57    58    59    60    61    62    63  
  Core   8:   64    65    66    67    68    69    70    71  
  Core   9:   72    73    74    75    76    77    78    79  
  Core  10:   80    81    82    83    84    85    86    87  
  Core  11:   88    89    90    91    92    93    94    95  
  Core  12:   96    97    98    99   100*  101*  102*  103* 
  Core  13:  104*  105*  106*  107*  108*  109*  110*  111* 
  Core  14:  112*  113*  114*  115*  116*  117*  118*  119* 
  Core  15:  120   121   122   123   124   125   126   127  
  Core  16:  128   129   130   131   132   133   134   135  
  Core  17:  136   137   138   139   140   141   142   143  
  Core  18:  144   145   146   147   148   149   150   151  
  Core  19:  152   153   154   155   156   157   158   159 

cheers

^ permalink raw reply

* Re: ASMedia ASM2142 USB host controller tries to DMA to address zero when doing bulk reads from multiple devices
From: Oliver O'Halloran @ 2020-07-29 13:22 UTC (permalink / raw)
  To: Forest Crossman; +Cc: Alexey Kardashevskiy, linux-usb, linuxppc-dev
In-Reply-To: <CAO3ALPyB1JDvvC27JGgAoTuHh0w+897tPhmTKX9PQWBFCrrnbQ@mail.gmail.com>

On Tue, Jul 21, 2020 at 3:51 PM Forest Crossman <cyrozap@gmail.com> wrote:
>
> Hello, again!
>
> After fixing the issue in my previous thread using this patch[1], I
> decided to do some stress-testing of the controller to make sure it
> could handle my intended workloads and that there were no further DMA
> address issues that would need to be fixed. Unfortunately, it looks
> like there's still more work to be done: when I try to do long bulk
> reads from multiple devices simultaneously, eventually the host
> controller sends a DMA write to address zero, which then triggers EEH
> in my POWER9 system, causing the controller card to get hotplug-reset,
> which of course kills the disk-reading processes. For more details on
> the EEH errors, you can see my kernel's EEH message log[2].

Take the logged address with a grain of salt. If an error occurs while
translating the DMA address the PHB logs all zeros as the "DMA
Address" because it only keeps around the bits that it needs to fetch
the next level of the TCE table. The EEH dump says the error is due to
a TCE permission mis-match so odds the ASmedia controller is writing
to an address that's already been DMA unmapped, hence the logged
address being zeros.

Sorry, I probably should have mentioned that quirk in the last mail.

> The results of the various tests I performed are listed below.
>
> Test results (all failures are due to DMA writes to address zero, all
> hubs are USB 3.0/3.1 Gen1 only, and all disks are accessed via the
> usb-storage driver):
> - Reading simultaneously from two or more disks behind a hub connected
> to one port on the host controller:
>   - FAIL after 20-50 GB of data transferred for each device.
> - Reading simultaneously from two disks, each connected directly to
> one port on the host controller:
>   - FAIL after about 800 GB of data transferred for each device.
> - Reading from one disk behind a hub connected to one port on the host
> controller:
>   - OK for at least 2.7 TB of data transferred (I didn't test the
> whole 8 TB disk).
> - Writing simultaneously to two FL2000 dongles (using osmo-fl2k's
> "fl2k_test"), each connected directly to one port on the host
> controller:
>   - OK, was able to write several dozen terabytes to each device over
> the course of a little over 21 hours.
>
> Seeing how simultaneous writes to multiple devices and reads from
> single devices both seem to work fine, I assume that means this is
> being caused by some race condition in the host controller firmware
> when it responds to multiple read requests.

Most likely. It's possible it's a platform specific race with DMA
map/unmap too, but I think we would be seeing similar issues with
other devices if it was.

> I also assume we're not
> going to be able to convince ASMedia to both fix the bug in their
> firmware and release the details on how to flash it from Linux, so I
> guess we'll just have to figure out how to make the driver talk to the
> controller in a way that avoids triggering the bad DMA write. As
> before, I decided to try a little kernel hacking of my own before
> sending this email, and tried separately enabling the
> XHCI_BROKEN_STREAMS and XHCI_ASMEDIA_MODIFY_FLOWCONTROL quirks in an
> attempt to fix this. As you might expect since you're reading this
> message, neither of those quirks fixed the issue, nor did they even
> make the transfers last any longer before failing.
>
> So now I've reached the limits of my understanding, and I need some
> help devising a fix. If anyone has any comments to that effect, or any
> questions about my hardware configuration, testing methodology, etc.,
> please don't hesitate to air them. Also, if anyone needs me to perform
> additional tests, or collect more log information, I'd be happy to do
> that as well.

I started writing a tool a while ago to use the internal trace bus to
log incoming TLPs. Something like that might allow you to get a better
idea what the faulting access pattern is, but you would still need to
find a way to mitigate the issue. I'm not all that familiar with USB3
so I'm not much help on that front.

^ permalink raw reply

* Re: [PATCH 0/7] Optimization to improve cpu online/offline on Powerpc
From: Satheesh Rajendran @ 2020-07-29 12:54 UTC (permalink / raw)
  To: Srikar Dronamraju
  Cc: Nathan Lynch, Gautham R Shenoy, Michael Neuling, Peter Zijlstra,
	LKML, Nicholas Piggin, Valentin Schneider, Oliver O'Halloran,
	Satheesh Rajendran, linuxppc-dev, Ingo Molnar
In-Reply-To: <20200727075532.30058-1-srikar@linux.vnet.ibm.com>

On Mon, Jul 27, 2020 at 01:25:25PM +0530, Srikar Dronamraju wrote:
> Anton reported that his 4096 cpu (1024 cores in a socket) was taking too
> long to boot. He also analyzed that most of the time was being spent on
> updating cpu_core_mask.
> 
> Here are some optimizations and fixes to make ppc64_cpu --smt=8/ppc64_cpu
> --smt=1 run faster and hence boot the kernel also faster.
> 
> Its based on top of my v4 coregroup support patchset.
> http://lore.kernel.org/lkml/20200727053230.19753-1-srikar@linux.vnet.ibm.com/t/#u
> 
> The first two patches should solve Anton's immediate problem.
> On the unofficial patches, Anton reported that the boot time came from 30
> mins to 6 seconds. (Basically a high core count in a single socket
> configuration). Satheesh also reported similar numbers.
> 
> The rest are simple cleanups/optimizations.
> 
> Since cpu_core_mask is an exported symbol for a long duration, lets retain
> as a snapshot of cpumask_of_node.

boot tested on P9 KVM guest.

without this series:
# dmesg|grep smp
[    0.066624] smp: Bringing up secondary CPUs ...
[  347.521264] smp: Brought up 1 node, 2048 CPUs

with this series:
# dmesg|grep smp
[    0.067744] smp: Bringing up secondary CPUs ...
[    5.416910] smp: Brought up 1 node, 2048 CPUs

Tested-by: Satheesh Rajendran <sathnaga@linux.vnet.ibm.com>

Regards,
-Satheesh
> 
> Architecture:        ppc64le
> Byte Order:          Little Endian
> CPU(s):              160
> On-line CPU(s) list: 0-159
> Thread(s) per core:  4
> Core(s) per socket:  20
> Socket(s):           2
> NUMA node(s):        2
> Model:               2.2 (pvr 004e 1202)
> Model name:          POWER9, altivec supported
> CPU max MHz:         3800.0000
> CPU min MHz:         2166.0000
> L1d cache:           32K
> L1i cache:           32K
> L2 cache:            512K
> L3 cache:            10240K
> NUMA node0 CPU(s):   0-79
> NUMA node8 CPU(s):   80-159
> 
> without patch (powerpc/next)
> [    0.099347] smp: Bringing up secondary CPUs ...
> [    0.832513] smp: Brought up 2 nodes, 160 CPUs
> 
> with powerpc/next + coregroup support patchset
> [    0.099241] smp: Bringing up secondary CPUs ...
> [    0.835627] smp: Brought up 2 nodes, 160 CPUs
> 
> with powerpc/next + coregroup + this patchset
> [    0.097232] smp: Bringing up secondary CPUs ...
> [    0.528457] smp: Brought up 2 nodes, 160 CPUs
> 
> x ppc64_cpu --smt=1
> + ppc64_cpu --smt=4
> 
> without patch
>     N           Min           Max        Median           Avg        Stddev
> x 100         11.82         17.06         14.01         14.05     1.2665247
> + 100         12.25         16.59         13.86       14.1143      1.164293
> 
> with patch
>     N           Min           Max        Median           Avg        Stddev
> x 100         12.68         16.15         14.24        14.238    0.75489246
> + 100         12.93         15.85         14.35       14.2897    0.60041813
> 
> Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
> Cc: LKML <linux-kernel@vger.kernel.org>
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> Cc: Nicholas Piggin <npiggin@gmail.com>
> Cc: Anton Blanchard <anton@ozlabs.org>
> Cc: Oliver O'Halloran <oohall@gmail.com>
> Cc: Nathan Lynch <nathanl@linux.ibm.com>
> Cc: Michael Neuling <mikey@neuling.org>
> Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
> Cc: Satheesh Rajendran <sathnaga@linux.vnet.ibm.com>
> Cc: Ingo Molnar <mingo@kernel.org>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Valentin Schneider <valentin.schneider@arm.com>
> 
> Srikar Dronamraju (7):
>   powerpc/topology: Update topology_core_cpumask
>   powerpc/smp: Stop updating cpu_core_mask
>   powerpc/smp: Remove get_physical_package_id
>   powerpc/smp: Optimize remove_cpu_from_masks
>   powerpc/smp: Limit cpus traversed to within a node.
>   powerpc/smp: Stop passing mask to update_mask_by_l2
>   powerpc/smp: Depend on cpu_l1_cache_map when adding cpus
> 
>  arch/powerpc/include/asm/smp.h      |  5 --
>  arch/powerpc/include/asm/topology.h |  7 +--
>  arch/powerpc/kernel/smp.c           | 79 +++++++++--------------------
>  3 files changed, 24 insertions(+), 67 deletions(-)
> 
> -- 
> 2.17.1
> 

^ permalink raw reply

* Re: [PATCH] powerpc/powernv/pci: Fix build of pci-ioda.o
From: Gustavo Romero @ 2020-07-29 12:31 UTC (permalink / raw)
  To: Oliver O'Halloran, Gustavo Romero; +Cc: linuxppc-dev
In-Reply-To: <CAOSf1CG4b_G_dzb2f2pRo-P=Ku4f7QB8VJpRmGjejHi+R2qQGg@mail.gmail.com>

Hi Oliver,

On 7/28/20 7:50 PM, Oliver O'Halloran wrote:
> On Wed, Jul 29, 2020 at 8:35 AM Gustavo Romero <gromero@linux.ibm.com> wrote:
>>
>> Currently pnv_ioda_setup_bus_dma() is outside of a CONFIG_IOMMU_API guard
>> and if CONFIG_IOMMU_API=n the build can fail if the compiler sets
>> -Werror=unused-function, because pnv_ioda_setup_bus_dma() is only used in
>> functions guarded by a CONFIG_IOMMU_API guard.
>>
>> That issue can be easily reproduced using the skiroot_defconfig. For other
>> configs, like powernv_defconfig, that issue is hidden by the fact that
>> if CONFIG_IOMMU_SUPPORT is enabled plus other common IOMMU options, like
>> CONFIG_OF_IOMMU, by default CONFIG_IOMMU_API is enabled as well. Hence, for
>> powernv_defconfig, it's necessary to set CONFIG_IOMMU_SUPPORT=n to make the
>> build fail, because CONFIG_PCI=y and pci-ioda.c is included in the build,
>> but since CONFIG_IOMMU_SUPPORT=n the CONFIG_IOMMU_API is disabled, breaking
>> the build.
>>
>> This commit fixes that build issue by moving the pnv_ioda_setup_bus_dma()
>> inside a CONFIG_IOMMU_API guard, so when CONFIG_IOMMU_API is disabled that
>> function is not defined.
> 
> I think a fix for this is already in -next.

Indeed.

For the records, it's fixed in -next by:

commit e3417faec526cbf97773dca691dcd743f5bfeb64
Author: Oliver O'Halloran <oohall@gmail.com>
Date:   Sun Jul 5 23:35:57 2020 +1000

     powerpc/powernv: Move pnv_ioda_setup_bus_dma under CONFIG_IOMMU_API
     
     pnv_ioda_setup_bus_dma() is only used when a passed through PE is
     returned to the host. If the kernel is built without IOMMU support
     this is dead code. Move it under the #ifdef with the rest of the
     IOMMU API support.
     
     Reported-by: kernel test robot <lkp@intel.com>
     Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
     Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
     Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
     Link: https://lore.kernel.org/r/20200705133557.443607-2-oohall@gmail.com


Thanks.


Cheers,
Gustavo

^ permalink raw reply

* [PATCH v6 11/11] ppc64/kexec_file: enable early kernel's OPAL calls
From: Hari Bathini @ 2020-07-29 11:44 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Pingfan Liu, Kexec-ml, Mimi Zohar, Nayna Jain, Petr Tesarik,
	Mahesh J Salgaonkar, Sourabh Jain, lkml, linuxppc-dev,
	Vivek Goyal, Andrew Morton, Dave Young, Thiago Jung Bauermann,
	Eric Biederman
In-Reply-To: <159602259854.575379.16910915605574571585.stgit@hbathini>

Kernel built with CONFIG_PPC_EARLY_DEBUG_OPAL enabled expects r8 & r9
to be filled with OPAL base & entry addresses respectively. Setting
these registers allows the kernel to perform OPAL calls before the
device tree is parsed.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Reviewed-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
---

v5 -> v6:
* Added Reviewed-by tag from Thiago.
* Moved the patch to end of the series for mpe to take a call on whether
  to have it or not.

v4 -> v5:
* New patch. Updated opal_base & opal_entry values in r8 & r9 respectively.
  This change was part of the below dropped patch in v4:
    - https://lore.kernel.org/patchwork/patch/1275667/


 arch/powerpc/kexec/file_load_64.c      |   20 ++++++++++++++++++++
 arch/powerpc/purgatory/trampoline_64.S |   16 ++++++++++++++++
 2 files changed, 36 insertions(+)

diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c
index c6a37ad5a0a4..53bb71e3a2e1 100644
--- a/arch/powerpc/kexec/file_load_64.c
+++ b/arch/powerpc/kexec/file_load_64.c
@@ -876,6 +876,7 @@ int setup_purgatory_ppc64(struct kimage *image, const void *slave_code,
 			  const void *fdt, unsigned long kernel_load_addr,
 			  unsigned long fdt_load_addr)
 {
+	struct device_node *dn = NULL;
 	int ret;
 
 	ret = setup_purgatory(image, slave_code, fdt, kernel_load_addr,
@@ -903,9 +904,28 @@ int setup_purgatory_ppc64(struct kimage *image, const void *slave_code,
 					     &image->arch.backup_start,
 					     sizeof(image->arch.backup_start),
 					     false);
+	if (ret)
+		goto out;
+
+	/* Setup OPAL base & entry values */
+	dn = of_find_node_by_path("/ibm,opal");
+	if (dn) {
+		u64 val;
+
+		of_property_read_u64(dn, "opal-base-address", &val);
+		ret = kexec_purgatory_get_set_symbol(image, "opal_base", &val,
+						     sizeof(val), false);
+		if (ret)
+			goto out;
+
+		of_property_read_u64(dn, "opal-entry-address", &val);
+		ret = kexec_purgatory_get_set_symbol(image, "opal_entry", &val,
+						     sizeof(val), false);
+	}
 out:
 	if (ret)
 		pr_err("Failed to setup purgatory symbols");
+	of_node_put(dn);
 	return ret;
 }
 
diff --git a/arch/powerpc/purgatory/trampoline_64.S b/arch/powerpc/purgatory/trampoline_64.S
index e79077ff1355..e6a6e7e6dfe4 100644
--- a/arch/powerpc/purgatory/trampoline_64.S
+++ b/arch/powerpc/purgatory/trampoline_64.S
@@ -87,6 +87,10 @@ master:
 	li	%r4,28
 	STWX_BE	%r17,%r3,%r4	/* Store my cpu as __be32 at byte 28 */
 1:
+	/* Load opal base and entry values in r8 & r9 respectively */
+	ld	%r8,(opal_base - 0b)(%r18)
+	ld	%r9,(opal_entry - 0b)(%r18)
+
 	/* load the kernel address */
 	ld	%r4,(kernel - 0b)(%r18)
 
@@ -133,6 +137,18 @@ backup_start:
 	.8byte  0x0
 	.size backup_start, . - backup_start
 
+	.balign 8
+	.globl opal_base
+opal_base:
+	.8byte  0x0
+	.size opal_base, . - opal_base
+
+	.balign 8
+	.globl opal_entry
+opal_entry:
+	.8byte  0x0
+	.size opal_entry, . - opal_entry
+
 	.data
 	.balign 8
 .globl purgatory_sha256_digest



^ permalink raw reply related

* [PATCH v6 10/11] ppc64/kexec_file: fix kexec load failure with lack of memory hole
From: Hari Bathini @ 2020-07-29 11:43 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Pingfan Liu, Kexec-ml, Mimi Zohar, Nayna Jain, Petr Tesarik,
	Mahesh J Salgaonkar, Sourabh Jain, lkml, linuxppc-dev,
	Thiago Jung Bauermann, Andrew Morton, Dave Young, Vivek Goyal,
	Eric Biederman
In-Reply-To: <159602259854.575379.16910915605574571585.stgit@hbathini>

The kexec purgatory has to run in real mode. Only the first memory
block maybe accessible in real mode. And, unlike the case with panic
kernel, no memory is set aside for regular kexec load. Another thing
to note is, the memory for crashkernel is reserved at an offset of
128MB. So, when crashkernel memory is reserved, the memory ranges to
load kexec segments shrink further as the generic code only looks for
memblock free memory ranges and in all likelihood only a tiny bit of
memory from 0 to 128MB would be available to load kexec segments.

With kdump being used by default in general, kexec file load is likely
to fail almost always. This can be fixed by changing the memory hole
lookup logic for regular kexec to use the same method as kdump. This
would mean that most kexec segments will overlap with crashkernel
memory region. That should still be ok as the pages, whose destination
address isn't available while loading, are placed in an intermediate
location till a flush to the actual destination address happens during
kexec boot sequence.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Tested-by: Pingfan Liu <piliu@redhat.com>
Reviewed-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
---

v5 -> v6:
* Unchanged.

v4 -> v5:
* Unchanged.

v3 -> v4:
* Unchanged. Added Reviewed-by tag from Thiago.

v2 -> v3:
* Unchanged. Added Tested-by tag from Pingfan.

v1 -> v2:
* New patch to fix locating memory hole for kexec_file_load (kexec -s -l)
  when memory is reserved for crashkernel.


 arch/powerpc/kexec/file_load_64.c |   33 ++++++++++++++-------------------
 1 file changed, 14 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c
index f13c5b8399e1..c6a37ad5a0a4 100644
--- a/arch/powerpc/kexec/file_load_64.c
+++ b/arch/powerpc/kexec/file_load_64.c
@@ -1012,13 +1012,6 @@ int arch_kexec_locate_mem_hole(struct kexec_buf *kbuf)
 	u64 buf_min, buf_max;
 	int ret;
 
-	/*
-	 * Use the generic kexec_locate_mem_hole for regular
-	 * kexec_file_load syscall
-	 */
-	if (kbuf->image->type != KEXEC_TYPE_CRASH)
-		return kexec_locate_mem_hole(kbuf);
-
 	/* Look up the exclude ranges list while locating the memory hole */
 	emem = &(kbuf->image->arch.exclude_ranges);
 	if (!(*emem) || ((*emem)->nr_ranges == 0)) {
@@ -1026,11 +1019,15 @@ int arch_kexec_locate_mem_hole(struct kexec_buf *kbuf)
 		return kexec_locate_mem_hole(kbuf);
 	}
 
+	buf_min = kbuf->buf_min;
+	buf_max = kbuf->buf_max;
 	/* Segments for kdump kernel should be within crashkernel region */
-	buf_min = (kbuf->buf_min < crashk_res.start ?
-		   crashk_res.start : kbuf->buf_min);
-	buf_max = (kbuf->buf_max > crashk_res.end ?
-		   crashk_res.end : kbuf->buf_max);
+	if (kbuf->image->type == KEXEC_TYPE_CRASH) {
+		buf_min = (buf_min < crashk_res.start ?
+			   crashk_res.start : buf_min);
+		buf_max = (buf_max > crashk_res.end ?
+			   crashk_res.end : buf_max);
+	}
 
 	if (buf_min > buf_max) {
 		pr_err("Invalid buffer min and/or max values\n");
@@ -1067,15 +1064,13 @@ int arch_kexec_locate_mem_hole(struct kexec_buf *kbuf)
 int arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
 				  unsigned long buf_len)
 {
-	if (image->type == KEXEC_TYPE_CRASH) {
-		int ret;
+	int ret;
 
-		/* Get exclude memory ranges needed for setting up kdump segments */
-		ret = get_exclude_memory_ranges(&(image->arch.exclude_ranges));
-		if (ret) {
-			pr_err("Failed to setup exclude memory ranges for buffer lookup\n");
-			return ret;
-		}
+	/* Get exclude memory ranges needed for setting up kexec segments */
+	ret = get_exclude_memory_ranges(&(image->arch.exclude_ranges));
+	if (ret) {
+		pr_err("Failed to setup exclude memory ranges for buffer lookup\n");
+		return ret;
 	}
 
 	return kexec_image_probe_default(image, buf, buf_len);



^ permalink raw reply related

* [PATCH v6 09/11] ppc64/kexec_file: add appropriate regions for memory reserve map
From: Hari Bathini @ 2020-07-29 11:43 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Pingfan Liu, Kexec-ml, Mimi Zohar, Nayna Jain, Petr Tesarik,
	Mahesh J Salgaonkar, Sourabh Jain, lkml, linuxppc-dev,
	Thiago Jung Bauermann, Andrew Morton, Dave Young, Vivek Goyal,
	Eric Biederman
In-Reply-To: <159602259854.575379.16910915605574571585.stgit@hbathini>

While initrd, elfcorehdr and backup regions are already added to the
reserve map, there are a few missing regions that need to be added to
the memory reserve map. Add them here. And now that all the changes
to load panic kernel are in place, claim likewise.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Tested-by: Pingfan Liu <piliu@redhat.com>
Reviewed-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
---

v5 -> v6:
* Unchanged.

v4 -> v5:
* Unchanged.

v3 -> v4:
* Fixed a spellcheck and added Reviewed-by tag from Thiago.

v2 -> v3:
* Unchanged. Added Tested-by tag from Pingfan.

v1 -> v2:
* Updated add_rtas_mem_range() & add_opal_mem_range() callsites based on
  the new prototype for these functions.


 arch/powerpc/kexec/file_load_64.c |   58 ++++++++++++++++++++++++++++++++++---
 1 file changed, 53 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c
index 0d280d097cd6..f13c5b8399e1 100644
--- a/arch/powerpc/kexec/file_load_64.c
+++ b/arch/powerpc/kexec/file_load_64.c
@@ -205,6 +205,34 @@ static int get_crash_memory_ranges(struct crash_mem **mem_ranges)
 	return ret;
 }
 
+/**
+ * get_reserved_memory_ranges - Get reserve memory ranges. This list includes
+ *                              memory regions that should be added to the
+ *                              memory reserve map to ensure the region is
+ *                              protected from any mischief.
+ * @mem_ranges:                 Range list to add the memory ranges to.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+static int get_reserved_memory_ranges(struct crash_mem **mem_ranges)
+{
+	int ret;
+
+	ret = add_rtas_mem_range(mem_ranges);
+	if (ret)
+		goto out;
+
+	ret = add_tce_mem_ranges(mem_ranges);
+	if (ret)
+		goto out;
+
+	ret = add_reserved_mem_ranges(mem_ranges);
+out:
+	if (ret)
+		pr_err("Failed to setup reserved memory ranges\n");
+	return ret;
+}
+
 /**
  * __locate_mem_hole_top_down - Looks top down for a large enough memory hole
  *                              in the memory regions between buf_min & buf_max
@@ -897,8 +925,8 @@ int setup_new_fdt_ppc64(const struct kimage *image, void *fdt,
 			unsigned long initrd_load_addr,
 			unsigned long initrd_len, const char *cmdline)
 {
-	struct crash_mem *umem = NULL;
-	int ret;
+	struct crash_mem *umem = NULL, *rmem = NULL;
+	int i, nr_ranges, ret;
 
 	ret = setup_new_fdt(image, fdt, initrd_load_addr, initrd_len, cmdline);
 	if (ret)
@@ -941,7 +969,27 @@ int setup_new_fdt_ppc64(const struct kimage *image, void *fdt,
 		}
 	}
 
+	/* Update memory reserve map */
+	ret = get_reserved_memory_ranges(&rmem);
+	if (ret)
+		goto out;
+
+	nr_ranges = rmem ? rmem->nr_ranges : 0;
+	for (i = 0; i < nr_ranges; i++) {
+		u64 base, size;
+
+		base = rmem->ranges[i].start;
+		size = rmem->ranges[i].end - base + 1;
+		ret = fdt_add_mem_rsv(fdt, base, size);
+		if (ret) {
+			pr_err("Error updating memory reserve map: %s\n",
+			       fdt_strerror(ret));
+			goto out;
+		}
+	}
+
 out:
+	kfree(rmem);
 	kfree(umem);
 	return ret;
 }
@@ -1024,10 +1072,10 @@ int arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
 
 		/* Get exclude memory ranges needed for setting up kdump segments */
 		ret = get_exclude_memory_ranges(&(image->arch.exclude_ranges));
-		if (ret)
+		if (ret) {
 			pr_err("Failed to setup exclude memory ranges for buffer lookup\n");
-		/* Return this until all changes for panic kernel are in */
-		return -EOPNOTSUPP;
+			return ret;
+		}
 	}
 
 	return kexec_image_probe_default(image, buf, buf_len);



^ permalink raw reply related

* [PATCH v6 08/11] ppc64/kexec_file: prepare elfcore header for crashing kernel
From: Hari Bathini @ 2020-07-29 11:43 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Pingfan Liu, Kexec-ml, Mimi Zohar, Nayna Jain, Petr Tesarik,
	Mahesh J Salgaonkar, Sourabh Jain, lkml, linuxppc-dev,
	Thiago Jung Bauermann, Andrew Morton, Dave Young, Vivek Goyal,
	Eric Biederman
In-Reply-To: <159602259854.575379.16910915605574571585.stgit@hbathini>

Prepare elf headers for the crashing kernel's core file using
crash_prepare_elf64_headers() and pass on this info to kdump
kernel by updating its command line with elfcorehdr parameter.
Also, add elfcorehdr location to reserve map to avoid it from
being stomped on while booting.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Tested-by: Pingfan Liu <piliu@redhat.com>
Reviewed-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
---

v5 -> v6:
* Unchanged.

v4 -> v5:
* Unchanged. Added Reviewed-by tag from Thiago.

v3 -> v4:
* Added a FIXME tag to indicate issue in adding opal/rtas regions to
  core image.
* Folded prepare_elf_headers() function into load_elfcorehdr_segment().

v2 -> v3:
* Unchanged. Added Tested-by tag from Pingfan.

v1 -> v2:
* Tried merging adjacent memory ranges on hitting maximum ranges limit
  to reduce reallocations for memory ranges and also, minimize PT_LOAD
  segments for elfcore.
* Updated add_rtas_mem_range() & add_opal_mem_range() callsites based on
  the new prototype for these functions.


 arch/powerpc/include/asm/kexec.h  |    6 +
 arch/powerpc/kexec/elf_64.c       |   12 +++
 arch/powerpc/kexec/file_load.c    |   49 +++++++++++
 arch/powerpc/kexec/file_load_64.c |  165 +++++++++++++++++++++++++++++++++++++
 4 files changed, 232 insertions(+)

diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h
index f9514ebeffaa..fe885bc3127e 100644
--- a/arch/powerpc/include/asm/kexec.h
+++ b/arch/powerpc/include/asm/kexec.h
@@ -108,12 +108,18 @@ struct kimage_arch {
 	unsigned long backup_start;
 	void *backup_buf;
 
+	unsigned long elfcorehdr_addr;
+	unsigned long elf_headers_sz;
+	void *elf_headers;
+
 #ifdef CONFIG_IMA_KEXEC
 	phys_addr_t ima_buffer_addr;
 	size_t ima_buffer_size;
 #endif
 };
 
+char *setup_kdump_cmdline(struct kimage *image, char *cmdline,
+			  unsigned long cmdline_len);
 int setup_purgatory(struct kimage *image, const void *slave_code,
 		    const void *fdt, unsigned long kernel_load_addr,
 		    unsigned long fdt_load_addr);
diff --git a/arch/powerpc/kexec/elf_64.c b/arch/powerpc/kexec/elf_64.c
index 76e2fc7e6dc3..d0e459bb2f05 100644
--- a/arch/powerpc/kexec/elf_64.c
+++ b/arch/powerpc/kexec/elf_64.c
@@ -35,6 +35,7 @@ static void *elf64_load(struct kimage *image, char *kernel_buf,
 	void *fdt;
 	const void *slave_code;
 	struct elfhdr ehdr;
+	char *modified_cmdline = NULL;
 	struct kexec_elf_info elf_info;
 	struct kexec_buf kbuf = { .image = image, .buf_min = 0,
 				  .buf_max = ppc64_rma_size };
@@ -75,6 +76,16 @@ static void *elf64_load(struct kimage *image, char *kernel_buf,
 			pr_err("Failed to load kdump kernel segments\n");
 			goto out;
 		}
+
+		/* Setup cmdline for kdump kernel case */
+		modified_cmdline = setup_kdump_cmdline(image, cmdline,
+						       cmdline_len);
+		if (!modified_cmdline) {
+			pr_err("Setting up cmdline for kdump kernel failed\n");
+			ret = -EINVAL;
+			goto out;
+		}
+		cmdline = modified_cmdline;
 	}
 
 	if (initrd != NULL) {
@@ -131,6 +142,7 @@ static void *elf64_load(struct kimage *image, char *kernel_buf,
 		pr_err("Error setting up the purgatory.\n");
 
 out:
+	kfree(modified_cmdline);
 	kexec_free_elf_info(&elf_info);
 
 	/* Make kimage_file_post_load_cleanup free the fdt buffer for us. */
diff --git a/arch/powerpc/kexec/file_load.c b/arch/powerpc/kexec/file_load.c
index 38439aba27d7..d52c09729edd 100644
--- a/arch/powerpc/kexec/file_load.c
+++ b/arch/powerpc/kexec/file_load.c
@@ -18,10 +18,45 @@
 #include <linux/kexec.h>
 #include <linux/of_fdt.h>
 #include <linux/libfdt.h>
+#include <asm/setup.h>
 #include <asm/ima.h>
 
 #define SLAVE_CODE_SIZE		256	/* First 0x100 bytes */
 
+/**
+ * setup_kdump_cmdline - Prepend "elfcorehdr=<addr> " to command line
+ *                       of kdump kernel for exporting the core.
+ * @image:               Kexec image
+ * @cmdline:             Command line parameters to update.
+ * @cmdline_len:         Length of the cmdline parameters.
+ *
+ * kdump segment must be setup before calling this function.
+ *
+ * Returns new cmdline buffer for kdump kernel on success, NULL otherwise.
+ */
+char *setup_kdump_cmdline(struct kimage *image, char *cmdline,
+			  unsigned long cmdline_len)
+{
+	int elfcorehdr_strlen;
+	char *cmdline_ptr;
+
+	cmdline_ptr = kzalloc(COMMAND_LINE_SIZE, GFP_KERNEL);
+	if (!cmdline_ptr)
+		return NULL;
+
+	elfcorehdr_strlen = sprintf(cmdline_ptr, "elfcorehdr=0x%lx ",
+				    image->arch.elfcorehdr_addr);
+
+	if (elfcorehdr_strlen + cmdline_len > COMMAND_LINE_SIZE) {
+		pr_err("Appending elfcorehdr=<addr> exceeds cmdline size\n");
+		kfree(cmdline_ptr);
+		return NULL;
+	}
+
+	memcpy(cmdline_ptr + elfcorehdr_strlen, cmdline, cmdline_len);
+	return cmdline_ptr;
+}
+
 /**
  * setup_purgatory - initialize the purgatory's global variables
  * @image:		kexec image.
@@ -221,6 +256,20 @@ int setup_new_fdt(const struct kimage *image, void *fdt,
 		}
 	}
 
+	if (image->type == KEXEC_TYPE_CRASH) {
+		/*
+		 * Avoid elfcorehdr from being stomped on in kdump kernel by
+		 * setting up memory reserve map.
+		 */
+		ret = fdt_add_mem_rsv(fdt, image->arch.elfcorehdr_addr,
+				      image->arch.elf_headers_sz);
+		if (ret) {
+			pr_err("Error reserving elfcorehdr memory: %s\n",
+			       fdt_strerror(ret));
+			goto err;
+		}
+	}
+
 	ret = setup_ima_buffer(image, fdt, chosen_node);
 	if (ret) {
 		pr_err("Error setting up the new device tree.\n");
diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c
index a81bffb72cc5..0d280d097cd6 100644
--- a/arch/powerpc/kexec/file_load_64.c
+++ b/arch/powerpc/kexec/file_load_64.c
@@ -128,6 +128,83 @@ static int get_usable_memory_ranges(struct crash_mem **mem_ranges)
 	return ret;
 }
 
+/**
+ * get_crash_memory_ranges - Get crash memory ranges. This list includes
+ *                           first/crashing kernel's memory regions that
+ *                           would be exported via an elfcore.
+ * @mem_ranges:              Range list to add the memory ranges to.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+static int get_crash_memory_ranges(struct crash_mem **mem_ranges)
+{
+	struct memblock_region *reg;
+	struct crash_mem *tmem;
+	int ret;
+
+	for_each_memblock(memory, reg) {
+		u64 base, size;
+
+		base = (u64)reg->base;
+		size = (u64)reg->size;
+
+		/* Skip backup memory region, which needs a separate entry */
+		if (base == BACKUP_SRC_START) {
+			if (size > BACKUP_SRC_SIZE) {
+				base = BACKUP_SRC_END + 1;
+				size -= BACKUP_SRC_SIZE;
+			} else
+				continue;
+		}
+
+		ret = add_mem_range(mem_ranges, base, size);
+		if (ret)
+			goto out;
+
+		/* Try merging adjacent ranges before reallocation attempt */
+		if ((*mem_ranges)->nr_ranges == (*mem_ranges)->max_nr_ranges)
+			sort_memory_ranges(*mem_ranges, true);
+	}
+
+	/* Reallocate memory ranges if there is no space to split ranges */
+	tmem = *mem_ranges;
+	if (tmem && (tmem->nr_ranges == tmem->max_nr_ranges)) {
+		tmem = realloc_mem_ranges(mem_ranges);
+		if (!tmem)
+			goto out;
+	}
+
+	/* Exclude crashkernel region */
+	ret = crash_exclude_mem_range(tmem, crashk_res.start, crashk_res.end);
+	if (ret)
+		goto out;
+
+	/*
+	 * FIXME: For now, stay in parity with kexec-tools but if RTAS/OPAL
+	 *        regions are exported to save their context at the time of
+	 *        crash, they should actually be backed up just like the
+	 *        first 64K bytes of memory.
+	 */
+	ret = add_rtas_mem_range(mem_ranges);
+	if (ret)
+		goto out;
+
+	ret = add_opal_mem_range(mem_ranges);
+	if (ret)
+		goto out;
+
+	/* create a separate program header for the backup region */
+	ret = add_mem_range(mem_ranges, BACKUP_SRC_START, BACKUP_SRC_SIZE);
+	if (ret)
+		goto out;
+
+	sort_memory_ranges(*mem_ranges, false);
+out:
+	if (ret)
+		pr_err("Failed to setup crash memory ranges\n");
+	return ret;
+}
+
 /**
  * __locate_mem_hole_top_down - Looks top down for a large enough memory hole
  *                              in the memory regions between buf_min & buf_max
@@ -647,6 +724,81 @@ static int load_backup_segment(struct kimage *image, struct kexec_buf *kbuf)
 	return 0;
 }
 
+/**
+ * update_backup_region_phdr - Update backup region's offset for the core to
+ *                             export the region appropriately.
+ * @image:                     Kexec image.
+ * @ehdr:                      ELF core header.
+ *
+ * Assumes an exclusive program header is setup for the backup region
+ * in the ELF headers
+ *
+ * Returns nothing.
+ */
+static void update_backup_region_phdr(struct kimage *image, Elf64_Ehdr *ehdr)
+{
+	Elf64_Phdr *phdr;
+	unsigned int i;
+
+	phdr = (Elf64_Phdr *)(ehdr + 1);
+	for (i = 0; i < ehdr->e_phnum; i++) {
+		if (phdr->p_paddr == BACKUP_SRC_START) {
+			phdr->p_offset = image->arch.backup_start;
+			pr_debug("Backup region offset updated to 0x%lx\n",
+				 image->arch.backup_start);
+			return;
+		}
+	}
+}
+
+/**
+ * load_elfcorehdr_segment - Setup crash memory ranges and initialize elfcorehdr
+ *                           segment needed to load kdump kernel.
+ * @image:                   Kexec image.
+ * @kbuf:                    Buffer contents and memory parameters.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+static int load_elfcorehdr_segment(struct kimage *image, struct kexec_buf *kbuf)
+{
+	struct crash_mem *cmem = NULL;
+	unsigned long headers_sz;
+	void *headers = NULL;
+	int ret;
+
+	ret = get_crash_memory_ranges(&cmem);
+	if (ret)
+		goto out;
+
+	/* Setup elfcorehdr segment */
+	ret = crash_prepare_elf64_headers(cmem, false, &headers, &headers_sz);
+	if (ret) {
+		pr_err("Failed to prepare elf headers for the core\n");
+		goto out;
+	}
+
+	/* Fix the offset for backup region in the ELF header */
+	update_backup_region_phdr(image, headers);
+
+	kbuf->buffer = headers;
+	kbuf->mem = KEXEC_BUF_MEM_UNKNOWN;
+	kbuf->bufsz = kbuf->memsz = headers_sz;
+	kbuf->top_down = false;
+
+	ret = kexec_add_buffer(kbuf);
+	if (ret) {
+		vfree(headers);
+		goto out;
+	}
+
+	image->arch.elfcorehdr_addr = kbuf->mem;
+	image->arch.elf_headers_sz = headers_sz;
+	image->arch.elf_headers = headers;
+out:
+	kfree(cmem);
+	return ret;
+}
+
 /**
  * load_crashdump_segments_ppc64 - Initialize the additional segements needed
  *                                 to load kdump kernel.
@@ -668,6 +820,15 @@ int load_crashdump_segments_ppc64(struct kimage *image,
 	}
 	pr_debug("Loaded the backup region at 0x%lx\n", kbuf->mem);
 
+	/* Load elfcorehdr segment - to export crashing kernel's vmcore */
+	ret = load_elfcorehdr_segment(image, kbuf);
+	if (ret) {
+		pr_err("Failed to load elfcorehdr segment\n");
+		return ret;
+	}
+	pr_debug("Loaded elf core header at 0x%lx, bufsz=0x%lx memsz=0x%lx\n",
+		 image->arch.elfcorehdr_addr, kbuf->bufsz, kbuf->memsz);
+
 	return 0;
 }
 
@@ -887,5 +1048,9 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image)
 	vfree(image->arch.backup_buf);
 	image->arch.backup_buf = NULL;
 
+	vfree(image->arch.elf_headers);
+	image->arch.elf_headers = NULL;
+	image->arch.elf_headers_sz = 0;
+
 	return kexec_image_post_load_cleanup_default(image);
 }



^ permalink raw reply related

* [PATCH v6 07/11] ppc64/kexec_file: setup backup region for kdump kernel
From: Hari Bathini @ 2020-07-29 11:42 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Pingfan Liu, Kexec-ml, Mimi Zohar, Nayna Jain, Petr Tesarik,
	Mahesh J Salgaonkar, Sourabh Jain, lkml, linuxppc-dev,
	Vivek Goyal, Andrew Morton, Dave Young, Thiago Jung Bauermann,
	Eric Biederman
In-Reply-To: <159602259854.575379.16910915605574571585.stgit@hbathini>

Though kdump kernel boots from loaded address, the first 64KB of it is
copied down to real 0. So, setup a backup region and let purgatory
copy the first 64KB of crashed kernel into this backup region before
booting into kdump kernel. Update reserve map with backup region and
crashed kernel's memory to avoid kdump kernel from accidentially using
that memory.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Reviewed-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
---

v5 -> v6:
* Added Reviewed-by tag from Thiago.
* The comment explaining why a source buffer is needed for backup segment
  is moved to appropriate place.
* Used the special branching instruction mpe suggested instead of "bl 0f"
* Added local labels & space between arguments in assembler code.

v4 -> v5:
* Did not add Reviewed-by tag from Thiago yet as he might want to reconsider
  it with the changes in this patch.
* Wrote backup region copy code in assembler. Also, dropped the patch that
  applies RELA relocations & the patch that sets up stack as they are no
  longer needed.
* For correctness, updated fdt_add_mem_rsv() to take "BACKUP_SRC_END + 1"
  as start address instead of BACKUP_SRC_SIZE.

v3 -> v4:
* Moved fdt_add_mem_rsv() for backup region under kdump flag, on Thiago's
  suggestion, as it is only relevant for kdump.

v2 -> v3:
* Dropped check for backup_start in trampoline_64.S as purgatory() takes
  care of it anyway.

v1 -> v2:
* Check if backup region is available before branching out. This is
  to keep `kexec -l -s` flow as before as much as possible. This would
  eventually change with more testing and addition of sha256 digest
  verification support.
* Fixed missing prototype for purgatory() as reported by lkp.
  lkp report for reference:
    - https://lore.kernel.org/patchwork/patch/1264423/


 arch/powerpc/include/asm/crashdump-ppc64.h |   19 ++++++
 arch/powerpc/include/asm/kexec.h           |    7 ++
 arch/powerpc/kexec/elf_64.c                |    9 +++
 arch/powerpc/kexec/file_load_64.c          |   93 +++++++++++++++++++++++++++-
 arch/powerpc/purgatory/trampoline_64.S     |   38 ++++++++++-
 5 files changed, 159 insertions(+), 7 deletions(-)
 create mode 100644 arch/powerpc/include/asm/crashdump-ppc64.h

diff --git a/arch/powerpc/include/asm/crashdump-ppc64.h b/arch/powerpc/include/asm/crashdump-ppc64.h
new file mode 100644
index 000000000000..68d9717cc5ee
--- /dev/null
+++ b/arch/powerpc/include/asm/crashdump-ppc64.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _ASM_POWERPC_CRASHDUMP_PPC64_H
+#define _ASM_POWERPC_CRASHDUMP_PPC64_H
+
+/*
+ * Backup region - first 64KB of System RAM
+ *
+ * If ever the below macros are to be changed, please be judicious.
+ * The implicit assumptions are:
+ *     - start, end & size are less than UINT32_MAX.
+ *     - start & size are at least 8 byte aligned.
+ *
+ * For implementation details: arch/powerpc/purgatory/trampoline_64.S
+ */
+#define BACKUP_SRC_START	0
+#define BACKUP_SRC_END		0xffff
+#define BACKUP_SRC_SIZE		(BACKUP_SRC_END - BACKUP_SRC_START + 1)
+
+#endif /* __ASM_POWERPC_CRASHDUMP_PPC64_H */
diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h
index 835dc92e091c..f9514ebeffaa 100644
--- a/arch/powerpc/include/asm/kexec.h
+++ b/arch/powerpc/include/asm/kexec.h
@@ -105,6 +105,9 @@ extern const struct kexec_file_ops kexec_elf64_ops;
 struct kimage_arch {
 	struct crash_mem *exclude_ranges;
 
+	unsigned long backup_start;
+	void *backup_buf;
+
 #ifdef CONFIG_IMA_KEXEC
 	phys_addr_t ima_buffer_addr;
 	size_t ima_buffer_size;
@@ -120,6 +123,10 @@ int setup_new_fdt(const struct kimage *image, void *fdt,
 int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size);
 
 #ifdef CONFIG_PPC64
+struct kexec_buf;
+
+int load_crashdump_segments_ppc64(struct kimage *image,
+				  struct kexec_buf *kbuf);
 int setup_purgatory_ppc64(struct kimage *image, const void *slave_code,
 			  const void *fdt, unsigned long kernel_load_addr,
 			  unsigned long fdt_load_addr);
diff --git a/arch/powerpc/kexec/elf_64.c b/arch/powerpc/kexec/elf_64.c
index 64c15a5a280b..76e2fc7e6dc3 100644
--- a/arch/powerpc/kexec/elf_64.c
+++ b/arch/powerpc/kexec/elf_64.c
@@ -68,6 +68,15 @@ static void *elf64_load(struct kimage *image, char *kernel_buf,
 
 	pr_debug("Loaded purgatory at 0x%lx\n", pbuf.mem);
 
+	/* Load additional segments needed for panic kernel */
+	if (image->type == KEXEC_TYPE_CRASH) {
+		ret = load_crashdump_segments_ppc64(image, &kbuf);
+		if (ret) {
+			pr_err("Failed to load kdump kernel segments\n");
+			goto out;
+		}
+	}
+
 	if (initrd != NULL) {
 		kbuf.buffer = initrd;
 		kbuf.bufsz = kbuf.memsz = initrd_len;
diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c
index f94660874765..a81bffb72cc5 100644
--- a/arch/powerpc/kexec/file_load_64.c
+++ b/arch/powerpc/kexec/file_load_64.c
@@ -20,8 +20,10 @@
 #include <linux/of_device.h>
 #include <linux/memblock.h>
 #include <linux/slab.h>
+#include <linux/vmalloc.h>
 #include <asm/drmem.h>
 #include <asm/kexec_ranges.h>
+#include <asm/crashdump-ppc64.h>
 
 struct umem_info {
 	u64 *buf;		/* data buffer for usable-memory property */
@@ -605,6 +607,70 @@ static int update_usable_mem_fdt(void *fdt, struct crash_mem *usable_mem)
 	return ret;
 }
 
+/**
+ * load_backup_segment - Locate a memory hole to place the backup region.
+ * @image:               Kexec image.
+ * @kbuf:                Buffer contents and memory parameters.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+static int load_backup_segment(struct kimage *image, struct kexec_buf *kbuf)
+{
+	void *buf;
+	int ret;
+
+	/*
+	 * Setup a source buffer for backup segment.
+	 *
+	 * A source buffer has no meaning for backup region as data will
+	 * be copied from backup source, after crash, in the purgatory.
+	 * But as load segment code doesn't recognize such segments,
+	 * setup a dummy source buffer to keep it happy for now.
+	 */
+	buf = vzalloc(BACKUP_SRC_SIZE);
+	if (!buf)
+		return -ENOMEM;
+
+	kbuf->buffer = buf;
+	kbuf->mem = KEXEC_BUF_MEM_UNKNOWN;
+	kbuf->bufsz = kbuf->memsz = BACKUP_SRC_SIZE;
+	kbuf->top_down = false;
+
+	ret = kexec_add_buffer(kbuf);
+	if (ret) {
+		vfree(buf);
+		return ret;
+	}
+
+	image->arch.backup_buf = buf;
+	image->arch.backup_start = kbuf->mem;
+	return 0;
+}
+
+/**
+ * load_crashdump_segments_ppc64 - Initialize the additional segements needed
+ *                                 to load kdump kernel.
+ * @image:                         Kexec image.
+ * @kbuf:                          Buffer contents and memory parameters.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int load_crashdump_segments_ppc64(struct kimage *image,
+				  struct kexec_buf *kbuf)
+{
+	int ret;
+
+	/* Load backup segment - first 64K bytes of the crashing kernel */
+	ret = load_backup_segment(image, kbuf);
+	if (ret) {
+		pr_err("Failed to load backup segment\n");
+		return ret;
+	}
+	pr_debug("Loaded the backup region at 0x%lx\n", kbuf->mem);
+
+	return 0;
+}
+
 /**
  * setup_purgatory_ppc64 - initialize PPC64 specific purgatory's global
  *                         variables and call setup_purgatory() to initialize
@@ -643,6 +709,11 @@ int setup_purgatory_ppc64(struct kimage *image, const void *slave_code,
 			goto out;
 	}
 
+	/* Tell purgatory where to look for backup region */
+	ret = kexec_purgatory_get_set_symbol(image, "backup_start",
+					     &image->arch.backup_start,
+					     sizeof(image->arch.backup_start),
+					     false);
 out:
 	if (ret)
 		pr_err("Failed to setup purgatory symbols");
@@ -674,7 +745,7 @@ int setup_new_fdt_ppc64(const struct kimage *image, void *fdt,
 
 	/*
 	 * Restrict memory usage for kdump kernel by setting up
-	 * usable memory ranges.
+	 * usable memory ranges and memory reserve map.
 	 */
 	if (image->type == KEXEC_TYPE_CRASH) {
 		ret = get_usable_memory_ranges(&umem);
@@ -687,13 +758,26 @@ int setup_new_fdt_ppc64(const struct kimage *image, void *fdt,
 			goto out;
 		}
 
-		/* Ensure we don't touch crashed kernel's memory */
-		ret = fdt_add_mem_rsv(fdt, 0, crashk_res.start);
+		/*
+		 * Ensure we don't touch crashed kernel's memory except the
+		 * first 64K of RAM, which will be backed up.
+		 */
+		ret = fdt_add_mem_rsv(fdt, BACKUP_SRC_END + 1,
+				      crashk_res.start - BACKUP_SRC_SIZE);
 		if (ret) {
 			pr_err("Error reserving crash memory: %s\n",
 			       fdt_strerror(ret));
 			goto out;
 		}
+
+		/* Ensure backup region is not used by kdump/capture kernel */
+		ret = fdt_add_mem_rsv(fdt, image->arch.backup_start,
+				      BACKUP_SRC_SIZE);
+		if (ret) {
+			pr_err("Error reserving memory for backup: %s\n",
+			       fdt_strerror(ret));
+			goto out;
+		}
 	}
 
 out:
@@ -800,5 +884,8 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image)
 	kfree(image->arch.exclude_ranges);
 	image->arch.exclude_ranges = NULL;
 
+	vfree(image->arch.backup_buf);
+	image->arch.backup_buf = NULL;
+
 	return kexec_image_post_load_cleanup_default(image);
 }
diff --git a/arch/powerpc/purgatory/trampoline_64.S b/arch/powerpc/purgatory/trampoline_64.S
index a5a83c3f53e6..e79077ff1355 100644
--- a/arch/powerpc/purgatory/trampoline_64.S
+++ b/arch/powerpc/purgatory/trampoline_64.S
@@ -10,6 +10,7 @@
  */
 
 #include <asm/asm-compat.h>
+#include <asm/crashdump-ppc64.h>
 
 	.machine ppc64
 	.balign 256
@@ -43,14 +44,39 @@ master:
 	mr	%r17,%r3	/* save cpu id to r17 */
 	mr	%r15,%r4	/* save physical address in reg15 */
 
+	/* Work out where we're running */
+	bcl	20, 31, $+4
+0:	mflr	%r18
+
+	/*
+	 * Copy BACKUP_SRC_SIZE bytes from BACKUP_SRC_START to
+	 * backup_start 8 bytes at a time.
+	 *
+	 * Use r3 = dest, r4 = src, r5 = size, r6 = count
+	 */
+	ld	%r3, (backup_start - 0b)(%r18)
+	cmpdi	%cr0, %r3, 0
+	beq	.Lskip_copy	/* skip if there is no backup region */
+	lis	%r5, BACKUP_SRC_SIZE@h
+	ori	%r5, %r5, BACKUP_SRC_SIZE@l
+	cmpdi	%cr0, %r5, 0
+	beq	.Lskip_copy	/* skip if copy size is zero */
+	lis	%r4, BACKUP_SRC_START@h
+	ori	%r4, %r4, BACKUP_SRC_START@l
+	li	%r6, 0
+.Lcopy_loop:
+	ldx	%r0, %r6, %r4
+	stdx	%r0, %r6, %r3
+	addi	%r6, %r6, 8
+	cmpld	%cr0, %r6, %r5
+	blt	.Lcopy_loop
+
+.Lskip_copy:
 	or	%r3,%r3,%r3	/* ok now to high priority, lets boot */
 	lis	%r6,0x1
 	mtctr	%r6		/* delay a bit for slaves to catch up */
 	bdnz	.		/* before we overwrite 0-100 again */
 
-	bl	0f		/* Work out where we're running */
-0:	mflr	%r18
-
 	/* load device-tree address */
 	ld	%r3, (dt_offset - 0b)(%r18)
 	mr	%r16,%r3	/* save dt address in reg16 */
@@ -89,7 +115,6 @@ master:
 
 	rfid			/* update MSR and start kernel */
 
-
 	.balign 8
 	.globl kernel
 kernel:
@@ -102,6 +127,11 @@ dt_offset:
 	.8byte  0x0
 	.size dt_offset, . - dt_offset
 
+	.balign 8
+	.globl backup_start
+backup_start:
+	.8byte  0x0
+	.size backup_start, . - backup_start
 
 	.data
 	.balign 8



^ permalink raw reply related

* [PATCH v6 06/11] ppc64/kexec_file: restrict memory usage of kdump kernel
From: Hari Bathini @ 2020-07-29 11:42 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Pingfan Liu, Kexec-ml, Mimi Zohar, Nayna Jain, Petr Tesarik,
	Mahesh J Salgaonkar, Sourabh Jain, lkml, linuxppc-dev,
	Thiago Jung Bauermann, Andrew Morton, Dave Young, Vivek Goyal,
	Eric Biederman
In-Reply-To: <159602259854.575379.16910915605574571585.stgit@hbathini>

Kdump kernel, used for capturing the kernel core image, is supposed
to use only specific memory regions to avoid corrupting the image to
be captured. The regions are crashkernel range - the memory reserved
explicitly for kdump kernel, memory used for the tce-table, the OPAL
region and RTAS region as applicable. Restrict kdump kernel memory
to use only these regions by setting up usable-memory DT property.
Also, tell the kdump kernel to run at the loaded address by setting
the magic word at 0x5c.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Tested-by: Pingfan Liu <piliu@redhat.com>
Reviewed-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
---

v5 -> v6:
* Added Reviewed-by tag from Thiago.
* Avoided pass by reference count parameter in add_usable_mem() function
  by calculating the range count added from index value before & after it.
* Instead of trying to reinvent the wheel with get_node_path() &
  get_node_path_size() functions, used %pOF format as suggested by mpe.
* Used kernel types instead of uint32_t/uint64_t.
* and Dropped 'struct crash_mem *' member & added 'struct crash_mem_range *',
  nr_ranges & max_entries fields to 'struct umem_info' to avoid bit of
  a clutter in check_realloc_usable_mem() & add_usable_mem() functions.
* Updated the comment as to why 0 till crashk_res.start was needed to be
  added to usable memory ranges. Note that kexec-tools also has been
  doing the same thing.

v4 -> v5:
* Renamed get_node_pathlen() function to get_node_path_size() and
  handled root node separately to avoid off-by-one error in
  calculating string size.
* Updated get_node_path() in line with change in get_node_path_size().

v3 -> v4:
* Updated get_node_path() to be an iterative function instead of a
  recursive one.
* Added comment explaining why low memory is added to kdump kernel's
  usable memory ranges though it doesn't fall in crashkernel region.
* For correctness, added fdt_add_mem_rsv() for the low memory being
  added to kdump kernel's usable memory ranges.
* Fixed prop pointer update in add_usable_mem_property() and changed
  duple to tuple as suggested by Thiago.

v2 -> v3:
* Unchanged. Added Tested-by tag from Pingfan.

v1 -> v2:
* Fixed off-by-one error while setting up usable-memory properties.
* Updated add_rtas_mem_range() & add_opal_mem_range() callsites based on
  the new prototype for these functions.


 arch/powerpc/kexec/file_load_64.c |  386 +++++++++++++++++++++++++++++++++++++
 1 file changed, 385 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c
index d09c7724efa8..f94660874765 100644
--- a/arch/powerpc/kexec/file_load_64.c
+++ b/arch/powerpc/kexec/file_load_64.c
@@ -17,9 +17,23 @@
 #include <linux/kexec.h>
 #include <linux/of_fdt.h>
 #include <linux/libfdt.h>
+#include <linux/of_device.h>
 #include <linux/memblock.h>
+#include <linux/slab.h>
+#include <asm/drmem.h>
 #include <asm/kexec_ranges.h>
 
+struct umem_info {
+	u64 *buf;		/* data buffer for usable-memory property */
+	u32 size;		/* size allocated for the data buffer */
+	u32 max_entries;	/* maximum no. of entries */
+	u32 idx;		/* index of current entry */
+
+	/* usable memory ranges to look up */
+	unsigned int nr_ranges;
+	const struct crash_mem_range *ranges;
+};
+
 const struct kexec_file_ops * const kexec_file_loaders[] = {
 	&kexec_elf64_ops,
 	NULL
@@ -74,6 +88,44 @@ static int get_exclude_memory_ranges(struct crash_mem **mem_ranges)
 	return ret;
 }
 
+/**
+ * get_usable_memory_ranges - Get usable memory ranges. This list includes
+ *                            regions like crashkernel, opal/rtas & tce-table,
+ *                            that kdump kernel could use.
+ * @mem_ranges:               Range list to add the memory ranges to.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+static int get_usable_memory_ranges(struct crash_mem **mem_ranges)
+{
+	int ret;
+
+	/*
+	 * Early boot failure observed on guests when low memory (first memory
+	 * block?) is not added to usable memory. So, add [0, crashk_res.end]
+	 * instead of [crashk_res.start, crashk_res.end] to workaround it.
+	 * Also, crashed kernel's memory must be added to reserve map to
+	 * avoid kdump kernel from using it.
+	 */
+	ret = add_mem_range(mem_ranges, 0, crashk_res.end + 1);
+	if (ret)
+		goto out;
+
+	ret = add_rtas_mem_range(mem_ranges);
+	if (ret)
+		goto out;
+
+	ret = add_opal_mem_range(mem_ranges);
+	if (ret)
+		goto out;
+
+	ret = add_tce_mem_ranges(mem_ranges);
+out:
+	if (ret)
+		pr_err("Failed to setup usable memory ranges\n");
+	return ret;
+}
+
 /**
  * __locate_mem_hole_top_down - Looks top down for a large enough memory hole
  *                              in the memory regions between buf_min & buf_max
@@ -273,6 +325,286 @@ static int locate_mem_hole_bottom_up_ppc64(struct kexec_buf *kbuf,
 	return ret;
 }
 
+/**
+ * check_realloc_usable_mem - Reallocate buffer if it can't accommodate entries
+ * @um_info:                  Usable memory buffer and ranges info.
+ * @cnt:                      No. of entries to accommodate.
+ *
+ * Frees up the old buffer if memory reallocation fails.
+ *
+ * Returns buffer on success, NULL on error.
+ */
+static u64 *check_realloc_usable_mem(struct umem_info *um_info, int cnt)
+{
+	u32 new_size;
+	u64 *tbuf;
+
+	if ((um_info->idx + cnt) <= um_info->max_entries)
+		return um_info->buf;
+
+	new_size = um_info->size + MEM_RANGE_CHUNK_SZ;
+	tbuf = krealloc(um_info->buf, new_size, GFP_KERNEL);
+	if (tbuf) {
+		um_info->buf = tbuf;
+		um_info->size = new_size;
+		um_info->max_entries = (um_info->size / sizeof(u64));
+	}
+
+	return tbuf;
+}
+
+/**
+ * add_usable_mem - Add the usable memory ranges within the given memory range
+ *                  to the buffer
+ * @um_info:        Usable memory buffer and ranges info.
+ * @base:           Base address of memory range to look for.
+ * @end:            End address of memory range to look for.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+static int add_usable_mem(struct umem_info *um_info, u64 base, u64 end)
+{
+	u64 loc_base, loc_end;
+	bool add;
+	int i;
+
+	for (i = 0; i < um_info->nr_ranges; i++) {
+		add = false;
+		loc_base = um_info->ranges[i].start;
+		loc_end = um_info->ranges[i].end;
+		if (loc_base >= base && loc_end <= end)
+			add = true;
+		else if (base < loc_end && end > loc_base) {
+			if (loc_base < base)
+				loc_base = base;
+			if (loc_end > end)
+				loc_end = end;
+			add = true;
+		}
+
+		if (add) {
+			if (!check_realloc_usable_mem(um_info, 2))
+				return -ENOMEM;
+
+			um_info->buf[um_info->idx++] = cpu_to_be64(loc_base);
+			um_info->buf[um_info->idx++] =
+					cpu_to_be64(loc_end - loc_base + 1);
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * kdump_setup_usable_lmb - This is a callback function that gets called by
+ *                          walk_drmem_lmbs for every LMB to set its
+ *                          usable memory ranges.
+ * @lmb:                    LMB info.
+ * @usm:                    linux,drconf-usable-memory property value.
+ * @data:                   Pointer to usable memory buffer and ranges info.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+static int kdump_setup_usable_lmb(struct drmem_lmb *lmb, const __be32 **usm,
+				  void *data)
+{
+	struct umem_info *um_info;
+	int tmp_idx, ret;
+	u64 base, end;
+
+	/*
+	 * kdump load isn't supported on kernels already booted with
+	 * linux,drconf-usable-memory property.
+	 */
+	if (*usm) {
+		pr_err("linux,drconf-usable-memory property already exists!");
+		return -EINVAL;
+	}
+
+	um_info = data;
+	tmp_idx = um_info->idx;
+	if (!check_realloc_usable_mem(um_info, 1))
+		return -ENOMEM;
+
+	um_info->idx++;
+	base = lmb->base_addr;
+	end = base + drmem_lmb_size() - 1;
+	ret = add_usable_mem(um_info, base, end);
+	if (!ret) {
+		/*
+		 * Update the no. of ranges added. Two entries (base & size)
+		 * for every range added.
+		 */
+		um_info->buf[tmp_idx] =
+				cpu_to_be64((um_info->idx - tmp_idx - 1) / 2);
+	}
+
+	return ret;
+}
+
+#define NODE_PATH_LEN		256
+/**
+ * add_usable_mem_property - Add usable memory property for the given
+ *                           memory node.
+ * @fdt:                     Flattened device tree for the kdump kernel.
+ * @dn:                      Memory node.
+ * @um_info:                 Usable memory buffer and ranges info.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+static int add_usable_mem_property(void *fdt, struct device_node *dn,
+				   struct umem_info *um_info)
+{
+	int n_mem_addr_cells, n_mem_size_cells, node;
+	char path[NODE_PATH_LEN];
+	int i, len, ranges, ret;
+	const __be32 *prop;
+	u64 base, end;
+
+	of_node_get(dn);
+
+	if (snprintf(path, NODE_PATH_LEN, "%pOF", dn) > (NODE_PATH_LEN - 1)) {
+		pr_err("Buffer (%d) too small for memory node: %pOF\n",
+		       NODE_PATH_LEN, dn);
+		return -EOVERFLOW;
+	}
+	pr_debug("Memory node path: %s\n", path);
+
+	/* Now that we know the path, find its offset in kdump kernel's fdt */
+	node = fdt_path_offset(fdt, path);
+	if (node < 0) {
+		pr_err("Malformed device tree: error reading %s\n", path);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* Get the address & size cells */
+	n_mem_addr_cells = of_n_addr_cells(dn);
+	n_mem_size_cells = of_n_size_cells(dn);
+	pr_debug("address cells: %d, size cells: %d\n", n_mem_addr_cells,
+		 n_mem_size_cells);
+
+	um_info->idx  = 0;
+	if (!check_realloc_usable_mem(um_info, 2)) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	prop = of_get_property(dn, "reg", &len);
+	if (!prop || len <= 0) {
+		ret = 0;
+		goto out;
+	}
+
+	/*
+	 * "reg" property represents sequence of (addr,size) tuples
+	 * each representing a memory range.
+	 */
+	ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
+
+	for (i = 0; i < ranges; i++) {
+		base = of_read_number(prop, n_mem_addr_cells);
+		prop += n_mem_addr_cells;
+		end = base + of_read_number(prop, n_mem_size_cells) - 1;
+		prop += n_mem_size_cells;
+
+		ret = add_usable_mem(um_info, base, end);
+		if (ret)
+			goto out;
+	}
+
+	/*
+	 * No kdump kernel usable memory found in this memory node.
+	 * Write (0,0) tuple in linux,usable-memory property for
+	 * this region to be ignored.
+	 */
+	if (um_info->idx == 0) {
+		um_info->buf[0] = 0;
+		um_info->buf[1] = 0;
+		um_info->idx = 2;
+	}
+
+	ret = fdt_setprop(fdt, node, "linux,usable-memory", um_info->buf,
+			  (um_info->idx * sizeof(u64)));
+
+out:
+	of_node_put(dn);
+	return ret;
+}
+
+
+/**
+ * update_usable_mem_fdt - Updates kdump kernel's fdt with linux,usable-memory
+ *                         and linux,drconf-usable-memory DT properties as
+ *                         appropriate to restrict its memory usage.
+ * @fdt:                   Flattened device tree for the kdump kernel.
+ * @usable_mem:            Usable memory ranges for kdump kernel.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+static int update_usable_mem_fdt(void *fdt, struct crash_mem *usable_mem)
+{
+	struct umem_info um_info;
+	struct device_node *dn;
+	int node, ret = 0;
+
+	if (!usable_mem) {
+		pr_err("Usable memory ranges for kdump kernel not found\n");
+		return -ENOENT;
+	}
+
+	node = fdt_path_offset(fdt, "/ibm,dynamic-reconfiguration-memory");
+	if (node == -FDT_ERR_NOTFOUND)
+		pr_debug("No dynamic reconfiguration memory found\n");
+	else if (node < 0) {
+		pr_err("Malformed device tree: error reading /ibm,dynamic-reconfiguration-memory.\n");
+		return -EINVAL;
+	}
+
+	um_info.buf  = NULL;
+	um_info.size = 0;
+	um_info.max_entries = 0;
+	um_info.idx  = 0;
+	/* Memory ranges to look up */
+	um_info.ranges = &(usable_mem->ranges[0]);
+	um_info.nr_ranges = usable_mem->nr_ranges;
+
+	dn = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
+	if (dn) {
+		ret = walk_drmem_lmbs(dn, &um_info, kdump_setup_usable_lmb);
+		of_node_put(dn);
+
+		if (ret) {
+			pr_err("Could not setup linux,drconf-usable-memory property for kdump\n");
+			goto out;
+		}
+
+		ret = fdt_setprop(fdt, node, "linux,drconf-usable-memory",
+				  um_info.buf, (um_info.idx * sizeof(u64)));
+		if (ret) {
+			pr_err("Failed to update fdt with linux,drconf-usable-memory property");
+			goto out;
+		}
+	}
+
+	/*
+	 * Walk through each memory node and set linux,usable-memory property
+	 * for the corresponding node in kdump kernel's fdt.
+	 */
+	for_each_node_by_type(dn, "memory") {
+		ret = add_usable_mem_property(fdt, dn, &um_info);
+		if (ret) {
+			pr_err("Failed to set linux,usable-memory property for %s node",
+			       dn->full_name);
+			goto out;
+		}
+	}
+
+out:
+	kfree(um_info.buf);
+	return ret;
+}
+
 /**
  * setup_purgatory_ppc64 - initialize PPC64 specific purgatory's global
  *                         variables and call setup_purgatory() to initialize
@@ -293,6 +625,25 @@ int setup_purgatory_ppc64(struct kimage *image, const void *slave_code,
 
 	ret = setup_purgatory(image, slave_code, fdt, kernel_load_addr,
 			      fdt_load_addr);
+	if (ret)
+		goto out;
+
+	if (image->type == KEXEC_TYPE_CRASH) {
+		u32 my_run_at_load = 1;
+
+		/*
+		 * Tell relocatable kernel to run at load address
+		 * via the word meant for that at 0x5c.
+		 */
+		ret = kexec_purgatory_get_set_symbol(image, "run_at_load",
+						     &my_run_at_load,
+						     sizeof(my_run_at_load),
+						     false);
+		if (ret)
+			goto out;
+	}
+
+out:
 	if (ret)
 		pr_err("Failed to setup purgatory symbols");
 	return ret;
@@ -314,7 +665,40 @@ int setup_new_fdt_ppc64(const struct kimage *image, void *fdt,
 			unsigned long initrd_load_addr,
 			unsigned long initrd_len, const char *cmdline)
 {
-	return setup_new_fdt(image, fdt, initrd_load_addr, initrd_len, cmdline);
+	struct crash_mem *umem = NULL;
+	int ret;
+
+	ret = setup_new_fdt(image, fdt, initrd_load_addr, initrd_len, cmdline);
+	if (ret)
+		goto out;
+
+	/*
+	 * Restrict memory usage for kdump kernel by setting up
+	 * usable memory ranges.
+	 */
+	if (image->type == KEXEC_TYPE_CRASH) {
+		ret = get_usable_memory_ranges(&umem);
+		if (ret)
+			goto out;
+
+		ret = update_usable_mem_fdt(fdt, umem);
+		if (ret) {
+			pr_err("Error setting up usable-memory property for kdump kernel\n");
+			goto out;
+		}
+
+		/* Ensure we don't touch crashed kernel's memory */
+		ret = fdt_add_mem_rsv(fdt, 0, crashk_res.start);
+		if (ret) {
+			pr_err("Error reserving crash memory: %s\n",
+			       fdt_strerror(ret));
+			goto out;
+		}
+	}
+
+out:
+	kfree(umem);
+	return ret;
 }
 
 /**



^ permalink raw reply related

* Re: [PATCH 05/15] h8300, nds32, openrisc: simplify detection of memory extents
From: Stafford Horne @ 2020-07-29 11:41 UTC (permalink / raw)
  To: Mike Rapoport
  Cc: linux-sh, Peter Zijlstra, Dave Hansen, linux-mips, Max Filippov,
	Paul Mackerras, sparclinux, linux-riscv, Will Deacon,
	Christoph Hellwig, Marek Szyprowski, linux-s390, linux-c6x-dev,
	Yoshinori Sato, x86, Russell King, Mike Rapoport,
	clang-built-linux, Ingo Molnar, Catalin Marinas, uclinux-h8-devel,
	linux-xtensa, openrisc, Borislav Petkov, Andy Lutomirski,
	Paul Walmsley, Thomas Gleixner, linux-arm-kernel, Michal Simek,
	linux-mm, linux-kernel, iommu, Palmer Dabbelt, Andrew Morton,
	linuxppc-dev
In-Reply-To: <20200728051153.1590-6-rppt@kernel.org>

On Tue, Jul 28, 2020 at 08:11:43AM +0300, Mike Rapoport wrote:
> From: Mike Rapoport <rppt@linux.ibm.com>
> 
> Instead of traversing memblock.memory regions to find memory_start and
> memory_end, simply query memblock_{start,end}_of_DRAM().
> 
> Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> ---
>  arch/h8300/kernel/setup.c    | 8 +++-----
>  arch/nds32/kernel/setup.c    | 8 ++------
>  arch/openrisc/kernel/setup.c | 9 ++-------
>  3 files changed, 7 insertions(+), 18 deletions(-)

Hi Mike,

For the openrisc part:

Acked-by: Stafford Horne <shorne@gmail.com>

> --- a/arch/openrisc/kernel/setup.c
> +++ b/arch/openrisc/kernel/setup.c
> @@ -48,17 +48,12 @@ static void __init setup_memory(void)
>  	unsigned long ram_start_pfn;
>  	unsigned long ram_end_pfn;
>  	phys_addr_t memory_start, memory_end;
> -	struct memblock_region *region;
>  
>  	memory_end = memory_start = 0;
>  
>  	/* Find main memory where is the kernel, we assume its the only one */
> -	for_each_memblock(memory, region) {
> -		memory_start = region->base;
> -		memory_end = region->base + region->size;
> -		printk(KERN_INFO "%s: Memory: 0x%x-0x%x\n", __func__,
> -		       memory_start, memory_end);
> -	}
> +	memory_start = memblock_start_of_DRAM();
> +	memory_end = memblock_end_of_DRAM();
>  
>  	if (!memory_end) {
>  		panic("No memory!");
> -- 
> 2.26.2
> 

^ permalink raw reply

* [PATCH v6 05/11] powerpc/drmem: make lmb walk a bit more flexible
From: Hari Bathini @ 2020-07-29 11:40 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Pingfan Liu, Kexec-ml, Mimi Zohar, Nayna Jain, Petr Tesarik,
	Mahesh J Salgaonkar, Sourabh Jain, lkml, linuxppc-dev,
	Thiago Jung Bauermann, Andrew Morton, Dave Young, Vivek Goyal,
	Eric Biederman
In-Reply-To: <159602259854.575379.16910915605574571585.stgit@hbathini>

Currently, numa & prom are the users of drmem lmb walk code. Loading
kdump with kexec_file also needs to walk the drmem LMBs to setup the
usable memory ranges for kdump kernel. But there are couple of issues
in using the code as is. One, walk_drmem_lmb() code is built into the
.init section currently, while kexec_file needs it later. Two, there
is no scope to pass data to the callback function for processing and/
or erroring out on certain conditions.

Fix that by, moving drmem LMB walk code out of .init section, adding
scope to pass data to the callback function and bailing out when
an error is encountered in the callback function.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Tested-by: Pingfan Liu <piliu@redhat.com>
Reviewed-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
---

v5 -> v6:
* Unchanged.

v4 -> v5:
* Unchanged.

v3 -> v4:
* Unchanged. Added Reviewed-by tag from Thiago.

v2 -> v3:
* Unchanged. Added Tested-by tag from Pingfan.

v1 -> v2:
* No changes.


 arch/powerpc/include/asm/drmem.h |    9 ++--
 arch/powerpc/kernel/prom.c       |   13 +++---
 arch/powerpc/mm/drmem.c          |   87 +++++++++++++++++++++++++-------------
 arch/powerpc/mm/numa.c           |   13 +++---
 4 files changed, 78 insertions(+), 44 deletions(-)

diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h
index 414d209f45bb..17ccc6474ab6 100644
--- a/arch/powerpc/include/asm/drmem.h
+++ b/arch/powerpc/include/asm/drmem.h
@@ -90,13 +90,14 @@ static inline bool drmem_lmb_reserved(struct drmem_lmb *lmb)
 }
 
 u64 drmem_lmb_memory_max(void);
-void __init walk_drmem_lmbs(struct device_node *dn,
-			void (*func)(struct drmem_lmb *, const __be32 **));
+int walk_drmem_lmbs(struct device_node *dn, void *data,
+		    int (*func)(struct drmem_lmb *, const __be32 **, void *));
 int drmem_update_dt(void);
 
 #ifdef CONFIG_PPC_PSERIES
-void __init walk_drmem_lmbs_early(unsigned long node,
-			void (*func)(struct drmem_lmb *, const __be32 **));
+int __init
+walk_drmem_lmbs_early(unsigned long node, void *data,
+		      int (*func)(struct drmem_lmb *, const __be32 **, void *));
 #endif
 
 static inline void invalidate_lmb_associativity_index(struct drmem_lmb *lmb)
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index 9cc49f265c86..7df78de378b0 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -468,8 +468,9 @@ static bool validate_mem_limit(u64 base, u64 *size)
  * This contains a list of memory blocks along with NUMA affinity
  * information.
  */
-static void __init early_init_drmem_lmb(struct drmem_lmb *lmb,
-					const __be32 **usm)
+static int  __init early_init_drmem_lmb(struct drmem_lmb *lmb,
+					const __be32 **usm,
+					void *data)
 {
 	u64 base, size;
 	int is_kexec_kdump = 0, rngs;
@@ -484,7 +485,7 @@ static void __init early_init_drmem_lmb(struct drmem_lmb *lmb,
 	 */
 	if ((lmb->flags & DRCONF_MEM_RESERVED) ||
 	    !(lmb->flags & DRCONF_MEM_ASSIGNED))
-		return;
+		return 0;
 
 	if (*usm)
 		is_kexec_kdump = 1;
@@ -499,7 +500,7 @@ static void __init early_init_drmem_lmb(struct drmem_lmb *lmb,
 		 */
 		rngs = dt_mem_next_cell(dt_root_size_cells, usm);
 		if (!rngs) /* there are no (base, size) duple */
-			return;
+			return 0;
 	}
 
 	do {
@@ -524,6 +525,8 @@ static void __init early_init_drmem_lmb(struct drmem_lmb *lmb,
 		if (lmb->flags & DRCONF_MEM_HOTREMOVABLE)
 			memblock_mark_hotplug(base, size);
 	} while (--rngs);
+
+	return 0;
 }
 #endif /* CONFIG_PPC_PSERIES */
 
@@ -534,7 +537,7 @@ static int __init early_init_dt_scan_memory_ppc(unsigned long node,
 #ifdef CONFIG_PPC_PSERIES
 	if (depth == 1 &&
 	    strcmp(uname, "ibm,dynamic-reconfiguration-memory") == 0) {
-		walk_drmem_lmbs_early(node, early_init_drmem_lmb);
+		walk_drmem_lmbs_early(node, NULL, early_init_drmem_lmb);
 		return 0;
 	}
 #endif
diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c
index 59327cefbc6a..b2eeea39684c 100644
--- a/arch/powerpc/mm/drmem.c
+++ b/arch/powerpc/mm/drmem.c
@@ -14,6 +14,8 @@
 #include <asm/prom.h>
 #include <asm/drmem.h>
 
+static int n_root_addr_cells, n_root_size_cells;
+
 static struct drmem_lmb_info __drmem_info;
 struct drmem_lmb_info *drmem_info = &__drmem_info;
 
@@ -189,12 +191,13 @@ int drmem_update_dt(void)
 	return rc;
 }
 
-static void __init read_drconf_v1_cell(struct drmem_lmb *lmb,
+static void read_drconf_v1_cell(struct drmem_lmb *lmb,
 				       const __be32 **prop)
 {
 	const __be32 *p = *prop;
 
-	lmb->base_addr = dt_mem_next_cell(dt_root_addr_cells, &p);
+	lmb->base_addr = of_read_number(p, n_root_addr_cells);
+	p += n_root_addr_cells;
 	lmb->drc_index = of_read_number(p++, 1);
 
 	p++; /* skip reserved field */
@@ -205,29 +208,33 @@ static void __init read_drconf_v1_cell(struct drmem_lmb *lmb,
 	*prop = p;
 }
 
-static void __init __walk_drmem_v1_lmbs(const __be32 *prop, const __be32 *usm,
-			void (*func)(struct drmem_lmb *, const __be32 **))
+static int
+__walk_drmem_v1_lmbs(const __be32 *prop, const __be32 *usm, void *data,
+		     int (*func)(struct drmem_lmb *, const __be32 **, void *))
 {
 	struct drmem_lmb lmb;
 	u32 i, n_lmbs;
+	int ret = 0;
 
 	n_lmbs = of_read_number(prop++, 1);
-	if (n_lmbs == 0)
-		return;
-
 	for (i = 0; i < n_lmbs; i++) {
 		read_drconf_v1_cell(&lmb, &prop);
-		func(&lmb, &usm);
+		ret = func(&lmb, &usm, data);
+		if (ret)
+			break;
 	}
+
+	return ret;
 }
 
-static void __init read_drconf_v2_cell(struct of_drconf_cell_v2 *dr_cell,
+static void read_drconf_v2_cell(struct of_drconf_cell_v2 *dr_cell,
 				       const __be32 **prop)
 {
 	const __be32 *p = *prop;
 
 	dr_cell->seq_lmbs = of_read_number(p++, 1);
-	dr_cell->base_addr = dt_mem_next_cell(dt_root_addr_cells, &p);
+	dr_cell->base_addr = of_read_number(p, n_root_addr_cells);
+	p += n_root_addr_cells;
 	dr_cell->drc_index = of_read_number(p++, 1);
 	dr_cell->aa_index = of_read_number(p++, 1);
 	dr_cell->flags = of_read_number(p++, 1);
@@ -235,17 +242,16 @@ static void __init read_drconf_v2_cell(struct of_drconf_cell_v2 *dr_cell,
 	*prop = p;
 }
 
-static void __init __walk_drmem_v2_lmbs(const __be32 *prop, const __be32 *usm,
-			void (*func)(struct drmem_lmb *, const __be32 **))
+static int
+__walk_drmem_v2_lmbs(const __be32 *prop, const __be32 *usm, void *data,
+		     int (*func)(struct drmem_lmb *, const __be32 **, void *))
 {
 	struct of_drconf_cell_v2 dr_cell;
 	struct drmem_lmb lmb;
 	u32 i, j, lmb_sets;
+	int ret = 0;
 
 	lmb_sets = of_read_number(prop++, 1);
-	if (lmb_sets == 0)
-		return;
-
 	for (i = 0; i < lmb_sets; i++) {
 		read_drconf_v2_cell(&dr_cell, &prop);
 
@@ -259,21 +265,29 @@ static void __init __walk_drmem_v2_lmbs(const __be32 *prop, const __be32 *usm,
 			lmb.aa_index = dr_cell.aa_index;
 			lmb.flags = dr_cell.flags;
 
-			func(&lmb, &usm);
+			ret = func(&lmb, &usm, data);
+			if (ret)
+				break;
 		}
 	}
+
+	return ret;
 }
 
 #ifdef CONFIG_PPC_PSERIES
-void __init walk_drmem_lmbs_early(unsigned long node,
-			void (*func)(struct drmem_lmb *, const __be32 **))
+int __init walk_drmem_lmbs_early(unsigned long node, void *data,
+		int (*func)(struct drmem_lmb *, const __be32 **, void *))
 {
 	const __be32 *prop, *usm;
-	int len;
+	int len, ret = -ENODEV;
 
 	prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &len);
 	if (!prop || len < dt_root_size_cells * sizeof(__be32))
-		return;
+		return ret;
+
+	/* Get the address & size cells */
+	n_root_addr_cells = dt_root_addr_cells;
+	n_root_size_cells = dt_root_size_cells;
 
 	drmem_info->lmb_size = dt_mem_next_cell(dt_root_size_cells, &prop);
 
@@ -281,20 +295,21 @@ void __init walk_drmem_lmbs_early(unsigned long node,
 
 	prop = of_get_flat_dt_prop(node, "ibm,dynamic-memory", &len);
 	if (prop) {
-		__walk_drmem_v1_lmbs(prop, usm, func);
+		ret = __walk_drmem_v1_lmbs(prop, usm, data, func);
 	} else {
 		prop = of_get_flat_dt_prop(node, "ibm,dynamic-memory-v2",
 					   &len);
 		if (prop)
-			__walk_drmem_v2_lmbs(prop, usm, func);
+			ret = __walk_drmem_v2_lmbs(prop, usm, data, func);
 	}
 
 	memblock_dump_all();
+	return ret;
 }
 
 #endif
 
-static int __init init_drmem_lmb_size(struct device_node *dn)
+static int init_drmem_lmb_size(struct device_node *dn)
 {
 	const __be32 *prop;
 	int len;
@@ -303,12 +318,12 @@ static int __init init_drmem_lmb_size(struct device_node *dn)
 		return 0;
 
 	prop = of_get_property(dn, "ibm,lmb-size", &len);
-	if (!prop || len < dt_root_size_cells * sizeof(__be32)) {
+	if (!prop || len < n_root_size_cells * sizeof(__be32)) {
 		pr_info("Could not determine LMB size\n");
 		return -1;
 	}
 
-	drmem_info->lmb_size = dt_mem_next_cell(dt_root_size_cells, &prop);
+	drmem_info->lmb_size = of_read_number(prop, n_root_size_cells);
 	return 0;
 }
 
@@ -329,24 +344,36 @@ static const __be32 *of_get_usable_memory(struct device_node *dn)
 	return prop;
 }
 
-void __init walk_drmem_lmbs(struct device_node *dn,
-			    void (*func)(struct drmem_lmb *, const __be32 **))
+int walk_drmem_lmbs(struct device_node *dn, void *data,
+		    int (*func)(struct drmem_lmb *, const __be32 **, void *))
 {
 	const __be32 *prop, *usm;
+	int ret = -ENODEV;
+
+	if (!of_root)
+		return ret;
+
+	/* Get the address & size cells */
+	of_node_get(of_root);
+	n_root_addr_cells = of_n_addr_cells(of_root);
+	n_root_size_cells = of_n_size_cells(of_root);
+	of_node_put(of_root);
 
 	if (init_drmem_lmb_size(dn))
-		return;
+		return ret;
 
 	usm = of_get_usable_memory(dn);
 
 	prop = of_get_property(dn, "ibm,dynamic-memory", NULL);
 	if (prop) {
-		__walk_drmem_v1_lmbs(prop, usm, func);
+		ret = __walk_drmem_v1_lmbs(prop, usm, data, func);
 	} else {
 		prop = of_get_property(dn, "ibm,dynamic-memory-v2", NULL);
 		if (prop)
-			__walk_drmem_v2_lmbs(prop, usm, func);
+			ret = __walk_drmem_v2_lmbs(prop, usm, data, func);
 	}
+
+	return ret;
 }
 
 static void __init init_drmem_v1_lmbs(const __be32 *prop)
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 9fcf2d195830..88eb6894418d 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -644,8 +644,9 @@ static inline int __init read_usm_ranges(const __be32 **usm)
  * Extract NUMA information from the ibm,dynamic-reconfiguration-memory
  * node.  This assumes n_mem_{addr,size}_cells have been set.
  */
-static void __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
-					const __be32 **usm)
+static int __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
+					const __be32 **usm,
+					void *data)
 {
 	unsigned int ranges, is_kexec_kdump = 0;
 	unsigned long base, size, sz;
@@ -657,7 +658,7 @@ static void __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
 	 */
 	if ((lmb->flags & DRCONF_MEM_RESERVED)
 	    || !(lmb->flags & DRCONF_MEM_ASSIGNED))
-		return;
+		return 0;
 
 	if (*usm)
 		is_kexec_kdump = 1;
@@ -669,7 +670,7 @@ static void __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
 	if (is_kexec_kdump) {
 		ranges = read_usm_ranges(usm);
 		if (!ranges) /* there are no (base, size) duple */
-			return;
+			return 0;
 	}
 
 	do {
@@ -686,6 +687,8 @@ static void __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
 		if (sz)
 			memblock_set_node(base, sz, &memblock.memory, nid);
 	} while (--ranges);
+
+	return 0;
 }
 
 static int __init parse_numa_properties(void)
@@ -787,7 +790,7 @@ static int __init parse_numa_properties(void)
 	 */
 	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
 	if (memory) {
-		walk_drmem_lmbs(memory, numa_setup_drmem_lmb);
+		walk_drmem_lmbs(memory, NULL, numa_setup_drmem_lmb);
 		of_node_put(memory);
 	}
 



^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox