LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* Re: hugetlb: ensure hugepage access is denied if hugepages are not supported
From: Aneesh Kumar K.V @ 2014-04-04  0:42 UTC (permalink / raw)
  To: Nishanth Aravamudan, linux-mm; +Cc: paulus, anton, nyc, akpm, linuxppc-dev
In-Reply-To: <20140403231413.GB17412@linux.vnet.ibm.com>

Nishanth Aravamudan <nacc@linux.vnet.ibm.com> writes:

> In KVM guests on Power, in a guest not backed by hugepages, we see the
> following:
>
> AnonHugePages:         0 kB
> HugePages_Total:       0
> HugePages_Free:        0
> HugePages_Rsvd:        0
> HugePages_Surp:        0
> Hugepagesize:         64 kB
>
> HPAGE_SHIFT == 0 in this configuration, which indicates that hugepages
> are not supported at boot-time, but this is only checked in
> hugetlb_init(). Extract the check to a helper function, and use it in a
> few relevant places.
>
> This does make hugetlbfs not supported (not registered at all) in this
> environment. I believe this is fine, as there are no valid hugepages and
> that won't change at runtime.
>
> Signed-off-by: Nishanth Aravamudan <nacc@linux.vnet.ibm.com>

Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>

>
> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> index d19b30a..cc8fcc7 100644
> --- a/fs/hugetlbfs/inode.c
> +++ b/fs/hugetlbfs/inode.c
> @@ -1017,6 +1017,11 @@ static int __init init_hugetlbfs_fs(void)
>  	int error;
>  	int i;
>
> +	if (!hugepages_supported()) {
> +		printk(KERN_ERR "hugetlbfs: Disabling because there are no supported hugepage sizes\n");
> +		return -ENOTSUPP;
> +	}
> +
>  	error = bdi_init(&hugetlbfs_backing_dev_info);
>  	if (error)
>  		return error;
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 8c43cc4..0aea8de 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -450,4 +450,14 @@ static inline spinlock_t *huge_pte_lock(struct hstate *h,
>  	return ptl;
>  }
>
> +static inline bool hugepages_supported(void)
> +{
> +	/*
> +	 * Some platform decide whether they support huge pages at boot
> +	 * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
> +	 * there is no such support
> +	 */
> +	return HPAGE_SHIFT != 0;
> +}
> +
>  #endif /* _LINUX_HUGETLB_H */
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index c01cb9f..1c99585 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1949,11 +1949,7 @@ module_exit(hugetlb_exit);
>
>  static int __init hugetlb_init(void)
>  {
> -	/* Some platform decide whether they support huge pages at boot
> -	 * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
> -	 * there is no such support
> -	 */
> -	if (HPAGE_SHIFT == 0)
> +	if (!hugepages_supported())
>  		return 0;
>
>  	if (!size_to_hstate(default_hstate_size)) {
> @@ -2069,6 +2065,9 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
>  	unsigned long tmp;
>  	int ret;
>
> +	if (!hugepages_supported())
> +		return -ENOTSUPP;
> +
>  	tmp = h->max_huge_pages;
>
>  	if (write && h->order >= MAX_ORDER)
> @@ -2122,6 +2121,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
>  	unsigned long tmp;
>  	int ret;
>
> +	if (!hugepages_supported())
> +		return -ENOTSUPP;
> +
>  	tmp = h->nr_overcommit_huge_pages;
>
>  	if (write && h->order >= MAX_ORDER)
> @@ -2147,6 +2149,8 @@ out:
>  void hugetlb_report_meminfo(struct seq_file *m)
>  {
>  	struct hstate *h = &default_hstate;
> +	if (!hugepages_supported())
> +		return;
>  	seq_printf(m,
>  			"HugePages_Total:   %5lu\n"
>  			"HugePages_Free:    %5lu\n"
> @@ -2163,6 +2167,8 @@ void hugetlb_report_meminfo(struct seq_file *m)
>  int hugetlb_report_node_meminfo(int nid, char *buf)
>  {
>  	struct hstate *h = &default_hstate;
> +	if (!hugepages_supported())
> +		return 0;
>  	return sprintf(buf,
>  		"Node %d HugePages_Total: %5u\n"
>  		"Node %d HugePages_Free:  %5u\n"
> @@ -2177,6 +2183,9 @@ void hugetlb_show_meminfo(void)
>  	struct hstate *h;
>  	int nid;
>
> +	if (!hugepages_supported())
> +		return;
> +
>  	for_each_node_state(nid, N_MEMORY)
>  		for_each_hstate(h)
>  			pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",

^ permalink raw reply

* [PATCH v2 1/8] DMA: Freescale: remove the unnecessary FSL_DMA_LD_DEBUG
From: hongbo.zhang @ 2014-04-04  3:27 UTC (permalink / raw)
  To: vkoul, dan.j.williams, dmaengine
  Cc: scottwood, Hongbo Zhang, linuxppc-dev, linux-kernel
In-Reply-To: <1396582037-23065-1-git-send-email-hongbo.zhang@freescale.com>

From: Hongbo Zhang <hongbo.zhang@freescale.com>

Some codes are calling chan_dbg with FSL_DMA_LD_DEBUG surrounded, it is really
unnecessary to use such a macro because chan_dbg is a wrapper of dev_dbg, we do
have corresponding DEBUG macro to switch on/off dev_dbg, and most of the other
codes are also calling chan_dbg directly without using FSL_DMA_LD_DEBUG.

Signed-off-by: Hongbo Zhang <hongbo.zhang@freescale.com>
---
 drivers/dma/fsldma.c |   10 ----------
 1 file changed, 10 deletions(-)

diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c
index f157c6f..ec50420 100644
--- a/drivers/dma/fsldma.c
+++ b/drivers/dma/fsldma.c
@@ -426,9 +426,7 @@ static struct fsl_desc_sw *fsl_dma_alloc_descriptor(struct fsldma_chan *chan)
 	desc->async_tx.tx_submit = fsl_dma_tx_submit;
 	desc->async_tx.phys = pdesc;
 
-#ifdef FSL_DMA_LD_DEBUG
 	chan_dbg(chan, "LD %p allocated\n", desc);
-#endif
 
 	return desc;
 }
@@ -479,9 +477,7 @@ static void fsldma_free_desc_list(struct fsldma_chan *chan,
 
 	list_for_each_entry_safe(desc, _desc, list, node) {
 		list_del(&desc->node);
-#ifdef FSL_DMA_LD_DEBUG
 		chan_dbg(chan, "LD %p free\n", desc);
-#endif
 		dma_pool_free(chan->desc_pool, desc, desc->async_tx.phys);
 	}
 }
@@ -493,9 +489,7 @@ static void fsldma_free_desc_list_reverse(struct fsldma_chan *chan,
 
 	list_for_each_entry_safe_reverse(desc, _desc, list, node) {
 		list_del(&desc->node);
-#ifdef FSL_DMA_LD_DEBUG
 		chan_dbg(chan, "LD %p free\n", desc);
-#endif
 		dma_pool_free(chan->desc_pool, desc, desc->async_tx.phys);
 	}
 }
@@ -832,9 +826,7 @@ static void fsldma_cleanup_descriptor(struct fsldma_chan *chan,
 
 	/* Run the link descriptor callback function */
 	if (txd->callback) {
-#ifdef FSL_DMA_LD_DEBUG
 		chan_dbg(chan, "LD %p callback\n", desc);
-#endif
 		txd->callback(txd->callback_param);
 	}
 
@@ -842,9 +834,7 @@ static void fsldma_cleanup_descriptor(struct fsldma_chan *chan,
 	dma_run_dependencies(txd);
 
 	dma_descriptor_unmap(txd);
-#ifdef FSL_DMA_LD_DEBUG
 	chan_dbg(chan, "LD %p free\n", desc);
-#endif
 	dma_pool_free(chan->desc_pool, desc, txd->phys);
 }
 
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v2 2/8] DMA: Freescale: unify register access methods
From: hongbo.zhang @ 2014-04-04  3:27 UTC (permalink / raw)
  To: vkoul, dan.j.williams, dmaengine
  Cc: scottwood, Hongbo Zhang, linuxppc-dev, linux-kernel
In-Reply-To: <1396582037-23065-1-git-send-email-hongbo.zhang@freescale.com>

From: Hongbo Zhang <hongbo.zhang@freescale.com>

Methods of accessing DMA contorller registers are inconsistent, some registers
are accessed by DMA_IN/OUT directly, while others are accessed by functions
get/set_* which are wrappers of DMA_IN/OUT, and even for the BCR register, it
is read by get_bcr but written by DMA_OUT.
This patch unifies the inconsistent methods, all registers are accessed by
get/set_* now.

Signed-off-by: Hongbo Zhang <hongbo.zhang@freescale.com>
---
 drivers/dma/fsldma.c |   52 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 33 insertions(+), 19 deletions(-)

diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c
index ec50420..5f32cb8 100644
--- a/drivers/dma/fsldma.c
+++ b/drivers/dma/fsldma.c
@@ -61,6 +61,16 @@ static u32 get_sr(struct fsldma_chan *chan)
 	return DMA_IN(chan, &chan->regs->sr, 32);
 }
 
+static void set_mr(struct fsldma_chan *chan, u32 val)
+{
+	DMA_OUT(chan, &chan->regs->mr, val, 32);
+}
+
+static u32 get_mr(struct fsldma_chan *chan)
+{
+	return DMA_IN(chan, &chan->regs->mr, 32);
+}
+
 static void set_cdar(struct fsldma_chan *chan, dma_addr_t addr)
 {
 	DMA_OUT(chan, &chan->regs->cdar, addr | FSL_DMA_SNEN, 64);
@@ -71,6 +81,11 @@ static dma_addr_t get_cdar(struct fsldma_chan *chan)
 	return DMA_IN(chan, &chan->regs->cdar, 64) & ~FSL_DMA_SNEN;
 }
 
+static void set_bcr(struct fsldma_chan *chan, u32 val)
+{
+	DMA_OUT(chan, &chan->regs->bcr, val, 32);
+}
+
 static u32 get_bcr(struct fsldma_chan *chan)
 {
 	return DMA_IN(chan, &chan->regs->bcr, 32);
@@ -135,7 +150,7 @@ static void set_ld_eol(struct fsldma_chan *chan, struct fsl_desc_sw *desc)
 static void dma_init(struct fsldma_chan *chan)
 {
 	/* Reset the channel */
-	DMA_OUT(chan, &chan->regs->mr, 0, 32);
+	set_mr(chan, 0);
 
 	switch (chan->feature & FSL_DMA_IP_MASK) {
 	case FSL_DMA_IP_85XX:
@@ -144,16 +159,15 @@ static void dma_init(struct fsldma_chan *chan)
 		 * EOLNIE - End of links interrupt enable
 		 * BWC - Bandwidth sharing among channels
 		 */
-		DMA_OUT(chan, &chan->regs->mr, FSL_DMA_MR_BWC
-				| FSL_DMA_MR_EIE | FSL_DMA_MR_EOLNIE, 32);
+		set_mr(chan, FSL_DMA_MR_BWC | FSL_DMA_MR_EIE
+			| FSL_DMA_MR_EOLNIE);
 		break;
 	case FSL_DMA_IP_83XX:
 		/* Set the channel to below modes:
 		 * EOTIE - End-of-transfer interrupt enable
 		 * PRC_RM - PCI read multiple
 		 */
-		DMA_OUT(chan, &chan->regs->mr, FSL_DMA_MR_EOTIE
-				| FSL_DMA_MR_PRC_RM, 32);
+		set_mr(chan, FSL_DMA_MR_EOTIE | FSL_DMA_MR_PRC_RM);
 		break;
 	}
 }
@@ -175,10 +189,10 @@ static void dma_start(struct fsldma_chan *chan)
 {
 	u32 mode;
 
-	mode = DMA_IN(chan, &chan->regs->mr, 32);
+	mode = get_mr(chan);
 
 	if (chan->feature & FSL_DMA_CHAN_PAUSE_EXT) {
-		DMA_OUT(chan, &chan->regs->bcr, 0, 32);
+		set_bcr(chan, 0);
 		mode |= FSL_DMA_MR_EMP_EN;
 	} else {
 		mode &= ~FSL_DMA_MR_EMP_EN;
@@ -191,7 +205,7 @@ static void dma_start(struct fsldma_chan *chan)
 		mode |= FSL_DMA_MR_CS;
 	}
 
-	DMA_OUT(chan, &chan->regs->mr, mode, 32);
+	set_mr(chan, mode);
 }
 
 static void dma_halt(struct fsldma_chan *chan)
@@ -200,7 +214,7 @@ static void dma_halt(struct fsldma_chan *chan)
 	int i;
 
 	/* read the mode register */
-	mode = DMA_IN(chan, &chan->regs->mr, 32);
+	mode = get_mr(chan);
 
 	/*
 	 * The 85xx controller supports channel abort, which will stop
@@ -209,14 +223,14 @@ static void dma_halt(struct fsldma_chan *chan)
 	 */
 	if ((chan->feature & FSL_DMA_IP_MASK) == FSL_DMA_IP_85XX) {
 		mode |= FSL_DMA_MR_CA;
-		DMA_OUT(chan, &chan->regs->mr, mode, 32);
+		set_mr(chan, mode);
 
 		mode &= ~FSL_DMA_MR_CA;
 	}
 
 	/* stop the DMA controller */
 	mode &= ~(FSL_DMA_MR_CS | FSL_DMA_MR_EMS_EN);
-	DMA_OUT(chan, &chan->regs->mr, mode, 32);
+	set_mr(chan, mode);
 
 	/* wait for the DMA controller to become idle */
 	for (i = 0; i < 100; i++) {
@@ -245,7 +259,7 @@ static void fsl_chan_set_src_loop_size(struct fsldma_chan *chan, int size)
 {
 	u32 mode;
 
-	mode = DMA_IN(chan, &chan->regs->mr, 32);
+	mode = get_mr(chan);
 
 	switch (size) {
 	case 0:
@@ -259,7 +273,7 @@ static void fsl_chan_set_src_loop_size(struct fsldma_chan *chan, int size)
 		break;
 	}
 
-	DMA_OUT(chan, &chan->regs->mr, mode, 32);
+	set_mr(chan, mode);
 }
 
 /**
@@ -277,7 +291,7 @@ static void fsl_chan_set_dst_loop_size(struct fsldma_chan *chan, int size)
 {
 	u32 mode;
 
-	mode = DMA_IN(chan, &chan->regs->mr, 32);
+	mode = get_mr(chan);
 
 	switch (size) {
 	case 0:
@@ -291,7 +305,7 @@ static void fsl_chan_set_dst_loop_size(struct fsldma_chan *chan, int size)
 		break;
 	}
 
-	DMA_OUT(chan, &chan->regs->mr, mode, 32);
+	set_mr(chan, mode);
 }
 
 /**
@@ -312,10 +326,10 @@ static void fsl_chan_set_request_count(struct fsldma_chan *chan, int size)
 
 	BUG_ON(size > 1024);
 
-	mode = DMA_IN(chan, &chan->regs->mr, 32);
+	mode = get_mr(chan);
 	mode |= (__ilog2(size) << 24) & 0x0f000000;
 
-	DMA_OUT(chan, &chan->regs->mr, mode, 32);
+	set_mr(chan, mode);
 }
 
 /**
@@ -889,9 +903,9 @@ static void fsl_chan_xfer_ld_queue(struct fsldma_chan *chan)
 	if ((chan->feature & FSL_DMA_IP_MASK) == FSL_DMA_IP_85XX) {
 		u32 mode;
 
-		mode = DMA_IN(chan, &chan->regs->mr, 32);
+		mode = get_mr(chan);
 		mode &= ~FSL_DMA_MR_CS;
-		DMA_OUT(chan, &chan->regs->mr, mode, 32);
+		set_mr(chan, mode);
 	}
 
 	/*
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v2 0/8] DMA: Freescale: driver cleanups and enhancements
From: hongbo.zhang @ 2014-04-04  3:27 UTC (permalink / raw)
  To: vkoul, dan.j.williams, dmaengine
  Cc: scottwood, Hongbo Zhang, linuxppc-dev, linux-kernel

From: Hongbo Zhang <hongbo.zhang@freescale.com>

Hi Vinod Koul,
Please have a look at the v2 patch set.

v1 -> v2 change:
The only one change is introducing a new patch[1/7] to remove the unnecessary
macro FSL_DMA_LD_DEBUG, thus the total patches number is 8 now (was 7)

Hongbo Zhang (8):
  DMA: Freescale: remove the unnecessary FSL_DMA_LD_DEBUG
  DMA: Freescale: unify register access methods
  DMA: Freescale: remove attribute DMA_INTERRUPT of dmaengine
  DMA: Freescale: add fsl_dma_free_descriptor() to reduce code
    duplication
  DMA: Freescale: move functions to avoid forward declarations
  DMA: Freescale: change descriptor release process for supporting
    async_tx
  DMA: Freescale: use spin_lock_bh instead of spin_lock_irqsave
  DMA: Freescale: add suspend resume functions for DMA driver

 drivers/dma/fsldma.c |  590 ++++++++++++++++++++++++++++++++------------------
 drivers/dma/fsldma.h |   33 ++-
 2 files changed, 408 insertions(+), 215 deletions(-)

-- 
1.7.9.5

^ permalink raw reply

* [PATCH v2 3/8] DMA: Freescale: remove attribute DMA_INTERRUPT of dmaengine
From: hongbo.zhang @ 2014-04-04  3:27 UTC (permalink / raw)
  To: vkoul, dan.j.williams, dmaengine
  Cc: scottwood, Hongbo Zhang, linuxppc-dev, linux-kernel
In-Reply-To: <1396582037-23065-1-git-send-email-hongbo.zhang@freescale.com>

From: Hongbo Zhang <hongbo.zhang@freescale.com>

Delete attribute DMA_INTERRUPT because fsldma doesn't support this function,
exception will be thrown if talitos is used to offload xor at the same time.

Signed-off-by: Hongbo Zhang <hongbo.zhang@freescale.com>
Signed-off-by: Qiang Liu <qiang.liu@freescale.com>
---
 drivers/dma/fsldma.c |   31 -------------------------------
 1 file changed, 31 deletions(-)

diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c
index 5f32cb8..b71cc04 100644
--- a/drivers/dma/fsldma.c
+++ b/drivers/dma/fsldma.c
@@ -528,35 +528,6 @@ static void fsl_dma_free_chan_resources(struct dma_chan *dchan)
 }
 
 static struct dma_async_tx_descriptor *
-fsl_dma_prep_interrupt(struct dma_chan *dchan, unsigned long flags)
-{
-	struct fsldma_chan *chan;
-	struct fsl_desc_sw *new;
-
-	if (!dchan)
-		return NULL;
-
-	chan = to_fsl_chan(dchan);
-
-	new = fsl_dma_alloc_descriptor(chan);
-	if (!new) {
-		chan_err(chan, "%s\n", msg_ld_oom);
-		return NULL;
-	}
-
-	new->async_tx.cookie = -EBUSY;
-	new->async_tx.flags = flags;
-
-	/* Insert the link descriptor to the LD ring */
-	list_add_tail(&new->node, &new->tx_list);
-
-	/* Set End-of-link to the last link descriptor of new list */
-	set_ld_eol(chan, new);
-
-	return &new->async_tx;
-}
-
-static struct dma_async_tx_descriptor *
 fsl_dma_prep_memcpy(struct dma_chan *dchan,
 	dma_addr_t dma_dst, dma_addr_t dma_src,
 	size_t len, unsigned long flags)
@@ -1308,12 +1279,10 @@ static int fsldma_of_probe(struct platform_device *op)
 	fdev->irq = irq_of_parse_and_map(op->dev.of_node, 0);
 
 	dma_cap_set(DMA_MEMCPY, fdev->common.cap_mask);
-	dma_cap_set(DMA_INTERRUPT, fdev->common.cap_mask);
 	dma_cap_set(DMA_SG, fdev->common.cap_mask);
 	dma_cap_set(DMA_SLAVE, fdev->common.cap_mask);
 	fdev->common.device_alloc_chan_resources = fsl_dma_alloc_chan_resources;
 	fdev->common.device_free_chan_resources = fsl_dma_free_chan_resources;
-	fdev->common.device_prep_dma_interrupt = fsl_dma_prep_interrupt;
 	fdev->common.device_prep_dma_memcpy = fsl_dma_prep_memcpy;
 	fdev->common.device_prep_dma_sg = fsl_dma_prep_sg;
 	fdev->common.device_tx_status = fsl_tx_status;
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v2 4/8] DMA: Freescale: add fsl_dma_free_descriptor() to reduce code duplication
From: hongbo.zhang @ 2014-04-04  3:27 UTC (permalink / raw)
  To: vkoul, dan.j.williams, dmaengine
  Cc: scottwood, Hongbo Zhang, linuxppc-dev, linux-kernel
In-Reply-To: <1396582037-23065-1-git-send-email-hongbo.zhang@freescale.com>

From: Hongbo Zhang <hongbo.zhang@freescale.com>

There are several places where descriptors are freed using identical code.
This patch puts this code into a function to reduce code duplication.

Signed-off-by: Hongbo Zhang <hongbo.zhang@freescale.com>
Signed-off-by: Qiang Liu <qiang.liu@freescale.com>
---
 drivers/dma/fsldma.c |   30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c
index b71cc04..b5a0ffa 100644
--- a/drivers/dma/fsldma.c
+++ b/drivers/dma/fsldma.c
@@ -418,6 +418,19 @@ static dma_cookie_t fsl_dma_tx_submit(struct dma_async_tx_descriptor *tx)
 }
 
 /**
+ * fsl_dma_free_descriptor - Free descriptor from channel's DMA pool.
+ * @chan : Freescale DMA channel
+ * @desc: descriptor to be freed
+ */
+static void fsl_dma_free_descriptor(struct fsldma_chan *chan,
+		struct fsl_desc_sw *desc)
+{
+	list_del(&desc->node);
+	chan_dbg(chan, "LD %p free\n", desc);
+	dma_pool_free(chan->desc_pool, desc, desc->async_tx.phys);
+}
+
+/**
  * fsl_dma_alloc_descriptor - Allocate descriptor from channel's DMA pool.
  * @chan : Freescale DMA channel
  *
@@ -489,11 +502,8 @@ static void fsldma_free_desc_list(struct fsldma_chan *chan,
 {
 	struct fsl_desc_sw *desc, *_desc;
 
-	list_for_each_entry_safe(desc, _desc, list, node) {
-		list_del(&desc->node);
-		chan_dbg(chan, "LD %p free\n", desc);
-		dma_pool_free(chan->desc_pool, desc, desc->async_tx.phys);
-	}
+	list_for_each_entry_safe(desc, _desc, list, node)
+		fsl_dma_free_descriptor(chan, desc);
 }
 
 static void fsldma_free_desc_list_reverse(struct fsldma_chan *chan,
@@ -501,11 +511,8 @@ static void fsldma_free_desc_list_reverse(struct fsldma_chan *chan,
 {
 	struct fsl_desc_sw *desc, *_desc;
 
-	list_for_each_entry_safe_reverse(desc, _desc, list, node) {
-		list_del(&desc->node);
-		chan_dbg(chan, "LD %p free\n", desc);
-		dma_pool_free(chan->desc_pool, desc, desc->async_tx.phys);
-	}
+	list_for_each_entry_safe_reverse(desc, _desc, list, node)
+		fsl_dma_free_descriptor(chan, desc);
 }
 
 /**
@@ -819,8 +826,7 @@ static void fsldma_cleanup_descriptor(struct fsldma_chan *chan,
 	dma_run_dependencies(txd);
 
 	dma_descriptor_unmap(txd);
-	chan_dbg(chan, "LD %p free\n", desc);
-	dma_pool_free(chan->desc_pool, desc, txd->phys);
+	fsl_dma_free_descriptor(chan, desc);
 }
 
 /**
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v2 5/8] DMA: Freescale: move functions to avoid forward declarations
From: hongbo.zhang @ 2014-04-04  3:27 UTC (permalink / raw)
  To: vkoul, dan.j.williams, dmaengine
  Cc: scottwood, Hongbo Zhang, linuxppc-dev, linux-kernel
In-Reply-To: <1396582037-23065-1-git-send-email-hongbo.zhang@freescale.com>

From: Hongbo Zhang <hongbo.zhang@freescale.com>

These functions will be modified in the next patch in the series. By moving the
function in a patch separate from the changes, it will make review easier.

Signed-off-by: Hongbo Zhang <hongbo.zhang@freescale.com>
Signed-off-by: Qiang Liu <qiang.liu@freescale.com>
---
 drivers/dma/fsldma.c |  188 +++++++++++++++++++++++++-------------------------
 1 file changed, 94 insertions(+), 94 deletions(-)

diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c
index b5a0ffa..968877f 100644
--- a/drivers/dma/fsldma.c
+++ b/drivers/dma/fsldma.c
@@ -459,6 +459,100 @@ static struct fsl_desc_sw *fsl_dma_alloc_descriptor(struct fsldma_chan *chan)
 }
 
 /**
+ * fsl_chan_xfer_ld_queue - transfer any pending transactions
+ * @chan : Freescale DMA channel
+ *
+ * HARDWARE STATE: idle
+ * LOCKING: must hold chan->desc_lock
+ */
+static void fsl_chan_xfer_ld_queue(struct fsldma_chan *chan)
+{
+	struct fsl_desc_sw *desc;
+
+	/*
+	 * If the list of pending descriptors is empty, then we
+	 * don't need to do any work at all
+	 */
+	if (list_empty(&chan->ld_pending)) {
+		chan_dbg(chan, "no pending LDs\n");
+		return;
+	}
+
+	/*
+	 * The DMA controller is not idle, which means that the interrupt
+	 * handler will start any queued transactions when it runs after
+	 * this transaction finishes
+	 */
+	if (!chan->idle) {
+		chan_dbg(chan, "DMA controller still busy\n");
+		return;
+	}
+
+	/*
+	 * If there are some link descriptors which have not been
+	 * transferred, we need to start the controller
+	 */
+
+	/*
+	 * Move all elements from the queue of pending transactions
+	 * onto the list of running transactions
+	 */
+	chan_dbg(chan, "idle, starting controller\n");
+	desc = list_first_entry(&chan->ld_pending, struct fsl_desc_sw, node);
+	list_splice_tail_init(&chan->ld_pending, &chan->ld_running);
+
+	/*
+	 * The 85xx DMA controller doesn't clear the channel start bit
+	 * automatically at the end of a transfer. Therefore we must clear
+	 * it in software before starting the transfer.
+	 */
+	if ((chan->feature & FSL_DMA_IP_MASK) == FSL_DMA_IP_85XX) {
+		u32 mode;
+
+		mode = get_mr(chan);
+		mode &= ~FSL_DMA_MR_CS;
+		set_mr(chan, mode);
+	}
+
+	/*
+	 * Program the descriptor's address into the DMA controller,
+	 * then start the DMA transaction
+	 */
+	set_cdar(chan, desc->async_tx.phys);
+	get_cdar(chan);
+
+	dma_start(chan);
+	chan->idle = false;
+}
+
+/**
+ * fsldma_cleanup_descriptor - cleanup and free a single link descriptor
+ * @chan: Freescale DMA channel
+ * @desc: descriptor to cleanup and free
+ *
+ * This function is used on a descriptor which has been executed by the DMA
+ * controller. It will run any callbacks, submit any dependencies, and then
+ * free the descriptor.
+ */
+static void fsldma_cleanup_descriptor(struct fsldma_chan *chan,
+				      struct fsl_desc_sw *desc)
+{
+	struct dma_async_tx_descriptor *txd = &desc->async_tx;
+
+	/* Run the link descriptor callback function */
+	if (txd->callback) {
+		chan_dbg(chan, "LD %p callback\n", desc);
+		txd->callback(txd->callback_param);
+	}
+
+	/* Run any dependencies */
+	dma_run_dependencies(txd);
+
+	dma_descriptor_unmap(txd);
+	fsl_dma_free_descriptor(chan, desc);
+}
+
+/**
  * fsl_dma_alloc_chan_resources - Allocate resources for DMA channel.
  * @chan : Freescale DMA channel
  *
@@ -803,100 +897,6 @@ static int fsl_dma_device_control(struct dma_chan *dchan,
 }
 
 /**
- * fsldma_cleanup_descriptor - cleanup and free a single link descriptor
- * @chan: Freescale DMA channel
- * @desc: descriptor to cleanup and free
- *
- * This function is used on a descriptor which has been executed by the DMA
- * controller. It will run any callbacks, submit any dependencies, and then
- * free the descriptor.
- */
-static void fsldma_cleanup_descriptor(struct fsldma_chan *chan,
-				      struct fsl_desc_sw *desc)
-{
-	struct dma_async_tx_descriptor *txd = &desc->async_tx;
-
-	/* Run the link descriptor callback function */
-	if (txd->callback) {
-		chan_dbg(chan, "LD %p callback\n", desc);
-		txd->callback(txd->callback_param);
-	}
-
-	/* Run any dependencies */
-	dma_run_dependencies(txd);
-
-	dma_descriptor_unmap(txd);
-	fsl_dma_free_descriptor(chan, desc);
-}
-
-/**
- * fsl_chan_xfer_ld_queue - transfer any pending transactions
- * @chan : Freescale DMA channel
- *
- * HARDWARE STATE: idle
- * LOCKING: must hold chan->desc_lock
- */
-static void fsl_chan_xfer_ld_queue(struct fsldma_chan *chan)
-{
-	struct fsl_desc_sw *desc;
-
-	/*
-	 * If the list of pending descriptors is empty, then we
-	 * don't need to do any work at all
-	 */
-	if (list_empty(&chan->ld_pending)) {
-		chan_dbg(chan, "no pending LDs\n");
-		return;
-	}
-
-	/*
-	 * The DMA controller is not idle, which means that the interrupt
-	 * handler will start any queued transactions when it runs after
-	 * this transaction finishes
-	 */
-	if (!chan->idle) {
-		chan_dbg(chan, "DMA controller still busy\n");
-		return;
-	}
-
-	/*
-	 * If there are some link descriptors which have not been
-	 * transferred, we need to start the controller
-	 */
-
-	/*
-	 * Move all elements from the queue of pending transactions
-	 * onto the list of running transactions
-	 */
-	chan_dbg(chan, "idle, starting controller\n");
-	desc = list_first_entry(&chan->ld_pending, struct fsl_desc_sw, node);
-	list_splice_tail_init(&chan->ld_pending, &chan->ld_running);
-
-	/*
-	 * The 85xx DMA controller doesn't clear the channel start bit
-	 * automatically at the end of a transfer. Therefore we must clear
-	 * it in software before starting the transfer.
-	 */
-	if ((chan->feature & FSL_DMA_IP_MASK) == FSL_DMA_IP_85XX) {
-		u32 mode;
-
-		mode = get_mr(chan);
-		mode &= ~FSL_DMA_MR_CS;
-		set_mr(chan, mode);
-	}
-
-	/*
-	 * Program the descriptor's address into the DMA controller,
-	 * then start the DMA transaction
-	 */
-	set_cdar(chan, desc->async_tx.phys);
-	get_cdar(chan);
-
-	dma_start(chan);
-	chan->idle = false;
-}
-
-/**
  * fsl_dma_memcpy_issue_pending - Issue the DMA start command
  * @chan : Freescale DMA channel
  */
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v2 6/8] DMA: Freescale: change descriptor release process for supporting async_tx
From: hongbo.zhang @ 2014-04-04  3:27 UTC (permalink / raw)
  To: vkoul, dan.j.williams, dmaengine
  Cc: scottwood, Hongbo Zhang, linuxppc-dev, linux-kernel
In-Reply-To: <1396582037-23065-1-git-send-email-hongbo.zhang@freescale.com>

From: Hongbo Zhang <hongbo.zhang@freescale.com>

Fix the potential risk when enable config NET_DMA and ASYNC_TX. Async_tx is
lack of support in current release process of dma descriptor, all descriptors
will be released whatever is acked or no-acked by async_tx, so there is a
potential race condition when dma engine is uesd by others clients (e.g. when
enable NET_DMA to offload TCP).

In our case, a race condition which is raised when use both of talitos and
dmaengine to offload xor is because napi scheduler will sync all pending
requests in dma channels, it affects the process of raid operations due to
ack_tx is not checked in fsl dma. The no-acked descriptor is freed which is
submitted just now, as a dependent tx, this freed descriptor trigger
BUG_ON(async_tx_test_ack(depend_tx)) in async_tx_submit().

TASK = ee1a94a0[1390] 'md0_raid5' THREAD: ecf40000 CPU: 0
GPR00: 00000001 ecf41ca0 ee44/921a94a0 0000003f 00000001 c00593e4 00000000 00000001
GPR08: 00000000 a7a7a7a7 00000001 045/920000002 42028042 100a38d4 ed576d98 00000000
GPR16: ed5a11b0 00000000 2b162000 00000200 046/920000000 2d555000 ed3015e8 c15a7aa0
GPR24: 00000000 c155fc40 00000000 ecb63220 ecf41d28 e47/92f640bb0 ef640c30 ecf41ca0
NIP [c02b048c] async_tx_submit+0x6c/0x2b4
LR [c02b068c] async_tx_submit+0x26c/0x2b4
Call Trace:
[ecf41ca0] [c02b068c] async_tx_submit+0x26c/0x2b448/92 (unreliable)
[ecf41cd0] [c02b0a4c] async_memcpy+0x240/0x25c
[ecf41d20] [c0421064] async_copy_data+0xa0/0x17c
[ecf41d70] [c0421cf4] __raid_run_ops+0x874/0xe10
[ecf41df0] [c0426ee4] handle_stripe+0x820/0x25e8
[ecf41e90] [c0429080] raid5d+0x3d4/0x5b4
[ecf41f40] [c04329b8] md_thread+0x138/0x16c
[ecf41f90] [c008277c] kthread+0x8c/0x90
[ecf41ff0] [c0011630] kernel_thread+0x4c/0x68

Another modification in this patch is the change of completed descriptors,
there is a potential risk which caused by exception interrupt, all descriptors
in ld_running list are seemed completed when an interrupt raised, it works fine
under normal condition, but if there is an exception occured, it cannot work as
our excepted. Hardware should not be depend on s/w list, the right way is to
read current descriptor address register to find the last completed descriptor.
If an interrupt is raised by an error, all descriptors in ld_running should not
be seemed finished, or these unfinished descriptors in ld_running will be
released wrongly.

A simple way to reproduce:
Enable dmatest first, then insert some bad descriptors which can trigger
Programming Error interrupts before the good descriptors. Last, the good
descriptors will be freed before they are processsed because of the exception
intrerrupt.

Note: the bad descriptors are only for simulating an exception interrupt.  This
case can illustrate the potential risk in current fsl-dma very well.

Signed-off-by: Hongbo Zhang <hongbo.zhang@freescale.com>
Signed-off-by: Qiang Liu <qiang.liu@freescale.com>
Signed-off-by: Ira W. Snyder <iws@ovro.caltech.edu>
---
 drivers/dma/fsldma.c |  195 ++++++++++++++++++++++++++++++++++++--------------
 drivers/dma/fsldma.h |   17 ++++-
 2 files changed, 158 insertions(+), 54 deletions(-)

diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c
index 968877f..f8eee60 100644
--- a/drivers/dma/fsldma.c
+++ b/drivers/dma/fsldma.c
@@ -459,6 +459,87 @@ static struct fsl_desc_sw *fsl_dma_alloc_descriptor(struct fsldma_chan *chan)
 }
 
 /**
+ * fsldma_clean_completed_descriptor - free all descriptors which
+ * has been completed and acked
+ * @chan: Freescale DMA channel
+ *
+ * This function is used on all completed and acked descriptors.
+ * All descriptors should only be freed in this function.
+ */
+static void fsldma_clean_completed_descriptor(struct fsldma_chan *chan)
+{
+	struct fsl_desc_sw *desc, *_desc;
+
+	/* Run the callback for each descriptor, in order */
+	list_for_each_entry_safe(desc, _desc, &chan->ld_completed, node)
+		if (async_tx_test_ack(&desc->async_tx))
+			fsl_dma_free_descriptor(chan, desc);
+}
+
+/**
+ * fsldma_run_tx_complete_actions - cleanup a single link descriptor
+ * @chan: Freescale DMA channel
+ * @desc: descriptor to cleanup and free
+ * @cookie: Freescale DMA transaction identifier
+ *
+ * This function is used on a descriptor which has been executed by the DMA
+ * controller. It will run any callbacks, submit any dependencies.
+ */
+static dma_cookie_t fsldma_run_tx_complete_actions(struct fsldma_chan *chan,
+		struct fsl_desc_sw *desc, dma_cookie_t cookie)
+{
+	struct dma_async_tx_descriptor *txd = &desc->async_tx;
+
+	BUG_ON(txd->cookie < 0);
+
+	if (txd->cookie > 0) {
+		cookie = txd->cookie;
+
+		/* Run the link descriptor callback function */
+		if (txd->callback) {
+			chan_dbg(chan, "LD %p callback\n", desc);
+			txd->callback(txd->callback_param);
+		}
+	}
+
+	/* Run any dependencies */
+	dma_run_dependencies(txd);
+
+	return cookie;
+}
+
+/**
+ * fsldma_clean_running_descriptor - move the completed descriptor from
+ * ld_running to ld_completed
+ * @chan: Freescale DMA channel
+ * @desc: the descriptor which is completed
+ *
+ * Free the descriptor directly if acked by async_tx api, or move it to
+ * queue ld_completed.
+ */
+static void fsldma_clean_running_descriptor(struct fsldma_chan *chan,
+		struct fsl_desc_sw *desc)
+{
+	/* Remove from the list of transactions */
+	list_del(&desc->node);
+
+	/*
+	 * the client is allowed to attach dependent operations
+	 * until 'ack' is set
+	 */
+	if (!async_tx_test_ack(&desc->async_tx)) {
+		/*
+		 * Move this descriptor to the list of descriptors which is
+		 * completed, but still awaiting the 'ack' bit to be set.
+		 */
+		list_add_tail(&desc->node, &chan->ld_completed);
+		return;
+	}
+
+	dma_pool_free(chan->desc_pool, desc, desc->async_tx.phys);
+}
+
+/**
  * fsl_chan_xfer_ld_queue - transfer any pending transactions
  * @chan : Freescale DMA channel
  *
@@ -526,30 +607,58 @@ static void fsl_chan_xfer_ld_queue(struct fsldma_chan *chan)
 }
 
 /**
- * fsldma_cleanup_descriptor - cleanup and free a single link descriptor
+ * fsldma_cleanup_descriptors - cleanup link descriptors which are completed
+ * and move them to ld_completed to free until flag 'ack' is set
  * @chan: Freescale DMA channel
- * @desc: descriptor to cleanup and free
  *
- * This function is used on a descriptor which has been executed by the DMA
- * controller. It will run any callbacks, submit any dependencies, and then
- * free the descriptor.
+ * This function is used on descriptors which have been executed by the DMA
+ * controller. It will run any callbacks, submit any dependencies, then
+ * free these descriptors if flag 'ack' is set.
  */
-static void fsldma_cleanup_descriptor(struct fsldma_chan *chan,
-				      struct fsl_desc_sw *desc)
+static void fsldma_cleanup_descriptors(struct fsldma_chan *chan)
 {
-	struct dma_async_tx_descriptor *txd = &desc->async_tx;
+	struct fsl_desc_sw *desc, *_desc;
+	dma_cookie_t cookie = 0;
+	dma_addr_t curr_phys = get_cdar(chan);
+	int seen_current = 0;
+
+	fsldma_clean_completed_descriptor(chan);
+
+	/* Run the callback for each descriptor, in order */
+	list_for_each_entry_safe(desc, _desc, &chan->ld_running, node) {
+		/*
+		 * do not advance past the current descriptor loaded into the
+		 * hardware channel, subsequent descriptors are either in
+		 * process or have not been submitted
+		 */
+		if (seen_current)
+			break;
+
+		/*
+		 * stop the search if we reach the current descriptor and the
+		 * channel is busy
+		 */
+		if (desc->async_tx.phys == curr_phys) {
+			seen_current = 1;
+			if (!dma_is_idle(chan))
+				break;
+		}
+
+		cookie = fsldma_run_tx_complete_actions(chan, desc, cookie);
 
-	/* Run the link descriptor callback function */
-	if (txd->callback) {
-		chan_dbg(chan, "LD %p callback\n", desc);
-		txd->callback(txd->callback_param);
+		fsldma_clean_running_descriptor(chan, desc);
 	}
 
-	/* Run any dependencies */
-	dma_run_dependencies(txd);
+	/*
+	 * Start any pending transactions automatically
+	 *
+	 * In the ideal case, we keep the DMA controller busy while we go
+	 * ahead and free the descriptors below.
+	 */
+	fsl_chan_xfer_ld_queue(chan);
 
-	dma_descriptor_unmap(txd);
-	fsl_dma_free_descriptor(chan, desc);
+	if (cookie > 0)
+		chan->common.completed_cookie = cookie;
 }
 
 /**
@@ -620,8 +729,10 @@ static void fsl_dma_free_chan_resources(struct dma_chan *dchan)
 
 	chan_dbg(chan, "free all channel resources\n");
 	spin_lock_irqsave(&chan->desc_lock, flags);
+	fsldma_cleanup_descriptors(chan);
 	fsldma_free_desc_list(chan, &chan->ld_pending);
 	fsldma_free_desc_list(chan, &chan->ld_running);
+	fsldma_free_desc_list(chan, &chan->ld_completed);
 	spin_unlock_irqrestore(&chan->desc_lock, flags);
 
 	dma_pool_destroy(chan->desc_pool);
@@ -859,6 +970,7 @@ static int fsl_dma_device_control(struct dma_chan *dchan,
 		/* Remove and free all of the descriptors in the LD queue */
 		fsldma_free_desc_list(chan, &chan->ld_pending);
 		fsldma_free_desc_list(chan, &chan->ld_running);
+		fsldma_free_desc_list(chan, &chan->ld_completed);
 		chan->idle = true;
 
 		spin_unlock_irqrestore(&chan->desc_lock, flags);
@@ -918,6 +1030,17 @@ static enum dma_status fsl_tx_status(struct dma_chan *dchan,
 					dma_cookie_t cookie,
 					struct dma_tx_state *txstate)
 {
+	struct fsldma_chan *chan = to_fsl_chan(dchan);
+	enum dma_status ret;
+
+	ret = dma_cookie_status(dchan, cookie, txstate);
+	if (ret == DMA_COMPLETE)
+		return ret;
+
+	spin_lock_bh(&chan->desc_lock);
+	fsldma_cleanup_descriptors(chan);
+	spin_unlock_bh(&chan->desc_lock);
+
 	return dma_cookie_status(dchan, cookie, txstate);
 }
 
@@ -995,52 +1118,19 @@ static irqreturn_t fsldma_chan_irq(int irq, void *data)
 static void dma_do_tasklet(unsigned long data)
 {
 	struct fsldma_chan *chan = (struct fsldma_chan *)data;
-	struct fsl_desc_sw *desc, *_desc;
-	LIST_HEAD(ld_cleanup);
 	unsigned long flags;
 
 	chan_dbg(chan, "tasklet entry\n");
 
 	spin_lock_irqsave(&chan->desc_lock, flags);
 
-	/* update the cookie if we have some descriptors to cleanup */
-	if (!list_empty(&chan->ld_running)) {
-		dma_cookie_t cookie;
-
-		desc = to_fsl_desc(chan->ld_running.prev);
-		cookie = desc->async_tx.cookie;
-		dma_cookie_complete(&desc->async_tx);
-
-		chan_dbg(chan, "completed_cookie=%d\n", cookie);
-	}
-
-	/*
-	 * move the descriptors to a temporary list so we can drop the lock
-	 * during the entire cleanup operation
-	 */
-	list_splice_tail_init(&chan->ld_running, &ld_cleanup);
-
 	/* the hardware is now idle and ready for more */
 	chan->idle = true;
 
-	/*
-	 * Start any pending transactions automatically
-	 *
-	 * In the ideal case, we keep the DMA controller busy while we go
-	 * ahead and free the descriptors below.
-	 */
-	fsl_chan_xfer_ld_queue(chan);
-	spin_unlock_irqrestore(&chan->desc_lock, flags);
+	/* Run all cleanup for descriptors which have been completed */
+	fsldma_cleanup_descriptors(chan);
 
-	/* Run the callback for each descriptor, in order */
-	list_for_each_entry_safe(desc, _desc, &ld_cleanup, node) {
-
-		/* Remove from the list of transactions */
-		list_del(&desc->node);
-
-		/* Run all cleanup for this descriptor */
-		fsldma_cleanup_descriptor(chan, desc);
-	}
+	spin_unlock_irqrestore(&chan->desc_lock, flags);
 
 	chan_dbg(chan, "tasklet exit\n");
 }
@@ -1224,6 +1314,7 @@ static int fsl_dma_chan_probe(struct fsldma_device *fdev,
 	spin_lock_init(&chan->desc_lock);
 	INIT_LIST_HEAD(&chan->ld_pending);
 	INIT_LIST_HEAD(&chan->ld_running);
+	INIT_LIST_HEAD(&chan->ld_completed);
 	chan->idle = true;
 
 	chan->common.device = &fdev->common;
diff --git a/drivers/dma/fsldma.h b/drivers/dma/fsldma.h
index d56e835..ec19517 100644
--- a/drivers/dma/fsldma.h
+++ b/drivers/dma/fsldma.h
@@ -138,8 +138,21 @@ struct fsldma_chan {
 	char name[8];			/* Channel name */
 	struct fsldma_chan_regs __iomem *regs;
 	spinlock_t desc_lock;		/* Descriptor operation lock */
-	struct list_head ld_pending;	/* Link descriptors queue */
-	struct list_head ld_running;	/* Link descriptors queue */
+	/*
+	 * Descriptors which are queued to run, but have not yet been
+	 * submitted to the hardware for execution
+	 */
+	struct list_head ld_pending;
+	/*
+	 * Descriptors which are currently being executed by the hardware
+	 */
+	struct list_head ld_running;
+	/*
+	 * Descriptors which have finished execution by the hardware. These
+	 * descriptors have already had their cleanup actions run. They are
+	 * waiting for the ACK bit to be set by the async_tx API.
+	 */
+	struct list_head ld_completed;	/* Link descriptors queue */
 	struct dma_chan common;		/* DMA common channel */
 	struct dma_pool *desc_pool;	/* Descriptors pool */
 	struct device *dev;		/* Channel device */
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v2 7/8] DMA: Freescale: use spin_lock_bh instead of spin_lock_irqsave
From: hongbo.zhang @ 2014-04-04  3:27 UTC (permalink / raw)
  To: vkoul, dan.j.williams, dmaengine
  Cc: scottwood, Hongbo Zhang, linuxppc-dev, linux-kernel
In-Reply-To: <1396582037-23065-1-git-send-email-hongbo.zhang@freescale.com>

From: Hongbo Zhang <hongbo.zhang@freescale.com>

The usage of spin_lock_irqsave() is a stronger locking mechanism than is
required throughout the driver. The minimum locking required should be used
instead. Interrupts will be turned off and context will be saved, it is
unnecessary to use irqsave.

This patch changes all instances of spin_lock_irqsave() to spin_lock_bh(). All
manipulation of protected fields is done using tasklet context or weaker, which
makes spin_lock_bh() the correct choice.

Signed-off-by: Hongbo Zhang <hongbo.zhang@freescale.com>
Signed-off-by: Qiang Liu <qiang.liu@freescale.com>
---
 drivers/dma/fsldma.c |   25 ++++++++++---------------
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c
index f8eee60..c9bf54a 100644
--- a/drivers/dma/fsldma.c
+++ b/drivers/dma/fsldma.c
@@ -396,10 +396,9 @@ static dma_cookie_t fsl_dma_tx_submit(struct dma_async_tx_descriptor *tx)
 	struct fsldma_chan *chan = to_fsl_chan(tx->chan);
 	struct fsl_desc_sw *desc = tx_to_fsl_desc(tx);
 	struct fsl_desc_sw *child;
-	unsigned long flags;
 	dma_cookie_t cookie = -EINVAL;
 
-	spin_lock_irqsave(&chan->desc_lock, flags);
+	spin_lock_bh(&chan->desc_lock);
 
 	/*
 	 * assign cookies to all of the software descriptors
@@ -412,7 +411,7 @@ static dma_cookie_t fsl_dma_tx_submit(struct dma_async_tx_descriptor *tx)
 	/* put this transaction onto the tail of the pending queue */
 	append_ld_queue(chan, desc);
 
-	spin_unlock_irqrestore(&chan->desc_lock, flags);
+	spin_unlock_bh(&chan->desc_lock);
 
 	return cookie;
 }
@@ -725,15 +724,14 @@ static void fsldma_free_desc_list_reverse(struct fsldma_chan *chan,
 static void fsl_dma_free_chan_resources(struct dma_chan *dchan)
 {
 	struct fsldma_chan *chan = to_fsl_chan(dchan);
-	unsigned long flags;
 
 	chan_dbg(chan, "free all channel resources\n");
-	spin_lock_irqsave(&chan->desc_lock, flags);
+	spin_lock_bh(&chan->desc_lock);
 	fsldma_cleanup_descriptors(chan);
 	fsldma_free_desc_list(chan, &chan->ld_pending);
 	fsldma_free_desc_list(chan, &chan->ld_running);
 	fsldma_free_desc_list(chan, &chan->ld_completed);
-	spin_unlock_irqrestore(&chan->desc_lock, flags);
+	spin_unlock_bh(&chan->desc_lock);
 
 	dma_pool_destroy(chan->desc_pool);
 	chan->desc_pool = NULL;
@@ -952,7 +950,6 @@ static int fsl_dma_device_control(struct dma_chan *dchan,
 {
 	struct dma_slave_config *config;
 	struct fsldma_chan *chan;
-	unsigned long flags;
 	int size;
 
 	if (!dchan)
@@ -962,7 +959,7 @@ static int fsl_dma_device_control(struct dma_chan *dchan,
 
 	switch (cmd) {
 	case DMA_TERMINATE_ALL:
-		spin_lock_irqsave(&chan->desc_lock, flags);
+		spin_lock_bh(&chan->desc_lock);
 
 		/* Halt the DMA engine */
 		dma_halt(chan);
@@ -973,7 +970,7 @@ static int fsl_dma_device_control(struct dma_chan *dchan,
 		fsldma_free_desc_list(chan, &chan->ld_completed);
 		chan->idle = true;
 
-		spin_unlock_irqrestore(&chan->desc_lock, flags);
+		spin_unlock_bh(&chan->desc_lock);
 		return 0;
 
 	case DMA_SLAVE_CONFIG:
@@ -1015,11 +1012,10 @@ static int fsl_dma_device_control(struct dma_chan *dchan,
 static void fsl_dma_memcpy_issue_pending(struct dma_chan *dchan)
 {
 	struct fsldma_chan *chan = to_fsl_chan(dchan);
-	unsigned long flags;
 
-	spin_lock_irqsave(&chan->desc_lock, flags);
+	spin_lock_bh(&chan->desc_lock);
 	fsl_chan_xfer_ld_queue(chan);
-	spin_unlock_irqrestore(&chan->desc_lock, flags);
+	spin_unlock_bh(&chan->desc_lock);
 }
 
 /**
@@ -1118,11 +1114,10 @@ static irqreturn_t fsldma_chan_irq(int irq, void *data)
 static void dma_do_tasklet(unsigned long data)
 {
 	struct fsldma_chan *chan = (struct fsldma_chan *)data;
-	unsigned long flags;
 
 	chan_dbg(chan, "tasklet entry\n");
 
-	spin_lock_irqsave(&chan->desc_lock, flags);
+	spin_lock_bh(&chan->desc_lock);
 
 	/* the hardware is now idle and ready for more */
 	chan->idle = true;
@@ -1130,7 +1125,7 @@ static void dma_do_tasklet(unsigned long data)
 	/* Run all cleanup for descriptors which have been completed */
 	fsldma_cleanup_descriptors(chan);
 
-	spin_unlock_irqrestore(&chan->desc_lock, flags);
+	spin_unlock_bh(&chan->desc_lock);
 
 	chan_dbg(chan, "tasklet exit\n");
 }
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v2 8/8] DMA: Freescale: add suspend resume functions for DMA driver
From: hongbo.zhang @ 2014-04-04  3:27 UTC (permalink / raw)
  To: vkoul, dan.j.williams, dmaengine
  Cc: scottwood, Hongbo Zhang, linuxppc-dev, linux-kernel
In-Reply-To: <1396582037-23065-1-git-send-email-hongbo.zhang@freescale.com>

From: Hongbo Zhang <hongbo.zhang@freescale.com>

This patch adds suspend resume functions for Freescale DMA driver.
.prepare callback is used to stop further descriptors from being added into the
pending queue, and also issue pending queues into execution if there is any.
.suspend callback makes sure all the pending jobs are cleaned up and all the
channels are idle, and save the mode registers.
.resume callback re-initializes the channels by restore the mode registers.

Signed-off-by: Hongbo Zhang <hongbo.zhang@freescale.com>
---
 drivers/dma/fsldma.c |   99 ++++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/dma/fsldma.h |   16 ++++++++
 2 files changed, 115 insertions(+)

diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c
index c9bf54a..91482d2 100644
--- a/drivers/dma/fsldma.c
+++ b/drivers/dma/fsldma.c
@@ -400,6 +400,14 @@ static dma_cookie_t fsl_dma_tx_submit(struct dma_async_tx_descriptor *tx)
 
 	spin_lock_bh(&chan->desc_lock);
 
+#ifdef CONFIG_PM
+	if (unlikely(chan->pm_state != RUNNING)) {
+		chan_dbg(chan, "cannot submit due to suspend\n");
+		spin_unlock_bh(&chan->desc_lock);
+		return -1;
+	}
+#endif
+
 	/*
 	 * assign cookies to all of the software descriptors
 	 * that make up this transaction
@@ -1311,6 +1319,9 @@ static int fsl_dma_chan_probe(struct fsldma_device *fdev,
 	INIT_LIST_HEAD(&chan->ld_running);
 	INIT_LIST_HEAD(&chan->ld_completed);
 	chan->idle = true;
+#ifdef CONFIG_PM
+	chan->pm_state = RUNNING;
+#endif
 
 	chan->common.device = &fdev->common;
 	dma_cookie_init(&chan->common);
@@ -1450,6 +1461,91 @@ static int fsldma_of_remove(struct platform_device *op)
 	return 0;
 }
 
+#ifdef CONFIG_PM
+static int fsldma_prepare(struct device *dev)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct fsldma_device *fdev = platform_get_drvdata(pdev);
+	struct fsldma_chan *chan;
+	int i;
+
+	for (i = 0; i < FSL_DMA_MAX_CHANS_PER_DEVICE; i++) {
+		chan = fdev->chan[i];
+		if (!chan)
+			continue;
+
+		spin_lock_bh(&chan->desc_lock);
+		chan->pm_state = SUSPENDING;
+		if (!list_empty(&chan->ld_pending))
+			fsl_chan_xfer_ld_queue(chan);
+		spin_unlock_bh(&chan->desc_lock);
+	}
+
+	return 0;
+}
+
+static int fsldma_suspend(struct device *dev)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct fsldma_device *fdev = platform_get_drvdata(pdev);
+	struct fsldma_chan *chan;
+	int i;
+
+	for (i = 0; i < FSL_DMA_MAX_CHANS_PER_DEVICE; i++) {
+		chan = fdev->chan[i];
+		if (!chan)
+			continue;
+
+		spin_lock_bh(&chan->desc_lock);
+		if (!chan->idle)
+			goto out;
+		chan->regs_save.mr = DMA_IN(chan, &chan->regs->mr, 32);
+		chan->pm_state = SUSPENDED;
+		spin_unlock_bh(&chan->desc_lock);
+	}
+	return 0;
+
+out:
+	for (; i >= 0; i--) {
+		chan = fdev->chan[i];
+		if (!chan)
+			continue;
+		spin_unlock_bh(&chan->desc_lock);
+	}
+	return -EBUSY;
+}
+
+static int fsldma_resume(struct device *dev)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct fsldma_device *fdev = platform_get_drvdata(pdev);
+	struct fsldma_chan *chan;
+	u32 mode;
+	int i;
+
+	for (i = 0; i < FSL_DMA_MAX_CHANS_PER_DEVICE; i++) {
+		chan = fdev->chan[i];
+		if (!chan)
+			continue;
+
+		spin_lock_bh(&chan->desc_lock);
+		mode = chan->regs_save.mr
+			& ~FSL_DMA_MR_CS & ~FSL_DMA_MR_CC & ~FSL_DMA_MR_CA;
+		DMA_OUT(chan, &chan->regs->mr, mode, 32);
+		chan->pm_state = RUNNING;
+		spin_unlock_bh(&chan->desc_lock);
+	}
+
+	return 0;
+}
+
+static const struct dev_pm_ops fsldma_pm_ops = {
+	.prepare	= fsldma_prepare,
+	.suspend	= fsldma_suspend,
+	.resume		= fsldma_resume,
+};
+#endif
+
 static const struct of_device_id fsldma_of_ids[] = {
 	{ .compatible = "fsl,elo3-dma", },
 	{ .compatible = "fsl,eloplus-dma", },
@@ -1462,6 +1558,9 @@ static struct platform_driver fsldma_of_driver = {
 		.name = "fsl-elo-dma",
 		.owner = THIS_MODULE,
 		.of_match_table = fsldma_of_ids,
+#ifdef CONFIG_PM
+		.pm = &fsldma_pm_ops,
+#endif
 	},
 	.probe = fsldma_of_probe,
 	.remove = fsldma_of_remove,
diff --git a/drivers/dma/fsldma.h b/drivers/dma/fsldma.h
index ec19517..eecaf9e 100644
--- a/drivers/dma/fsldma.h
+++ b/drivers/dma/fsldma.h
@@ -134,6 +134,18 @@ struct fsldma_device {
 #define FSL_DMA_CHAN_PAUSE_EXT	0x00001000
 #define FSL_DMA_CHAN_START_EXT	0x00002000
 
+#ifdef CONFIG_PM
+struct fsldma_chan_regs_save {
+	u32 mr;
+};
+
+enum fsldma_pm_state {
+	RUNNING = 0,
+	SUSPENDING,
+	SUSPENDED,
+};
+#endif
+
 struct fsldma_chan {
 	char name[8];			/* Channel name */
 	struct fsldma_chan_regs __iomem *regs;
@@ -161,6 +173,10 @@ struct fsldma_chan {
 	struct tasklet_struct tasklet;
 	u32 feature;
 	bool idle;			/* DMA controller is idle */
+#ifdef CONFIG_PM
+	struct fsldma_chan_regs_save regs_save;
+	enum fsldma_pm_state pm_state;
+#endif
 
 	void (*toggle_ext_pause)(struct fsldma_chan *fsl_chan, int enable);
 	void (*toggle_ext_start)(struct fsldma_chan *fsl_chan, int enable);
-- 
1.7.9.5

^ permalink raw reply related

* Re: [PATCH] power, sched: stop updating inside arch_update_cpu_topology() when nothing to be update
From: Michael wang @ 2014-04-04  3:48 UTC (permalink / raw)
  To: Srivatsa S. Bhat
  Cc: sfr, Srikar Dronamraju, LKML, jlarrew, paulus, alistair, nfont,
	Andrew Morton, rcj, linuxppc-dev
In-Reply-To: <533D20E1.4000008@linux.vnet.ibm.com>

Hi, Srivatsa

Thanks for your reply :)

On 04/03/2014 04:50 PM, Srivatsa S. Bhat wrote:
[snip]
> 
> Now, the interesting thing to note here is that, if CPU0's node was already
> set as node0, *nothing* should go wrong, since its just a redundant update.
> However, if CPU0's original node mapping was something different, or if
> node0 doesn't even exist in the machine, then the system can crash.

By printk I confirmed all cpus was belong to node 1 at very beginning,
and things become magically after the wrong updating...

> 
> Have you verified that CPU0's node mapping is different from node 0?
> That is, boot the kernel with "numa=debug" in the kernel command line and
> it will print out the cpu-to-node associativity during boot. That way you
> can figure out what was the original associativity that was set. This will
> confirm the theory that the hypervisor sent a redundant update, but because
> of the weird pre-allocation using kzalloc that we do inside
> arch_update_cpu_topology(), we wrongly updated CPU0's mapping as CPU0 <-> Node0.

Associativity should changes, otherwise we won't continue the updating,
and empty updates[] was confirmed to show up inside
arch_update_cpu_topology().

What I can't make sure is whether this is legal, notify changes but no
changes happen sounds weird...however, even if it's legal, a check in
here still make sense IMHO.

Regards,
Michael Wang

> 
> 
> Regards,
> Srivatsa S. Bhat
> 
>> Thus we should stop the updating in such cases, this patch will achieve
>> this and fix the issue.
>>
>> CC: Benjamin Herrenschmidt <benh@kernel.crashing.org>
>> CC: Paul Mackerras <paulus@samba.org>
>> CC: Nathan Fontenot <nfont@linux.vnet.ibm.com>
>> CC: Stephen Rothwell <sfr@canb.auug.org.au>
>> CC: Andrew Morton <akpm@linux-foundation.org>
>> CC: Robert Jennings <rcj@linux.vnet.ibm.com>
>> CC: Jesse Larrew <jlarrew@linux.vnet.ibm.com>
>> CC: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com>
>> CC: Alistair Popple <alistair@popple.id.au>
>> Signed-off-by: Michael Wang <wangyun@linux.vnet.ibm.com>
>> ---
>>  arch/powerpc/mm/numa.c |    9 +++++++++
>>  1 file changed, 9 insertions(+)
>>
>> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>> index 30a42e2..6757690 100644
>> --- a/arch/powerpc/mm/numa.c
>> +++ b/arch/powerpc/mm/numa.c
>> @@ -1591,6 +1591,14 @@ int arch_update_cpu_topology(void)
>>  		cpu = cpu_last_thread_sibling(cpu);
>>  	}
>>
>> +	/*
>> +	 * The 'cpu_associativity_changes_mask' could be cleared if
>> +	 * all the cpus it indicates won't change their node, in
>> +	 * which case the 'updated_cpus' will be empty.
>> +	 */
>> +	if (!cpumask_weight(&updated_cpus))
>> +		goto out;
>> +
>>  	stop_machine(update_cpu_topology, &updates[0], &updated_cpus);
>>
>>  	/*
>> @@ -1612,6 +1620,7 @@ int arch_update_cpu_topology(void)
>>  		changed = 1;
>>  	}
>>
>> +out:
>>  	kfree(updates);
>>  	return changed;
>>  }
>>
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 

^ permalink raw reply

* [PATCH] powerpc/powernv: Add debugfs file to grab opalv3 trace data
From: Benjamin Herrenschmidt @ 2014-04-04  5:27 UTC (permalink / raw)
  To: linuxppc-dev list

From: Rusty Russell <rusty@rustcorp.com.au>

This adds files in debugfs that can be used to retrieve the
OPALv3 firmware "live binary traces" which can then be parsed
using a userspace tool.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/platforms/powernv/Makefile           |   2 +-
 arch/powerpc/platforms/powernv/opal-trace-types.h |  58 +++++++
 arch/powerpc/platforms/powernv/opal-trace.c       | 183 ++++++++++++++++++++++
 3 files changed, 242 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/platforms/powernv/opal-trace-types.h
 create mode 100644 arch/powerpc/platforms/powernv/opal-trace.c

diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index f5d4149..e34a28d 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -2,7 +2,7 @@ obj-y			+= setup.o opal-takeover.o opal-wrappers.o opal.o opal-async.o
 obj-y			+= opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o opal-sysparam.o
 obj-y			+= rng.o opal-dump.o opal-elog.o opal-sensor.o opal-msglog.o
 obj-y			+= subcore.o subcore-asm.o
-
+obj-$(CONFIG_DEBUG_FS)	+= opal-trace.o
 obj-$(CONFIG_SMP)	+= smp.o
 obj-$(CONFIG_PCI)	+= pci.o pci-p5ioc2.o pci-ioda.o
 obj-$(CONFIG_EEH)	+= eeh-ioda.o eeh-powernv.o
diff --git a/arch/powerpc/platforms/powernv/opal-trace-types.h b/arch/powerpc/platforms/powernv/opal-trace-types.h
new file mode 100644
index 0000000..e9816d4
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-trace-types.h
@@ -0,0 +1,58 @@
+/* API for kernel to read trace buffer. */
+#ifndef __OPAL_TRACE_TYPES_H
+#define __OPAL_TRACE_TYPES_H
+
+#define TRACE_REPEAT 1
+#define TRACE_OVERFLOW 2
+#define TRACE_OPAL 3
+#define TRACE_FSP 4
+
+/* One per cpu, plus one for NMIs */
+struct tracebuf {
+	/* Mask to apply to get buffer offset. */
+	u64 mask;
+	/* This where the buffer starts. */
+	u64 start;
+	/* This is where writer has written to. */
+	u64 end;
+	/* This is where the writer wrote to previously. */
+	u64 last;
+	/* This is where the reader is up to. */
+	u64 rpos;
+	/* If the last one we read was a repeat, this shows how many. */
+	u32 last_repeat;
+	/* Maximum possible size of a record. */
+	u32 max_size;
+
+	char buf[/* TBUF_SZ + max_size */];
+};
+
+/* Common header for all trace entries. */
+struct trace_hdr {
+	u64 timestamp;
+	u8 type;
+	u8 len_div_8;
+	u16 cpu;
+	u8 unused[4];
+};
+
+/* Note: all other entries must be at least as large as this! */
+struct trace_repeat {
+	u64 timestamp; /* Last repeat happened at this timestamp */
+	u8 type; /* == TRACE_REPEAT */
+	u8 len_div_8;
+	u16 cpu;
+	u16 prev_len;
+	u16 num; /* Starts at 1, ie. 1 repeat, or two traces. */
+	/* Note that the count can be one short, if read races a repeat. */
+};
+
+struct trace_overflow {
+	u64 unused64; /* Timestamp is unused */
+	u8 type; /* == TRACE_OVERFLOW */
+	u8 len_div_8;
+	u8 unused[6]; /* ie. hdr.cpu is indeterminate */
+	u64 bytes_missed;
+};
+
+#endif /* __OPAL_TRACE_TYPES_H */
diff --git a/arch/powerpc/platforms/powernv/opal-trace.c b/arch/powerpc/platforms/powernv/opal-trace.c
new file mode 100644
index 0000000..e445528
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-trace.c
@@ -0,0 +1,183 @@
+/*
+ * Copyright (C) 2013 Rusty Russell, IBM Corporation
+ *
+ * Simple debugfs file firmware_trace to read out OPALv3 trace
+ * ringbuffers.
+ */
+#include <linux/mutex.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/of.h>
+#include <linux/slab.h>
+#include <asm/debug.h>
+
+#include "opal-trace-types.h"
+
+static DEFINE_MUTEX(tracelock);
+static struct tracebuf **tb;
+static size_t num_tb;
+
+/* Maximum possible size of record (since len is 8 bits). */
+union max_trace {
+	struct trace_hdr hdr;
+	struct trace_overflow overflow;
+	struct trace_repeat repeat;
+	char buf[255 * 8];
+};
+static union max_trace trace;
+
+static bool trace_empty(const struct tracebuf *tb)
+{
+	const struct trace_repeat *rep;
+
+	if (tb->rpos == tb->end)
+		return true;
+
+	/*
+	 * If we have a single element only, and it's a repeat buffer
+	 * we've already seen every repeat for (yet which may be
+	 * incremented in future), we're also empty.
+	 */
+	rep = (void *)tb->buf + (tb->rpos & tb->mask);
+	if (tb->end != tb->rpos + sizeof(*rep))
+		return false;
+
+	if (rep->type != TRACE_REPEAT)
+		return false;
+
+	if (rep->num != tb->last_repeat)
+		return false;
+
+	return true;
+}
+
+/* You can't read in parallel, so some locking required in caller. */
+static bool trace_get(union max_trace *t, struct tracebuf *tb)
+{
+	u64 start;
+
+	if (trace_empty(tb))
+		return false;
+
+again:
+	/*
+	 * The actual buffer is slightly larger than tbsize, so this
+	 * memcpy is always valid.
+	 */
+	memcpy(t, tb->buf + (tb->rpos & tb->mask), tb->max_size);
+
+	rmb(); /* read barrier, so we read tb->start after copying record. */
+
+	start = tb->start;
+
+	/* Now, was that overwritten? */
+	if (tb->rpos < start) {
+		/* Create overflow record. */
+		t->overflow.unused64 = 0;
+		t->overflow.type = TRACE_OVERFLOW;
+		t->overflow.len_div_8 = sizeof(t->overflow) / 8;
+		t->overflow.bytes_missed = start - tb->rpos;
+		tb->rpos += t->overflow.bytes_missed;
+		return true;
+	}
+
+	/* Repeat entries need special handling */
+	if (t->hdr.type == TRACE_REPEAT) {
+		u32 num = t->repeat.num;
+
+		/* In case we've read some already... */
+		t->repeat.num -= tb->last_repeat;
+
+		/* Record how many repeats we saw this time. */
+		tb->last_repeat = num;
+
+		/* Don't report an empty repeat buffer. */
+		if (t->repeat.num == 0) {
+			/*
+			 * This can't be the last buffer, otherwise
+			 * trace_empty would have returned true.
+			 */
+			BUG_ON(tb->end <= tb->rpos + t->hdr.len_div_8 * 8);
+			/* Skip to next entry. */
+			tb->rpos += t->hdr.len_div_8 * 8;
+			goto again;
+		}
+	} else {
+		tb->last_repeat = 0;
+		tb->rpos += t->hdr.len_div_8 * 8;
+	}
+
+	return true;
+}
+
+/* Horrible polling interface, designed for dumping. */
+static ssize_t read_opal_trace(struct file *file, char __user *ubuf,
+			       size_t count, loff_t *ppos)
+{
+	ssize_t err;
+	unsigned int i;
+
+	err = mutex_lock_interruptible(&tracelock);
+	if (err)
+		return err;
+
+	for (i = 0; i < num_tb; i++) {
+		if (trace_get(&trace, tb[i])) {
+			size_t len = trace.hdr.len_div_8 * 8;
+			if (len > count)
+				len = count;
+			if (copy_to_user(ubuf, &trace, len) != 0)
+				err = -EFAULT;
+			else
+				err = len;
+			break;
+		}
+	}
+
+	mutex_unlock(&tracelock);
+	return err;
+}
+
+static const struct file_operations opal_trace_fops = {
+	.read =		read_opal_trace,
+	.open =		simple_open,
+};
+
+static int opal_trace_init(void)
+{
+	struct device_node *dn;
+	const u64 *reg;
+	int len, i;
+
+	dn = of_find_node_by_name(NULL, "ibm,trace");
+	if (!dn)
+		return -ENODEV;
+
+	reg = of_get_property(dn, "reg", &len);
+	if (!reg) {
+		pr_warning("%s: OF node property %s::reg not found\n",
+			   __func__, dn->full_name);
+		goto fail;
+	}
+
+	num_tb = len / (sizeof(u64) * 2);
+	if (!num_tb) {
+		pr_warning("%s: OF node property %s::reg invalid length %i\n",
+			   __func__, dn->full_name, len);
+		goto fail;
+	}
+	tb = kmalloc(sizeof(*tb) * num_tb, GFP_KERNEL);
+	for (i = 0; i < num_tb; i++)
+		tb[i] = __va(be64_to_cpu(reg[i*2]));
+
+	debugfs_create_file("opal-trace", 0400, powerpc_debugfs_root,
+			    NULL, &opal_trace_fops);
+	of_node_put(dn);
+	return 0;
+
+fail:
+	of_node_put(dn);
+	return -EINVAL;
+}
+module_init(opal_trace_init);
+

^ permalink raw reply related

* [PATCH 0/7] Build ppc64le kernel using ABIv2, supplemental patches
From: Anton Blanchard @ 2014-04-04  6:09 UTC (permalink / raw)
  To: benh, paulus, rusty, ulrich.weigand, amodra, mikey, mjw, rostedt,
	philippe.bergheaud
  Cc: linuxppc-dev

These patches apply against my last series and fix all known
ABIv2 issues.

To stress the module loader and dynamic ftrace code, I built an
allmodconfig kernel and inserted every module I could. I found a bunch
of bugs in the modules themselves, but in the end I managed to
get quite a few modules to load:

# cat /proc/modules | wc -l
3830

It survived a fair bit of poking (enabling/disabling function tracers
etc).

Anton Blanchard (7):
  powerpc: Add _GLOBAL_TOC for ABIv2 assembly functions exported to
    modules
  powerpc: ftrace_caller, _mcount is exported to modules so needs
    _GLOBAL_TOC()
  powerpc/kprobes: Fix ABIv2 issues with kprobe_lookup_name
  powerpc/modules: Create is_module_trampoline()
  powerpc/modules: Create module_trampoline_target()
  powerpc/ftrace: Use module loader helpers to parse trampolines
  powerpc/ftrace: Fix ABIv2 issues with __ftrace_make_call

 arch/powerpc/include/asm/kprobes.h |   5 +-
 arch/powerpc/include/asm/module.h  |   3 +
 arch/powerpc/include/asm/ppc_asm.h |  12 ++++
 arch/powerpc/kernel/entry_64.S     |   7 +-
 arch/powerpc/kernel/ftrace.c       | 137 +++++++++++--------------------------
 arch/powerpc/kernel/module_64.c    |  80 ++++++++++++++++++++--
 arch/powerpc/lib/copyuser_64.S     |   2 +-
 arch/powerpc/lib/memcpy_64.S       |   2 +-
 8 files changed, 136 insertions(+), 112 deletions(-)

-- 
1.8.3.2

^ permalink raw reply

* [PATCH 1/7] powerpc: Add _GLOBAL_TOC for ABIv2 assembly functions exported to modules
From: Anton Blanchard @ 2014-04-04  6:09 UTC (permalink / raw)
  To: benh, paulus, rusty, ulrich.weigand, amodra, mikey, mjw, rostedt,
	philippe.bergheaud
  Cc: linuxppc-dev
In-Reply-To: <1396591750-8203-1-git-send-email-anton@samba.org>

If an assembly function that calls back into c code is exported to
modules, we need to ensure r2 is setup correctly. There are only
two places crazy enough to do it (two of which are my fault).

Signed-off-by: Anton Blanchard <anton@samba.org>
---
 arch/powerpc/include/asm/ppc_asm.h | 12 ++++++++++++
 arch/powerpc/lib/copyuser_64.S     |  2 +-
 arch/powerpc/lib/memcpy_64.S       |  2 +-
 3 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
index 2cc2511..6400f18 100644
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@@ -207,6 +207,16 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
 	.globl name; \
 name:
 
+#define _GLOBAL_TOC(name) \
+	.section ".text"; \
+	.align 2 ; \
+	.type name,@function; \
+	.globl name; \
+name: \
+0:	addis r2,r12,(.TOC.-0b)@ha; \
+	addi r2,r2,(.TOC.-0b)@l; \
+	.localentry name,.-name
+
 #define _KPROBE(name) \
 	.section ".kprobes.text","a"; \
 	.align 2 ; \
@@ -235,6 +245,8 @@ name: \
 	.type GLUE(.,name),@function; \
 GLUE(.,name):
 
+#define _GLOBAL_TOC(name) _GLOBAL(name)
+
 #define _KPROBE(name) \
 	.section ".kprobes.text","a"; \
 	.align 2 ; \
diff --git a/arch/powerpc/lib/copyuser_64.S b/arch/powerpc/lib/copyuser_64.S
index 596a285..0860ee4 100644
--- a/arch/powerpc/lib/copyuser_64.S
+++ b/arch/powerpc/lib/copyuser_64.S
@@ -18,7 +18,7 @@
 #endif
 
 	.align	7
-_GLOBAL(__copy_tofrom_user)
+_GLOBAL_TOC(__copy_tofrom_user)
 BEGIN_FTR_SECTION
 	nop
 FTR_SECTION_ELSE
diff --git a/arch/powerpc/lib/memcpy_64.S b/arch/powerpc/lib/memcpy_64.S
index 9d3960c..bc9a2ca 100644
--- a/arch/powerpc/lib/memcpy_64.S
+++ b/arch/powerpc/lib/memcpy_64.S
@@ -10,7 +10,7 @@
 #include <asm/ppc_asm.h>
 
 	.align	7
-_GLOBAL(memcpy)
+_GLOBAL_TOC(memcpy)
 BEGIN_FTR_SECTION
 	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* save destination pointer for return value */
 FTR_SECTION_ELSE
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH 2/7] powerpc: ftrace_caller, _mcount is exported to modules so needs _GLOBAL_TOC()
From: Anton Blanchard @ 2014-04-04  6:09 UTC (permalink / raw)
  To: benh, paulus, rusty, ulrich.weigand, amodra, mikey, mjw, rostedt,
	philippe.bergheaud
  Cc: linuxppc-dev
In-Reply-To: <1396591750-8203-1-git-send-email-anton@samba.org>

When testing the ftrace function tracer, I realised that ftrace_caller
and mcount are called from modules and they both call into C, therefore
they need the ABIv2 global entry point to establish r2.

Signed-off-by: Anton Blanchard <anton@samba.org>
---
 arch/powerpc/kernel/entry_64.S | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index cf4f6e6..9fde8a1 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -1175,7 +1175,7 @@ _GLOBAL(mcount)
 _GLOBAL(_mcount)
 	blr
 
-_GLOBAL(ftrace_caller)
+_GLOBAL_TOC(ftrace_caller)
 	/* Taken from output of objdump from lib64/glibc */
 	mflr	r3
 	ld	r11, 0(r1)
@@ -1199,10 +1199,7 @@ _GLOBAL(ftrace_graph_stub)
 _GLOBAL(ftrace_stub)
 	blr
 #else
-_GLOBAL(mcount)
-	blr
-
-_GLOBAL(_mcount)
+_GLOBAL_TOC(_mcount)
 	/* Taken from output of objdump from lib64/glibc */
 	mflr	r3
 	ld	r11, 0(r1)
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH 3/7] powerpc/kprobes: Fix ABIv2 issues with kprobe_lookup_name
From: Anton Blanchard @ 2014-04-04  6:09 UTC (permalink / raw)
  To: benh, paulus, rusty, ulrich.weigand, amodra, mikey, mjw, rostedt,
	philippe.bergheaud
  Cc: linuxppc-dev
In-Reply-To: <1396591750-8203-1-git-send-email-anton@samba.org>

Use ppc_function_entry in places where we previously assumed
function descriptors exist.

Signed-off-by: Anton Blanchard <anton@samba.org>
---
 arch/powerpc/include/asm/kprobes.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/kprobes.h b/arch/powerpc/include/asm/kprobes.h
index 7b6feab..af15d4d 100644
--- a/arch/powerpc/include/asm/kprobes.h
+++ b/arch/powerpc/include/asm/kprobes.h
@@ -30,6 +30,7 @@
 #include <linux/ptrace.h>
 #include <linux/percpu.h>
 #include <asm/probes.h>
+#include <asm/code-patching.h>
 
 #define  __ARCH_WANT_KPROBES_INSN_SLOT
 
@@ -56,9 +57,9 @@ typedef ppc_opcode_t kprobe_opcode_t;
 		if ((colon = strchr(name, ':')) != NULL) {		\
 			colon++;					\
 			if (*colon != '\0' && *colon != '.')		\
-				addr = *(kprobe_opcode_t **)addr;	\
+				addr = (kprobe_opcode_t *)ppc_function_entry(addr); \
 		} else if (name[0] != '.')				\
-			addr = *(kprobe_opcode_t **)addr;		\
+			addr = (kprobe_opcode_t *)ppc_function_entry(addr); \
 	} else {							\
 		char dot_name[KSYM_NAME_LEN];				\
 		dot_name[0] = '.';					\
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH 4/7] powerpc/modules: Create is_module_trampoline()
From: Anton Blanchard @ 2014-04-04  6:09 UTC (permalink / raw)
  To: benh, paulus, rusty, ulrich.weigand, amodra, mikey, mjw, rostedt,
	philippe.bergheaud
  Cc: linuxppc-dev
In-Reply-To: <1396591750-8203-1-git-send-email-anton@samba.org>

ftrace has way too much knowledge of our kernel module trampoline
layout hidden inside it. Create is_module_trampoline() that can
abstract this away inside the module loader code.

Signed-off-by: Anton Blanchard <anton@samba.org>
---
 arch/powerpc/include/asm/module.h |  1 +
 arch/powerpc/kernel/module_64.c   | 51 +++++++++++++++++++++++++++++++++++----
 2 files changed, 47 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/module.h b/arch/powerpc/include/asm/module.h
index c9c7aaa..f2711f0 100644
--- a/arch/powerpc/include/asm/module.h
+++ b/arch/powerpc/include/asm/module.h
@@ -78,6 +78,7 @@ struct mod_arch_specific {
 #    endif	/* MODULE */
 #endif
 
+bool is_module_trampoline(u32 *insns);
 
 struct exception_table_entry;
 void sort_ex_table(struct exception_table_entry *start,
diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c
index 0423601..4db5ecd 100644
--- a/arch/powerpc/kernel/module_64.c
+++ b/arch/powerpc/kernel/module_64.c
@@ -22,6 +22,7 @@
 #include <linux/vmalloc.h>
 #include <linux/ftrace.h>
 #include <linux/bug.h>
+#include <linux/uaccess.h>
 #include <asm/module.h>
 #include <asm/firmware.h>
 #include <asm/code-patching.h>
@@ -102,7 +103,9 @@ static unsigned int local_entry_offset(const Elf64_Sym *sym)
    jump, actually, to reset r2 (TOC+0x8000). */
 struct ppc64_stub_entry
 {
-	/* 28 byte jump instruction sequence (7 instructions) */
+	/* 28 byte jump instruction sequence (7 instructions). We only
+	 * need 6 instructions on ABIv2 but we always allocate 7 so
+	 * so we don't have to modify the trampoline load instruction. */
 	u32 jump[7];
 	u32 unused;
 	/* Data for the above code */
@@ -122,8 +125,8 @@ struct ppc64_stub_entry
  * end of the stub code, and patch the stub address (32-bits relative
  * to the TOC ptr, r2) into the stub.
  */
-static struct ppc64_stub_entry ppc64_stub =
-{ .jump = {
+
+static u32 ppc64_stub_insns[] = {
 	0x3d620000,			/* addis   r11,r2, <high> */
 	0x396b0000,			/* addi    r11,r11, <low> */
 	/* Save current r2 value in magic place on the stack. */
@@ -135,7 +138,45 @@ static struct ppc64_stub_entry ppc64_stub =
 #endif
 	0x7d8903a6,			/* mtctr   r12 */
 	0x4e800420			/* bctr */
-} };
+};
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+static u32 ppc64_stub_mask[] = {
+	0xffff0000,
+	0xffff0000,
+	0xffffffff,
+	0xffffffff,
+#if !defined(_CALL_ELF) || _CALL_ELF != 2
+	0xffffffff,
+#endif
+	0xffffffff,
+	0xffffffff
+};
+
+bool is_module_trampoline(u32 *p)
+{
+	unsigned int i;
+	u32 insns[ARRAY_SIZE(ppc64_stub_insns)];
+
+	BUILD_BUG_ON(sizeof(ppc64_stub_insns) != sizeof(ppc64_stub_mask));
+
+	if (probe_kernel_read(insns, p, sizeof(insns)))
+		return -EFAULT;
+
+	for (i = 0; i < ARRAY_SIZE(ppc64_stub_insns); i++) {
+		u32 insna = insns[i];
+		u32 insnb = ppc64_stub_insns[i];
+		u32 mask = ppc64_stub_mask[i];
+
+		if ((insna & mask) != (insnb & mask))
+			return false;
+	}
+
+	return true;
+}
+
+#endif
 
 /* Count how many different 24-bit relocations (different symbol,
    different addend) */
@@ -350,7 +391,7 @@ static inline int create_stub(Elf64_Shdr *sechdrs,
 {
 	long reladdr;
 
-	*entry = ppc64_stub;
+	memcpy(entry->jump, ppc64_stub_insns, sizeof(ppc64_stub_insns));
 
 	/* Stub uses address relative to r2. */
 	reladdr = (unsigned long)entry - my_r2(sechdrs, me);
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH 5/7] powerpc/modules: Create module_trampoline_target()
From: Anton Blanchard @ 2014-04-04  6:09 UTC (permalink / raw)
  To: benh, paulus, rusty, ulrich.weigand, amodra, mikey, mjw, rostedt,
	philippe.bergheaud
  Cc: linuxppc-dev
In-Reply-To: <1396591750-8203-1-git-send-email-anton@samba.org>

ftrace has way too much knowledge of our kernel module trampoline
layout hidden inside it. Create module_trampoline_target() that gives
the target address of a kernel module trampoline.

Signed-off-by: Anton Blanchard <anton@samba.org>
---
 arch/powerpc/include/asm/module.h |  2 ++
 arch/powerpc/kernel/module_64.c   | 29 +++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/arch/powerpc/include/asm/module.h b/arch/powerpc/include/asm/module.h
index f2711f0..dcfcad1 100644
--- a/arch/powerpc/include/asm/module.h
+++ b/arch/powerpc/include/asm/module.h
@@ -79,6 +79,8 @@ struct mod_arch_specific {
 #endif
 
 bool is_module_trampoline(u32 *insns);
+int module_trampoline_target(struct module *mod, u32 *trampoline,
+			     unsigned long *target);
 
 struct exception_table_entry;
 void sort_ex_table(struct exception_table_entry *start,
diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c
index 4db5ecd..ef349d0 100644
--- a/arch/powerpc/kernel/module_64.c
+++ b/arch/powerpc/kernel/module_64.c
@@ -176,6 +176,35 @@ bool is_module_trampoline(u32 *p)
 	return true;
 }
 
+int module_trampoline_target(struct module *mod, u32 *trampoline,
+			     unsigned long *target)
+{
+	u32 buf[2];
+	u16 upper, lower;
+	long offset;
+	void *toc_entry;
+
+	if (probe_kernel_read(buf, trampoline, sizeof(buf)))
+		return -EFAULT;
+
+	upper = buf[0] & 0xffff;
+	lower = buf[1] & 0xffff;
+
+	/* perform the addis/addi, both signed */
+	offset = ((short)upper << 16) + (short)lower;
+
+	/*
+	 * Now get the address this trampoline jumps to. This
+	 * is always 32 bytes into our trampoline stub.
+	 */
+	toc_entry = (void *)mod->arch.toc + offset + 32;
+
+	if (probe_kernel_read(target, toc_entry, sizeof(*target)))
+		return -EFAULT;
+
+	return 0;
+}
+
 #endif
 
 /* Count how many different 24-bit relocations (different symbol,
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH 6/7] powerpc/ftrace: Use module loader helpers to parse trampolines
From: Anton Blanchard @ 2014-04-04  6:09 UTC (permalink / raw)
  To: benh, paulus, rusty, ulrich.weigand, amodra, mikey, mjw, rostedt,
	philippe.bergheaud
  Cc: linuxppc-dev
In-Reply-To: <1396591750-8203-1-git-send-email-anton@samba.org>

Now we have is_module_trampoline() and module_trampoline_target()
we can remove a bunch of intimate kernel module trampoline
knowledge from ftrace.

Signed-off-by: Anton Blanchard <anton@samba.org>
---
 arch/powerpc/kernel/ftrace.c | 97 +++++++++-----------------------------------
 1 file changed, 20 insertions(+), 77 deletions(-)

diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c
index b0ded97..b68e0ef 100644
--- a/arch/powerpc/kernel/ftrace.c
+++ b/arch/powerpc/kernel/ftrace.c
@@ -105,11 +105,9 @@ __ftrace_make_nop(struct module *mod,
 		  struct dyn_ftrace *rec, unsigned long addr)
 {
 	unsigned int op;
-	unsigned int jmp[5];
 	unsigned long ptr;
 	unsigned long ip = rec->ip;
-	unsigned long tramp;
-	int offset;
+	void *tramp;
 
 	/* read where this goes */
 	if (probe_kernel_read(&op, (void *)ip, sizeof(int)))
@@ -122,96 +120,41 @@ __ftrace_make_nop(struct module *mod,
 	}
 
 	/* lets find where the pointer goes */
-	tramp = find_bl_target(ip, op);
-
-	/*
-	 * On PPC64 the trampoline looks like:
-	 * 0x3d, 0x82, 0x00, 0x00,    addis   r12,r2, <high>
-	 * 0x39, 0x8c, 0x00, 0x00,    addi    r12,r12, <low>
-	 *   Where the bytes 2,3,6 and 7 make up the 32bit offset
-	 *   to the TOC that holds the pointer.
-	 *   to jump to.
-	 * 0xf8, 0x41, 0x00, 0x28,    std     r2,40(r1)
-	 * 0xe9, 0x6c, 0x00, 0x20,    ld      r11,32(r12)
-	 *   The actually address is 32 bytes from the offset
-	 *   into the TOC.
-	 * 0xe8, 0x4c, 0x00, 0x28,    ld      r2,40(r12)
-	 */
+	tramp = (void *)find_bl_target(ip, op);
 
-	pr_devel("ip:%lx jumps to %lx r2: %lx", ip, tramp, mod->arch.toc);
+	pr_devel("ip:%lx jumps to %p", ip, tramp);
 
-	/* Find where the trampoline jumps to */
-	if (probe_kernel_read(jmp, (void *)tramp, sizeof(jmp))) {
-		printk(KERN_ERR "Failed to read %lx\n", tramp);
-		return -EFAULT;
-	}
-
-	pr_devel(" %08x %08x", jmp[0], jmp[1]);
-
-	/* verify that this is what we expect it to be */
-	if (((jmp[0] & 0xffff0000) != 0x3d820000) ||
-	    ((jmp[1] & 0xffff0000) != 0x398c0000) ||
-	    (jmp[2] != 0xf8410028) ||
-	    (jmp[3] != 0xe96c0020) ||
-	    (jmp[4] != 0xe84c0028)) {
+	if (!is_module_trampoline(tramp)) {
 		printk(KERN_ERR "Not a trampoline\n");
 		return -EINVAL;
 	}
 
-	/* The bottom half is signed extended */
-	offset = ((unsigned)((unsigned short)jmp[0]) << 16) +
-		(int)((short)jmp[1]);
-
-	pr_devel(" %x ", offset);
-
-	/* get the address this jumps too */
-	tramp = mod->arch.toc + offset + 32;
-	pr_devel("toc: %lx", tramp);
-
-	if (probe_kernel_read(jmp, (void *)tramp, 8)) {
-		printk(KERN_ERR "Failed to read %lx\n", tramp);
+	if (module_trampoline_target(mod, tramp, &ptr)) {
+		printk(KERN_ERR "Failed to get trampoline target\n");
 		return -EFAULT;
 	}
 
-	pr_devel(" %08x %08x\n", jmp[0], jmp[1]);
-
-#ifdef __LITTLE_ENDIAN__
-	ptr = ((unsigned long)jmp[1] << 32) + jmp[0];
-#else
-	ptr = ((unsigned long)jmp[0] << 32) + jmp[1];
-#endif
+	pr_devel("trampoline target %lx", ptr);
 
 	/* This should match what was called */
 	if (ptr != ppc_function_entry((void *)addr)) {
-		printk(KERN_ERR "addr does not match %lx\n", ptr);
+		printk(KERN_ERR "addr %lx does not match expected %lx\n",
+			ptr, ppc_function_entry((void *)addr));
 		return -EINVAL;
 	}
 
 	/*
-	 * We want to nop the line, but the next line is
-	 *  0xe8, 0x41, 0x00, 0x28   ld r2,40(r1)
-	 * This needs to be turned to a nop too.
-	 */
-	if (probe_kernel_read(&op, (void *)(ip+4), MCOUNT_INSN_SIZE))
-		return -EFAULT;
-
-	if (op != 0xe8410028) {
-		printk(KERN_ERR "Next line is not ld! (%08x)\n", op);
-		return -EINVAL;
-	}
-
-	/*
-	 * Milton Miller pointed out that we can not blindly do nops.
-	 * If a task was preempted when calling a trace function,
-	 * the nops will remove the way to restore the TOC in r2
-	 * and the r2 TOC will get corrupted.
-	 */
-
-	/*
-	 * Replace:
-	 *   bl <tramp>  <==== will be replaced with "b 1f"
-	 *   ld r2,40(r1)
-	 *  1:
+	 * Our original call site looks like:
+	 *
+	 * bl <tramp>
+	 * ld r2,XX(r1)
+	 *
+	 * Milton Miller pointed out that we can not simply nop the branch.
+	 * If a task was preempted when calling a trace function, the nops
+	 * will remove the way to restore the TOC in r2 and the r2 TOC will
+	 * get corrupted.
+	 *
+	 * Use a b +8 to jump over the load.
 	 */
 	op = 0x48000008;	/* b +8 */
 
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH 7/7] powerpc/ftrace: Fix ABIv2 issues with __ftrace_make_call
From: Anton Blanchard @ 2014-04-04  6:09 UTC (permalink / raw)
  To: benh, paulus, rusty, ulrich.weigand, amodra, mikey, mjw, rostedt,
	philippe.bergheaud
  Cc: linuxppc-dev
In-Reply-To: <1396591750-8203-1-git-send-email-anton@samba.org>

__ftrace_make_call assumed ABIv1 TOC stack offsets, so it
broke on ABIv2.

While we are here, we can simplify the instruction modification
code. Since we always update one instruction there is no need to
probe_kernel_write and flush_icache_range, just use patch_branch.

Signed-off-by: Anton Blanchard <anton@samba.org>
---
 arch/powerpc/kernel/ftrace.c | 40 +++++++++++++++++++---------------------
 1 file changed, 19 insertions(+), 21 deletions(-)

diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c
index b68e0ef..6ab510d 100644
--- a/arch/powerpc/kernel/ftrace.c
+++ b/arch/powerpc/kernel/ftrace.c
@@ -292,19 +292,24 @@ static int
 __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
 {
 	unsigned int op[2];
-	unsigned long ip = rec->ip;
+	void *ip = (void *)rec->ip;
 
 	/* read where this goes */
-	if (probe_kernel_read(op, (void *)ip, MCOUNT_INSN_SIZE * 2))
+	if (probe_kernel_read(op, ip, sizeof(op)))
 		return -EFAULT;
 
 	/*
-	 * It should be pointing to two nops or
-	 *  b +8; ld r2,40(r1)
+	 * We expect to see:
+	 *
+	 * b +8
+	 * ld r2,XX(r1)
+	 *
+	 * The load offset is different depending on the ABI. For simplicity
+	 * just mask it out when doing the compare.
 	 */
-	if (((op[0] != 0x48000008) || (op[1] != 0xe8410028)) &&
-	    ((op[0] != PPC_INST_NOP) || (op[1] != PPC_INST_NOP))) {
-		printk(KERN_ERR "Expected NOPs but have %x %x\n", op[0], op[1]);
+	if ((op[0] != 0x48000008) || ((op[1] & 0xffff00000) != 0xe8410000)) {
+		printk(KERN_ERR "Unexpected call sequence: %x %x\n",
+			op[0], op[1]);
 		return -EINVAL;
 	}
 
@@ -314,23 +319,16 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
 		return -EINVAL;
 	}
 
-	/* create the branch to the trampoline */
-	op[0] = create_branch((unsigned int *)ip,
-			      rec->arch.mod->arch.tramp, BRANCH_SET_LINK);
-	if (!op[0]) {
-		printk(KERN_ERR "REL24 out of range!\n");
+	/* Ensure branch is within 24 bits */
+	if (create_branch(ip, rec->arch.mod->arch.tramp, BRANCH_SET_LINK)) {
+		printk(KERN_ERR "Branch out of range");
 		return -EINVAL;
 	}
 
-	/* ld r2,40(r1) */
-	op[1] = 0xe8410028;
-
-	pr_devel("write to %lx\n", rec->ip);
-
-	if (probe_kernel_write((void *)ip, op, MCOUNT_INSN_SIZE * 2))
-		return -EPERM;
-
-	flush_icache_range(ip, ip + 8);
+	if (patch_branch(ip, rec->arch.mod->arch.tramp, BRANCH_SET_LINK)) {
+		printk(KERN_ERR "REL24 out of range!\n");
+		return -EINVAL;
+	}
 
 	return 0;
 }
-- 
1.8.3.2

^ permalink raw reply related

* Re: [PATCH 0/7] Build ppc64le kernel using ABIv2, supplemental patches
From: Michael Neuling @ 2014-04-04  6:19 UTC (permalink / raw)
  To: Anton Blanchard
  Cc: philippe.bergheaud, amodra, rusty, rostedt, ulrich.weigand, mjw,
	paulus, linuxppc-dev
In-Reply-To: <1396591750-8203-1-git-send-email-anton@samba.org>

Anton Blanchard <anton@samba.org> wrote:

> These patches apply against my last series and fix all known
> ABIv2 issues.
> 
> To stress the module loader and dynamic ftrace code, I built an
> allmodconfig kernel and inserted every module I could. I found a bunch
> of bugs in the modules themselves, but in the end I managed to
> get quite a few modules to load:
> 
> # cat /proc/modules | wc -l
> 3830

>From http://antonblanchardfacts.com/?270 :

  Anton Blanchard's machine always has at least 3800 modules installed.

Mikey

^ permalink raw reply

* [PATCH V2 0/2] FAULT_AROUND_ORDER patchset performance data for powerpc
From: Madhavan Srinivasan @ 2014-04-04  6:27 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev, linux-mm, linux-arch, x86
  Cc: riel, ak, peterz, rusty, Madhavan Srinivasan, paulus, mgorman,
	akpm, mingo, kirill.shutemov

Kirill A. Shutemov with faultaround patchset introduced
vm_ops->map_pages() for mapping easy accessible pages around
fault address in hope to reduce number of minor page faults.

This patchset creates infrastructure to move the FAULT_AROUND_ORDER
to arch/ using Kconfig. This will enable architecture maintainers
to decide on suitable FAULT_AROUND_ORDER value based on
performance data for that architecture. First patch also adds
FAULT_AROUND_ORDER Kconfig element for X86. Second patch list
out the performance numbers for powerpc (platform pseries) and
adds FAULT_AROUND_ORDER Kconfig element for powerpc.

V2 Changes:
  Created Kconfig parameter for FAULT_AROUND_ORDER
  Added check in do_read_fault to handle FAULT_AROUND_ORDER value of 0
  Made changes in commit messages.

Madhavan Srinivasan (2):
  mm: move FAULT_AROUND_ORDER to arch/
  mm: add FAULT_AROUND_ORDER Kconfig paramater for powerpc

 arch/powerpc/platforms/pseries/Kconfig |    5 +++++
 arch/x86/Kconfig                       |    4 ++++
 include/linux/mm.h                     |    9 +++++++++
 mm/memory.c                            |   12 +++++-------
 4 files changed, 23 insertions(+), 7 deletions(-)

-- 
1.7.10.4

^ permalink raw reply

* [PATCH V2 2/2] mm: add FAULT_AROUND_ORDER Kconfig paramater for powerpc
From: Madhavan Srinivasan @ 2014-04-04  6:27 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev, linux-mm, linux-arch, x86
  Cc: riel, ak, peterz, rusty, Madhavan Srinivasan, paulus, mgorman,
	akpm, mingo, kirill.shutemov
In-Reply-To: <1396592835-24767-1-git-send-email-maddy@linux.vnet.ibm.com>

Performance data for different FAULT_AROUND_ORDER values from 4 socket
Power7 system (128 Threads and 128GB memory) is below. perf stat with
repeat of 5 is used to get the stddev values. This patch create
FAULT_AROUND_ORDER Kconfig parameter and defaults it to 3 based on the
performance data.

FAULT_AROUND_ORDER      Baseline        1               3               4               5               7

Linux build (make -j64)
minor-faults            7184385         5874015         4567289         4318518         4193815         4159193
times in seconds        61.433776136    60.865935292    59.245368038    60.630675011    60.56587624     59.828271924
 stddev for time	( +-  1.18% )	( +-  1.78% )	( +-  0.44% )	( +-  2.03% )	( +-  1.66% )	( +-  1.45% )

Linux rebuild (make -j64)
minor-faults            303018          226392          146170          132480          126878          126236
times in seconds        5.659819172     5.723996942     5.591238319     5.622533357     5.878811995     5.550133096
 stddev for time	( +-  0.71% )	( +-  0.78% )	( +-  0.62% )	( +-  0.45% )	( +-  1.55% )	( +-  0.29% )

Two synthetic tests: access every word in file in sequential/random order.
Marginal Performance gains seen for FAO value of 3 when compared to value
of 4.

Sequential access 16GiB file
FAULT_AROUND_ORDER      Baseline        1               3               4               5               7
1 thread
       minor-faults     262302          131192          32873           16486           8291            2351
       times in seconds 53.071497352    52.945826882    52.931417302    52.928577184    52.859285439    53.116800539
       stddev for time	( +-  0.01% )	( +-  0.02% )	( +-  0.02% )	( +-  0.04% )	( +-  0.04% )	( +-  0.01% )
8 threads
       minor-faults     2097314         1051046         263336          131715          66098           16653
       times in seconds 54.385698561    54.603652339    54.771282004    54.488565674    54.496701531    54.962142189
       stddev for time	( +-  0.05% )	( +-  0.02% )	( +-  0.37% )	( +-  0.08% )	( +-  0.07% )	( +-  0.51% )
32 threads
       minor-faults     8389267         4218595         1059961         531319          266463          67271
       times in seconds 60.61715047     60.827964038    60.46412673     60.266045885    60.492398315    60.24531921
       stddev for time	( +-  0.65% )	( +-  0.21% )	( +-  0.25% )	( +-  0.29% )	( +-  0.19% )	( +-  0.35% )
64 threads
       minor-faults     16777455        8485998         2178582         1092106         544302          137693
       times in seconds 86.471334554    84.412415735    85.208303832    84.331473392    85.598793479    84.695469266
       stddev for time	( +-  0.60% )	( +-  1.47% )	( +-  0.74% )	( +-  1.55% )	( +-  0.92% )	( +-  1.16% )
128 threads
       minor-faults     33555267        17734522        4710107         2380821         1182707         292077
       times in seconds 117.535385569   114.291359037   112.593908276   113.081807611   114.358686588   114.491043011
       stddev for time	( +-  2.24% )	( +-  0.92% )	( +-  0.36% )	( +-  0.53% )	( +-  0.70% )	( +-  0.53% )

Random access 1GiB file
FAULT_AROUND_ORDER      Baseline        1               3               4               5               7
1 thread
       minor-faults     16503           8664            2149            1126            610             437
       times in seconds 43.843573808    48.042069805    50.580779682    54.282884593    52.641739876    51.803302129
       stddev for time	( +-  1.30% )	( +-  2.25% )	( +-  2.92% )	( +-  1.44% )	( +-  4.49% )	( +-  3.78% )
8 threads
       minor-faults     131201          70916           17760           8665            4250            1149
       times in seconds 46.262626804    55.942851041    56.629191584    57.97044714     55.417557594    56.019709166
       stddev for time	( +-  4.66% )	( +-  1.52% )	( +-  1.43% )	( +-  1.61% )	( +-  0.65% )	( +-  1.27% )
32 threads
       minor-faults     524959          265980          67282           33601           16930           4316
       times in seconds 67.754175928    69.85012331     71.750338061    71.053074643    68.90728294     71.250103217
       stddev for time	( +-  3.79% )	( +-  0.77% )	( +-  1.15% )	( +-  1.08% )	( +-  2.14% )	( +-  1.17% )
64 threads
       minor-faults     1048831         528829          133256          66700           33428           8776
       times in seconds 96.674025305    93.109961822    87.441777715    91.986332028    88.686748472    93.101434306
       stddev for time	( +-  2.85% )	( +-  2.42% )	( +-  0.42% )	( +-  1.58% )	( +-  1.29% )	( +-  2.01% )
128 threads
       minor-faults     2098043         1053224         266271          133702          66966           17276
       times in seconds 156.525792044   152.117971403   147.523673243   148.560226602   148.596575663   149.389288429
       stddev for time	( +-  2.39% )	( +-  0.71% )	( +-  0.72% )	( +-  0.35% )	( +-  0.41% )	( +-  1.04% )

Worst case scenario: we touch one page every 16M to demonstrate overhead.

Touch only one page in page table in 16GiB file
FAULT_AROUND_ORDER      Baseline        1               3               4               5               7
1 thread
       minor-faults     1077            1064            1051            1048            1046            1045
       times in seconds 0.00615347      0.008327379     0.019775282     0.034444003     0.05905971      0.220863339
       stddev for time	( +-  3.12% )	( +-  2.29% )	( +-  1.68% )	( +-  1.40% )	( +-  1.68% )	( +-  1.53% )
8 threads
       minor-faults     8252            8239            8226            8223            8220            8224
       times in seconds 0.04387392      0.059859294     0.113897648     0.199707764     0.361585762     1.343366843
       stddev for time	( +-  3.66% )	( +-  2.18% )	( +-  1.44% )	( +-  2.40% )	( +-  2.08% )	( +-  1.75% )
32 threads
       minor-faults     32852           32841           32825           32826           32824           32828
       times in seconds 0.191404544     0.21907773      0.433207123     0.72430447      1.334983196     4.97727449
       stddev for time	( +-  5.08% )	( +-  3.19% )	( +-  4.45% )	( +-  2.18% )	( +-  2.75% )	( +-  1.52% )
64 threads
       minor-faults     65652           65642           65629           65622           65623           65634
       times in seconds 0.402140429     0.510806718     0.854288645     1.412329805     2.556707704     8.711074863
       stddev for time	( +-  2.90% )	( +-  3.04% )	( +-  1.83% )	( +-  2.75% )	( +-  2.49% )	( +-  0.68% )
128 threads
       minor-faults     131255          131239          131228          131228          131229          131243
       times in seconds 0.817782148     1.124631348     2.023730928     3.184792382     5.331392072     17.309524609
       stddev for time	( +-  3.35% )	( +- 11.74% )	( +-  4.55% )	( +-  2.00% )	( +-  1.31% )	( +-  1.30% )

Signed-off-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/pseries/Kconfig |    5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index 2cb8b77..2246d9f 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -132,3 +132,8 @@ config DTL
 	  which are accessible through a debugfs file.
 
 	  Say N if you are unsure.
+
+config FAULT_AROUND_ORDER
+	int
+	default "3"
+
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH V2 1/2] mm: move FAULT_AROUND_ORDER to arch/
From: Madhavan Srinivasan @ 2014-04-04  6:27 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev, linux-mm, linux-arch, x86
  Cc: riel, ak, peterz, rusty, Madhavan Srinivasan, paulus, mgorman,
	akpm, mingo, kirill.shutemov
In-Reply-To: <1396592835-24767-1-git-send-email-maddy@linux.vnet.ibm.com>

Kirill A. Shutemov with faultaround patchset introduced
vm_ops->map_pages() for mapping easy accessible pages around
fault address in hope to reduce number of minor page faults.

This patch creates infrastructure to move the FAULT_AROUND_ORDER
to arch/ using Kconfig. This will enable architecture maintainers
to decide on suitable FAULT_AROUND_ORDER value based on
performance data for that architecture. Patch also adds
FAULT_AROUND_ORDER Kconfig element in arch/X86.

Signed-off-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
---
 arch/x86/Kconfig   |    4 ++++
 include/linux/mm.h |    9 +++++++++
 mm/memory.c        |   12 +++++-------
 3 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 9c0a657..5833f22 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1177,6 +1177,10 @@ config DIRECT_GBPAGES
 	  support it. This can improve the kernel's performance a tiny bit by
 	  reducing TLB pressure. If in doubt, say "Y".
 
+config FAULT_AROUND_ORDER
+	int
+	default "4"
+
 # Common NUMA Features
 config NUMA
 	bool "Numa Memory Allocation and Scheduler Support"
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0bd4359..b93c1c3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -26,6 +26,15 @@ struct file_ra_state;
 struct user_struct;
 struct writeback_control;
 
+/*
+ * Fault around order is a control knob to decide the fault around pages.
+ * Default value is set to 0UL (disabled), but the arch can override it as
+ * desired.
+ */
+#ifndef CONFIG_FAULT_AROUND_ORDER
+#define CONFIG_FAULT_AROUND_ORDER 0
+#endif
+
 #ifndef CONFIG_NEED_MULTIPLE_NODES	/* Don't use mapnrs, do it properly */
 extern unsigned long max_mapnr;
 
diff --git a/mm/memory.c b/mm/memory.c
index b02c584..22a4a89 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3358,10 +3358,8 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address,
 	update_mmu_cache(vma, address, pte);
 }
 
-#define FAULT_AROUND_ORDER 4
-
 #ifdef CONFIG_DEBUG_FS
-static unsigned int fault_around_order = FAULT_AROUND_ORDER;
+static unsigned int fault_around_order = CONFIG_FAULT_AROUND_ORDER;
 
 static int fault_around_order_get(void *data, u64 *val)
 {
@@ -3371,7 +3369,7 @@ static int fault_around_order_get(void *data, u64 *val)
 
 static int fault_around_order_set(void *data, u64 val)
 {
-	BUILD_BUG_ON((1UL << FAULT_AROUND_ORDER) > PTRS_PER_PTE);
+	BUILD_BUG_ON((1UL << CONFIG_FAULT_AROUND_ORDER) > PTRS_PER_PTE);
 	if (1UL << val > PTRS_PER_PTE)
 		return -EINVAL;
 	fault_around_order = val;
@@ -3406,14 +3404,14 @@ static inline unsigned long fault_around_pages(void)
 {
 	unsigned long nr_pages;
 
-	nr_pages = 1UL << FAULT_AROUND_ORDER;
+	nr_pages = 1UL << CONFIG_FAULT_AROUND_ORDER;
 	BUILD_BUG_ON(nr_pages > PTRS_PER_PTE);
 	return nr_pages;
 }
 
 static inline unsigned long fault_around_mask(void)
 {
-	return ~((1UL << (PAGE_SHIFT + FAULT_AROUND_ORDER)) - 1);
+	return ~((1UL << (PAGE_SHIFT + CONFIG_FAULT_AROUND_ORDER)) - 1);
 }
 #endif
 
@@ -3471,7 +3469,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * if page by the offset is not ready to be mapped (cold cache or
 	 * something).
 	 */
-	if (vma->vm_ops->map_pages) {
+	if ((vma->vm_ops->map_pages) && (fault_around_pages() > 1)) {
 		pte = pte_offset_map_lock(mm, pmd, address, &ptl);
 		do_fault_around(vma, address, pte, pgoff, flags);
 		if (!pte_same(*pte, orig_pte))
-- 
1.7.10.4

^ permalink raw reply related

* Re: [PATCH V2 2/2] mm: add FAULT_AROUND_ORDER Kconfig paramater for powerpc
From: Ingo Molnar @ 2014-04-04  7:02 UTC (permalink / raw)
  To: Madhavan Srinivasan
  Cc: linux-arch, riel, rusty, peterz, x86, linux-kernel, linux-mm, ak,
	paulus, mgorman, akpm, linuxppc-dev, kirill.shutemov
In-Reply-To: <1396592835-24767-3-git-send-email-maddy@linux.vnet.ibm.com>

* Madhavan Srinivasan <maddy@linux.vnet.ibm.com> wrote:

> Performance data for different FAULT_AROUND_ORDER values from 4 socket
> Power7 system (128 Threads and 128GB memory) is below. perf stat with
> repeat of 5 is used to get the stddev values. This patch create
> FAULT_AROUND_ORDER Kconfig parameter and defaults it to 3 based on the
> performance data.
> 
> FAULT_AROUND_ORDER      Baseline        1               3               4               5               7
> 
> Linux build (make -j64)
> minor-faults            7184385         5874015         4567289         4318518         4193815         4159193
> times in seconds        61.433776136    60.865935292    59.245368038    60.630675011    60.56587624     59.828271924
>  stddev for time	( +-  1.18% )	( +-  1.78% )	( +-  0.44% )	( +-  2.03% )	( +-  1.66% )	( +-  1.45% )

Ok, this is better, but it is still rather incomplete statistically, 
please also calculate the percentage difference to baseline, so that 
the stddev becomes meaningful and can be compared to something!

As an example I did this for the first line of measurements (all 
errors in the numbers are mine, this was done manually), and it gives:

>  stddev for time   ( +-  1.18% ) ( +-  1.78% ) ( +-  0.44% ) ( +-  2.03% ) ( +-  1.66% ) ( +-  1.45% )
                                        +0.9%         +3.5%         +1.3%         +1.4%         +2.6%

This shows that there is probably a statistically significant 
(positiv) effect from the change, but from these numbers alone I would 
not draw any quantitative (sizing, tuning) conclusions, because in 3 
out of 5 cases the stddev was larger than the effect, so the resulting 
percentages are not comparable.

Please do this calculation for all the other lines as well and also 
close all the numbers with a conclusion section where you *analyze* 
the results, outline the statistics and compare the various workloads 
and how the tuning affects them and don't force the readers of the 
commit guess what it all means and how significant it all is!

Thanks,

	Ingo

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox