Linux RAID subsystem development

Linux RAID subsystem development
 help / color / mirror / Atom feed

* [PATCH v4 3/4] dmaengine: Add Broadcom SBA RAID driver
From: Anup Patel @ 2017-02-14  6:51 UTC (permalink / raw)
  To: Vinod Koul, Rob Herring, Mark Rutland, Herbert Xu,
	David S . Miller, Jassi Brar
  Cc: Dan Williams, Ray Jui, Scott Branden, Jon Mason, Rob Rice,
	bcm-kernel-feedback-list, dmaengine, devicetree, linux-arm-kernel,
	linux-kernel, linux-crypto, linux-raid, Anup Patel
In-Reply-To: <1487055112-5185-1-git-send-email-anup.patel@broadcom.com>

The Broadcom stream buffer accelerator (SBA) provides offloading
capabilities for RAID operations. This SBA offload engine is
accessible via Broadcom SoC specific ring manager.

This patch adds Broadcom SBA RAID driver which provides one
DMA device with RAID capabilities using one or more Broadcom
SoC specific ring manager channels. The SBA RAID driver in its
current shape implements memcpy, xor, and pq operations.

Signed-off-by: Anup Patel <anup.patel@broadcom.com>
Reviewed-by: Ray Jui <ray.jui@broadcom.com>
---
 drivers/dma/Kconfig        |   13 +
 drivers/dma/Makefile       |    1 +
 drivers/dma/bcm-sba-raid.c | 1694 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 1708 insertions(+)
 create mode 100644 drivers/dma/bcm-sba-raid.c

diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 263495d..bf8fb84 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -99,6 +99,19 @@ config AXI_DMAC
 	  controller is often used in Analog Device's reference designs for FPGA
 	  platforms.
 
+config BCM_SBA_RAID
+	tristate "Broadcom SBA RAID engine support"
+	depends on (ARM64 && MAILBOX && RAID6_PQ) || COMPILE_TEST
+	select DMA_ENGINE
+	select DMA_ENGINE_RAID
+	select ASYNC_TX_ENABLE_CHANNEL_SWITCH
+	default ARCH_BCM_IPROC
+	help
+	  Enable support for Broadcom SBA RAID Engine. The SBA RAID
+	  engine is available on most of the Broadcom iProc SoCs. It
+	  has the capability to offload memcpy, xor and pq computation
+	  for raid5/6.
+
 config COH901318
 	bool "ST-Ericsson COH901318 DMA support"
 	select DMA_ENGINE
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index a4fa336..ba96bdd 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_AMCC_PPC440SPE_ADMA) += ppc4xx/
 obj-$(CONFIG_AT_HDMAC) += at_hdmac.o
 obj-$(CONFIG_AT_XDMAC) += at_xdmac.o
 obj-$(CONFIG_AXI_DMAC) += dma-axi-dmac.o
+obj-$(CONFIG_BCM_SBA_RAID) += bcm-sba-raid.o
 obj-$(CONFIG_COH901318) += coh901318.o coh901318_lli.o
 obj-$(CONFIG_DMA_BCM2835) += bcm2835-dma.o
 obj-$(CONFIG_DMA_JZ4740) += dma-jz4740.o
diff --git a/drivers/dma/bcm-sba-raid.c b/drivers/dma/bcm-sba-raid.c
new file mode 100644
index 0000000..279e5e2
--- /dev/null
+++ b/drivers/dma/bcm-sba-raid.c
@@ -0,0 +1,1694 @@
+/*
+ * Copyright (C) 2017 Broadcom
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * Broadcom SBA RAID Driver
+ *
+ * The Broadcom stream buffer accelerator (SBA) provides offloading
+ * capabilities for RAID operations. The SBA offload engine is accessible
+ * via Broadcom SoC specific ring manager. Two or more offload engines
+ * can share same Broadcom SoC specific ring manager due to this Broadcom
+ * SoC specific ring manager driver is implemented as a mailbox controller
+ * driver and offload engine drivers are implemented as mallbox clients.
+ *
+ * Typically, Broadcom SoC specific ring manager will implement larger
+ * number of hardware rings over one or more SBA hardware devices. By
+ * design, the internal buffer size of SBA hardware device is limited
+ * but all offload operations supported by SBA can be broken down into
+ * multiple small size requests and executed parallely on multiple SBA
+ * hardware devices for achieving high through-put.
+ *
+ * The Broadcom SBA RAID driver does not require any register programming
+ * except submitting request to SBA hardware device via mailbox channels.
+ * This driver implements a DMA device with one DMA channel using a set
+ * of mailbox channels provided by Broadcom SoC specific ring manager
+ * driver. To exploit parallelism (as described above), all DMA request
+ * coming to SBA RAID DMA channel are broken down to smaller requests
+ * and submitted to multiple mailbox channels in round-robin fashion.
+ * For having more SBA DMA channels, we can create more SBA device nodes
+ * in Broadcom SoC specific DTS based on number of hardware rings supported
+ * by Broadcom SoC ring manager.
+ */
+
+#include <linux/bitops.h>
+#include <linux/dma-mapping.h>
+#include <linux/dmaengine.h>
+#include <linux/list.h>
+#include <linux/mailbox_client.h>
+#include <linux/mailbox/brcm-message.h>
+#include <linux/module.h>
+#include <linux/of_device.h>
+#include <linux/slab.h>
+#include <linux/raid/pq.h>
+
+#include "dmaengine.h"
+
+/* SBA command related defines */
+#define SBA_TYPE_SHIFT					48
+#define SBA_TYPE_MASK					GENMASK(1, 0)
+#define SBA_TYPE_A					0x0
+#define SBA_TYPE_B					0x2
+#define SBA_TYPE_C					0x3
+#define SBA_USER_DEF_SHIFT				32
+#define SBA_USER_DEF_MASK				GENMASK(15, 0)
+#define SBA_R_MDATA_SHIFT				24
+#define SBA_R_MDATA_MASK				GENMASK(7, 0)
+#define SBA_C_MDATA_MS_SHIFT				18
+#define SBA_C_MDATA_MS_MASK				GENMASK(1, 0)
+#define SBA_INT_SHIFT					17
+#define SBA_INT_MASK					BIT(0)
+#define SBA_RESP_SHIFT					16
+#define SBA_RESP_MASK					BIT(0)
+#define SBA_C_MDATA_SHIFT				8
+#define SBA_C_MDATA_MASK				GENMASK(7, 0)
+#define SBA_C_MDATA_BNUMx_SHIFT(__bnum)			(2 * (__bnum))
+#define SBA_C_MDATA_BNUMx_MASK				GENMASK(1, 0)
+#define SBA_C_MDATA_DNUM_SHIFT				5
+#define SBA_C_MDATA_DNUM_MASK				GENMASK(4, 0)
+#define SBA_C_MDATA_LS(__v)				((__v) & 0xff)
+#define SBA_C_MDATA_MS(__v)				(((__v) >> 8) & 0x3)
+#define SBA_CMD_SHIFT					0
+#define SBA_CMD_MASK					GENMASK(3, 0)
+#define SBA_CMD_ZERO_BUFFER				0x4
+#define SBA_CMD_ZERO_ALL_BUFFERS			0x8
+#define SBA_CMD_LOAD_BUFFER				0x9
+#define SBA_CMD_XOR					0xa
+#define SBA_CMD_GALOIS_XOR				0xb
+#define SBA_CMD_WRITE_BUFFER				0xc
+#define SBA_CMD_GALOIS					0xe
+
+/* Driver helper macros */
+#define to_sba_request(tx)		\
+	container_of(tx, struct sba_request, tx)
+#define to_sba_device(dchan)		\
+	container_of(dchan, struct sba_device, dma_chan)
+
+enum sba_request_state {
+	SBA_REQUEST_STATE_FREE = 1,
+	SBA_REQUEST_STATE_ALLOCED = 2,
+	SBA_REQUEST_STATE_PENDING = 3,
+	SBA_REQUEST_STATE_ACTIVE = 4,
+	SBA_REQUEST_STATE_RECEIVED = 5,
+	SBA_REQUEST_STATE_COMPLETED = 6,
+	SBA_REQUEST_STATE_ABORTED = 7,
+};
+
+struct sba_request {
+	/* Global state */
+	struct list_head node;
+	struct sba_device *sba;
+	enum sba_request_state state;
+	bool fence;
+	/* Chained requests management */
+	struct sba_request *first;
+	struct list_head next;
+	unsigned int next_count;
+	atomic_t next_pending_count;
+	/* BRCM message data */
+	void *resp;
+	dma_addr_t resp_dma;
+	struct brcm_sba_command *cmds;
+	struct brcm_message msg;
+	struct dma_async_tx_descriptor tx;
+};
+
+enum sba_version {
+	SBA_VER_1 = 0,
+	SBA_VER_2
+};
+
+struct sba_device {
+	/* Underlying device */
+	struct device *dev;
+	/* DT configuration parameters */
+	enum sba_version ver;
+	/* Derived configuration parameters */
+	u32 max_req;
+	u32 hw_buf_size;
+	u32 hw_resp_size;
+	u32 max_pq_coefs;
+	u32 max_pq_srcs;
+	u32 max_cmd_per_req;
+	u32 max_xor_srcs;
+	u32 max_resp_pool_size;
+	u32 max_cmds_pool_size;
+	/* Maibox client and Mailbox channels */
+	struct mbox_client client;
+	int mchans_count;
+	atomic_t mchans_current;
+	struct mbox_chan **mchans;
+	struct device *mbox_dev;
+	/* DMA device and DMA channel */
+	struct dma_device dma_dev;
+	struct dma_chan dma_chan;
+	/* DMA channel resources */
+	void *resp_base;
+	dma_addr_t resp_dma_base;
+	void *cmds_base;
+	dma_addr_t cmds_dma_base;
+	spinlock_t reqs_lock;
+	struct sba_request *reqs;
+	bool reqs_fence;
+	struct list_head reqs_alloc_list;
+	struct list_head reqs_pending_list;
+	struct list_head reqs_active_list;
+	struct list_head reqs_received_list;
+	struct list_head reqs_completed_list;
+	struct list_head reqs_aborted_list;
+	struct list_head reqs_free_list;
+	int reqs_free_count;
+};
+
+/* ====== SBA command helper routines ===== */
+
+static inline u64 __pure sba_cmd_enc(u64 cmd, u32 val, u32 shift, u32 mask)
+{
+	cmd &= ~((u64)mask << shift);
+	cmd |= ((u64)(val & mask) << shift);
+	return cmd;
+}
+
+static inline u32 __pure sba_cmd_load_c_mdata(u32 b0)
+{
+	return b0 & SBA_C_MDATA_BNUMx_MASK;
+}
+
+static inline u32 __pure sba_cmd_write_c_mdata(u32 b0)
+{
+	return b0 & SBA_C_MDATA_BNUMx_MASK;
+}
+
+static inline u32 __pure sba_cmd_xor_c_mdata(u32 b1, u32 b0)
+{
+	return (b0 & SBA_C_MDATA_BNUMx_MASK) |
+	       ((b1 & SBA_C_MDATA_BNUMx_MASK) << SBA_C_MDATA_BNUMx_SHIFT(1));
+}
+
+static inline u32 __pure sba_cmd_pq_c_mdata(u32 d, u32 b1, u32 b0)
+{
+	return (b0 & SBA_C_MDATA_BNUMx_MASK) |
+	       ((b1 & SBA_C_MDATA_BNUMx_MASK) << SBA_C_MDATA_BNUMx_SHIFT(1)) |
+	       ((d & SBA_C_MDATA_DNUM_MASK) << SBA_C_MDATA_DNUM_SHIFT);
+}
+
+/* ====== Channel resource management routines ===== */
+
+static struct sba_request *sba_alloc_request(struct sba_device *sba)
+{
+	unsigned long flags;
+	struct sba_request *req = NULL;
+
+	spin_lock_irqsave(&sba->reqs_lock, flags);
+
+	req = list_first_entry_or_null(&sba->reqs_free_list,
+				       struct sba_request, node);
+	if (req) {
+		list_move_tail(&req->node, &sba->reqs_alloc_list);
+		req->state = SBA_REQUEST_STATE_ALLOCED;
+		req->fence = false;
+		req->first = req;
+		INIT_LIST_HEAD(&req->next);
+		req->next_count = 1;
+		atomic_set(&req->next_pending_count, 1);
+
+		sba->reqs_free_count--;
+
+		dma_async_tx_descriptor_init(&req->tx, &sba->dma_chan);
+	}
+
+	spin_unlock_irqrestore(&sba->reqs_lock, flags);
+
+	return req;
+}
+
+/* Note: Must be called with sba->reqs_lock held */
+static void _sba_pending_request(struct sba_device *sba,
+				 struct sba_request *req)
+{
+	lockdep_assert_held(&sba->reqs_lock);
+	req->state = SBA_REQUEST_STATE_PENDING;
+	list_move_tail(&req->node, &sba->reqs_pending_list);
+	if (list_empty(&sba->reqs_active_list))
+		sba->reqs_fence = false;
+}
+
+/* Note: Must be called with sba->reqs_lock held */
+static bool _sba_active_request(struct sba_device *sba,
+				struct sba_request *req)
+{
+	lockdep_assert_held(&sba->reqs_lock);
+	if (list_empty(&sba->reqs_active_list))
+		sba->reqs_fence = false;
+	if (sba->reqs_fence)
+		return false;
+	req->state = SBA_REQUEST_STATE_ACTIVE;
+	list_move_tail(&req->node, &sba->reqs_active_list);
+	if (req->fence)
+		sba->reqs_fence = true;
+	return true;
+}
+
+/* Note: Must be called with sba->reqs_lock held */
+static void _sba_abort_request(struct sba_device *sba,
+			       struct sba_request *req)
+{
+	lockdep_assert_held(&sba->reqs_lock);
+	req->state = SBA_REQUEST_STATE_ABORTED;
+	list_move_tail(&req->node, &sba->reqs_aborted_list);
+	if (list_empty(&sba->reqs_active_list))
+		sba->reqs_fence = false;
+}
+
+/* Note: Must be called with sba->reqs_lock held */
+static void _sba_free_request(struct sba_device *sba,
+			      struct sba_request *req)
+{
+	lockdep_assert_held(&sba->reqs_lock);
+	req->state = SBA_REQUEST_STATE_FREE;
+	list_move_tail(&req->node, &sba->reqs_free_list);
+	if (list_empty(&sba->reqs_active_list))
+		sba->reqs_fence = false;
+	sba->reqs_free_count++;
+}
+
+static void sba_received_request(struct sba_request *req)
+{
+	unsigned long flags;
+	struct sba_device *sba = req->sba;
+
+	spin_lock_irqsave(&sba->reqs_lock, flags);
+	req->state = SBA_REQUEST_STATE_RECEIVED;
+	list_move_tail(&req->node, &sba->reqs_received_list);
+	spin_unlock_irqrestore(&sba->reqs_lock, flags);
+}
+
+static void sba_complete_chained_requests(struct sba_request *req)
+{
+	unsigned long flags;
+	struct sba_request *nreq;
+	struct sba_device *sba = req->sba;
+
+	spin_lock_irqsave(&sba->reqs_lock, flags);
+
+	req->state = SBA_REQUEST_STATE_COMPLETED;
+	list_move_tail(&req->node, &sba->reqs_completed_list);
+	list_for_each_entry(nreq, &req->next, next) {
+		nreq->state = SBA_REQUEST_STATE_COMPLETED;
+		list_move_tail(&nreq->node, &sba->reqs_completed_list);
+	}
+	if (list_empty(&sba->reqs_active_list))
+		sba->reqs_fence = false;
+
+	spin_unlock_irqrestore(&sba->reqs_lock, flags);
+}
+
+static void sba_free_chained_requests(struct sba_request *req)
+{
+	unsigned long flags;
+	struct sba_request *nreq;
+	struct sba_device *sba = req->sba;
+
+	spin_lock_irqsave(&sba->reqs_lock, flags);
+
+	_sba_free_request(sba, req);
+	list_for_each_entry(nreq, &req->next, next)
+		_sba_free_request(sba, nreq);
+
+	spin_unlock_irqrestore(&sba->reqs_lock, flags);
+}
+
+static void sba_chain_request(struct sba_request *first,
+			      struct sba_request *req)
+{
+	unsigned long flags;
+	struct sba_device *sba = req->sba;
+
+	spin_lock_irqsave(&sba->reqs_lock, flags);
+
+	list_add_tail(&req->next, &first->next);
+	req->first = first;
+	first->next_count++;
+	atomic_set(&first->next_pending_count, first->next_count);
+
+	spin_unlock_irqrestore(&sba->reqs_lock, flags);
+}
+
+static void sba_cleanup_nonpending_requests(struct sba_device *sba)
+{
+	unsigned long flags;
+	struct sba_request *req, *req1;
+
+	spin_lock_irqsave(&sba->reqs_lock, flags);
+
+	/* Freeup all alloced request */
+	list_for_each_entry_safe(req, req1, &sba->reqs_alloc_list, node)
+		_sba_free_request(sba, req);
+
+	/* Freeup all received request */
+	list_for_each_entry_safe(req, req1, &sba->reqs_received_list, node)
+		_sba_free_request(sba, req);
+
+	/* Freeup all completed request */
+	list_for_each_entry_safe(req, req1, &sba->reqs_completed_list, node)
+		_sba_free_request(sba, req);
+
+	/* Set all active requests as aborted */
+	list_for_each_entry_safe(req, req1, &sba->reqs_active_list, node)
+		_sba_abort_request(sba, req);
+
+	/*
+	 * Note: We expect that aborted request will be eventually
+	 * freed by sba_receive_message()
+	 */
+
+	spin_unlock_irqrestore(&sba->reqs_lock, flags);
+}
+
+static void sba_cleanup_pending_requests(struct sba_device *sba)
+{
+	unsigned long flags;
+	struct sba_request *req, *req1;
+
+	spin_lock_irqsave(&sba->reqs_lock, flags);
+
+	/* Freeup all pending request */
+	list_for_each_entry_safe(req, req1, &sba->reqs_pending_list, node)
+		_sba_free_request(sba, req);
+
+	spin_unlock_irqrestore(&sba->reqs_lock, flags);
+}
+
+/* ====== DMAENGINE callbacks ===== */
+
+static void sba_free_chan_resources(struct dma_chan *dchan)
+{
+	/*
+	 * Channel resources are pre-alloced so we just free-up
+	 * whatever we can so that we can re-use pre-alloced
+	 * channel resources next time.
+	 */
+	sba_cleanup_nonpending_requests(to_sba_device(dchan));
+}
+
+static int sba_device_terminate_all(struct dma_chan *dchan)
+{
+	/* Cleanup all pending requests */
+	sba_cleanup_pending_requests(to_sba_device(dchan));
+
+	return 0;
+}
+
+static int sba_send_mbox_request(struct sba_device *sba,
+				 struct sba_request *req)
+{
+	int mchans_idx, ret = 0;
+
+	/* Select mailbox channel in round-robin fashion */
+	mchans_idx = atomic_inc_return(&sba->mchans_current);
+	mchans_idx = mchans_idx % sba->mchans_count;
+
+	/* Send message for the request */
+	req->msg.error = 0;
+	ret = mbox_send_message(sba->mchans[mchans_idx], &req->msg);
+	if (ret < 0) {
+		dev_err(sba->dev, "send message failed with error %d", ret);
+		return ret;
+	}
+	ret = req->msg.error;
+	if (ret < 0) {
+		dev_err(sba->dev, "message error %d", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+static void sba_issue_pending(struct dma_chan *dchan)
+{
+	int ret;
+	unsigned long flags;
+	struct sba_request *req, *req1;
+	struct sba_device *sba = to_sba_device(dchan);
+
+	spin_lock_irqsave(&sba->reqs_lock, flags);
+
+	/* Process all pending request */
+	list_for_each_entry_safe(req, req1, &sba->reqs_pending_list, node) {
+		/* Try to make request active */
+		if (!_sba_active_request(sba, req))
+			break;
+
+		/* Send request to mailbox channel */
+		spin_unlock_irqrestore(&sba->reqs_lock, flags);
+		ret = sba_send_mbox_request(sba, req);
+		spin_lock_irqsave(&sba->reqs_lock, flags);
+
+		/* If something went wrong then keep request pending */
+		if (ret < 0) {
+			_sba_pending_request(sba, req);
+			break;
+		}
+	}
+
+	spin_unlock_irqrestore(&sba->reqs_lock, flags);
+}
+
+static dma_cookie_t sba_tx_submit(struct dma_async_tx_descriptor *tx)
+{
+	unsigned long flags;
+	dma_cookie_t cookie;
+	struct sba_device *sba;
+	struct sba_request *req, *nreq;
+
+	if (unlikely(!tx))
+		return -EINVAL;
+
+	sba = to_sba_device(tx->chan);
+	req = to_sba_request(tx);
+
+	/* Assign cookie and mark all chained requests pending */
+	spin_lock_irqsave(&sba->reqs_lock, flags);
+	cookie = dma_cookie_assign(tx);
+	_sba_pending_request(sba, req);
+	list_for_each_entry(nreq, &req->next, next)
+		_sba_pending_request(sba, nreq);
+	spin_unlock_irqrestore(&sba->reqs_lock, flags);
+
+	return cookie;
+}
+
+static enum dma_status sba_tx_status(struct dma_chan *dchan,
+				     dma_cookie_t cookie,
+				     struct dma_tx_state *txstate)
+{
+	int mchan_idx;
+	enum dma_status ret;
+	struct sba_device *sba = to_sba_device(dchan);
+
+	for (mchan_idx = 0; mchan_idx < sba->mchans_count; mchan_idx++)
+		mbox_client_peek_data(sba->mchans[mchan_idx]);
+
+	ret = dma_cookie_status(dchan, cookie, txstate);
+	if (ret == DMA_COMPLETE)
+		return ret;
+
+	return dma_cookie_status(dchan, cookie, txstate);
+}
+
+static void sba_fillup_memcpy_msg(struct sba_request *req,
+				  struct brcm_sba_command *cmds,
+				  struct brcm_message *msg,
+				  dma_addr_t msg_offset, size_t msg_len,
+				  dma_addr_t dst, dma_addr_t src)
+{
+	u64 cmd;
+	u32 c_mdata;
+	struct brcm_sba_command *cmdsp = cmds;
+
+	/* Type-B command to load data into buf0 */
+	cmd = sba_cmd_enc(0x0, SBA_TYPE_B,
+			  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+	cmd = sba_cmd_enc(cmd, msg_len,
+			  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+	c_mdata = sba_cmd_load_c_mdata(0);
+	cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+			  SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+	cmd = sba_cmd_enc(cmd, SBA_CMD_LOAD_BUFFER,
+			  SBA_CMD_SHIFT, SBA_CMD_MASK);
+	cmdsp->cmd = cmd;
+	*cmdsp->cmd_dma = cpu_to_le64(cmd);
+	cmdsp->flags = BRCM_SBA_CMD_TYPE_B;
+	cmdsp->data = src + msg_offset;
+	cmdsp->data_len = msg_len;
+	cmdsp++;
+
+	/* Type-A command to write buf0 */
+	cmd = sba_cmd_enc(0x0, SBA_TYPE_A,
+			  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+	cmd = sba_cmd_enc(cmd, msg_len,
+			  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+	cmd = sba_cmd_enc(cmd, 0x1,
+			  SBA_RESP_SHIFT, SBA_RESP_MASK);
+	c_mdata = sba_cmd_write_c_mdata(0);
+	cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+			  SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+	cmd = sba_cmd_enc(cmd, SBA_CMD_WRITE_BUFFER,
+			  SBA_CMD_SHIFT, SBA_CMD_MASK);
+	cmdsp->cmd = cmd;
+	*cmdsp->cmd_dma = cpu_to_le64(cmd);
+	cmdsp->flags = BRCM_SBA_CMD_TYPE_A;
+	if (req->sba->hw_resp_size) {
+		cmdsp->flags |= BRCM_SBA_CMD_HAS_RESP;
+		cmdsp->resp = req->resp_dma;
+		cmdsp->resp_len = req->sba->hw_resp_size;
+	}
+	cmdsp->flags |= BRCM_SBA_CMD_HAS_OUTPUT;
+	cmdsp->data = dst + msg_offset;
+	cmdsp->data_len = msg_len;
+	cmdsp++;
+
+	/* Fillup brcm_message */
+	msg->type = BRCM_MESSAGE_SBA;
+	msg->sba.cmds = cmds;
+	msg->sba.cmds_count = cmdsp - cmds;
+	msg->ctx = req;
+	msg->error = 0;
+}
+
+static struct sba_request *
+sba_prep_dma_memcpy_req(struct sba_device *sba,
+			dma_addr_t off, dma_addr_t dst, dma_addr_t src,
+			size_t len, unsigned long flags)
+{
+	struct sba_request *req = NULL;
+
+	/* Alloc new request */
+	req = sba_alloc_request(sba);
+	if (!req)
+		return NULL;
+	req->fence = (flags & DMA_PREP_FENCE) ? true : false;
+
+	/* Fillup request message */
+	sba_fillup_memcpy_msg(req, req->cmds, &req->msg,
+			      off, len, dst, src);
+
+	/* Init async_tx descriptor */
+	req->tx.flags = flags;
+	req->tx.cookie = -EBUSY;
+
+	return req;
+}
+
+static struct dma_async_tx_descriptor *
+sba_prep_dma_memcpy(struct dma_chan *dchan, dma_addr_t dst, dma_addr_t src,
+		    size_t len, unsigned long flags)
+{
+	size_t req_len;
+	dma_addr_t off = 0;
+	struct sba_device *sba = to_sba_device(dchan);
+	struct sba_request *first = NULL, *req;
+
+	/* Create chained requests where each request is upto hw_buf_size */
+	while (len) {
+		req_len = (len < sba->hw_buf_size) ? len : sba->hw_buf_size;
+
+		req = sba_prep_dma_memcpy_req(sba, off, dst, src,
+					      req_len, flags);
+		if (!req) {
+			if (first)
+				sba_free_chained_requests(first);
+			return NULL;
+		}
+
+		if (first)
+			sba_chain_request(first, req);
+		else
+			first = req;
+
+		off += req_len;
+		len -= req_len;
+	}
+
+	return (first) ? &first->tx : NULL;
+}
+
+static void sba_fillup_xor_msg(struct sba_request *req,
+				struct brcm_sba_command *cmds,
+				struct brcm_message *msg,
+				dma_addr_t msg_offset, size_t msg_len,
+				dma_addr_t dst, dma_addr_t *src, u32 src_cnt)
+{
+	u64 cmd;
+	u32 c_mdata;
+	unsigned int i;
+	struct brcm_sba_command *cmdsp = cmds;
+
+	/* Type-B command to load data into buf0 */
+	cmd = sba_cmd_enc(0x0, SBA_TYPE_B,
+			  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+	cmd = sba_cmd_enc(cmd, msg_len,
+			  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+	c_mdata = sba_cmd_load_c_mdata(0);
+	cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+			  SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+	cmd = sba_cmd_enc(cmd, SBA_CMD_LOAD_BUFFER,
+			  SBA_CMD_SHIFT, SBA_CMD_MASK);
+	cmdsp->cmd = cmd;
+	*cmdsp->cmd_dma = cpu_to_le64(cmd);
+	cmdsp->flags = BRCM_SBA_CMD_TYPE_B;
+	cmdsp->data = src[0] + msg_offset;
+	cmdsp->data_len = msg_len;
+	cmdsp++;
+
+	/* Type-B commands to xor data with buf0 and put it back in buf0 */
+	for (i = 1; i < src_cnt; i++) {
+		cmd = sba_cmd_enc(0x0, SBA_TYPE_B,
+				  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+		cmd = sba_cmd_enc(cmd, msg_len,
+				  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+		c_mdata = sba_cmd_xor_c_mdata(0, 0);
+		cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+				  SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+		cmd = sba_cmd_enc(cmd, SBA_CMD_XOR,
+				  SBA_CMD_SHIFT, SBA_CMD_MASK);
+		cmdsp->cmd = cmd;
+		*cmdsp->cmd_dma = cpu_to_le64(cmd);
+		cmdsp->flags = BRCM_SBA_CMD_TYPE_B;
+		cmdsp->data = src[i] + msg_offset;
+		cmdsp->data_len = msg_len;
+		cmdsp++;
+	}
+
+	/* Type-A command to write buf0 */
+	cmd = sba_cmd_enc(0x0, SBA_TYPE_A,
+			  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+	cmd = sba_cmd_enc(cmd, msg_len,
+			  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+	cmd = sba_cmd_enc(cmd, 0x1,
+			  SBA_RESP_SHIFT, SBA_RESP_MASK);
+	c_mdata = sba_cmd_write_c_mdata(0);
+	cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+			  SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+	cmd = sba_cmd_enc(cmd, SBA_CMD_WRITE_BUFFER,
+			  SBA_CMD_SHIFT, SBA_CMD_MASK);
+	cmdsp->cmd = cmd;
+	*cmdsp->cmd_dma = cpu_to_le64(cmd);
+	cmdsp->flags = BRCM_SBA_CMD_TYPE_A;
+	if (req->sba->hw_resp_size) {
+		cmdsp->flags |= BRCM_SBA_CMD_HAS_RESP;
+		cmdsp->resp = req->resp_dma;
+		cmdsp->resp_len = req->sba->hw_resp_size;
+	}
+	cmdsp->flags |= BRCM_SBA_CMD_HAS_OUTPUT;
+	cmdsp->data = dst + msg_offset;
+	cmdsp->data_len = msg_len;
+	cmdsp++;
+
+	/* Fillup brcm_message */
+	msg->type = BRCM_MESSAGE_SBA;
+	msg->sba.cmds = cmds;
+	msg->sba.cmds_count = cmdsp - cmds;
+	msg->ctx = req;
+	msg->error = 0;
+}
+
+struct sba_request *
+sba_prep_dma_xor_req(struct sba_device *sba,
+		     dma_addr_t off, dma_addr_t dst, dma_addr_t *src,
+		     u32 src_cnt, size_t len, unsigned long flags)
+{
+	struct sba_request *req = NULL;
+
+	/* Alloc new request */
+	req = sba_alloc_request(sba);
+	if (!req)
+		return NULL;
+	req->fence = (flags & DMA_PREP_FENCE) ? true : false;
+
+	/* Fillup request message */
+	sba_fillup_xor_msg(req, req->cmds, &req->msg,
+			   off, len, dst, src, src_cnt);
+
+	/* Init async_tx descriptor */
+	req->tx.flags = flags;
+	req->tx.cookie = -EBUSY;
+
+	return req;
+}
+
+static struct dma_async_tx_descriptor *
+sba_prep_dma_xor(struct dma_chan *dchan, dma_addr_t dst, dma_addr_t *src,
+		 u32 src_cnt, size_t len, unsigned long flags)
+{
+	size_t req_len;
+	dma_addr_t off = 0;
+	struct sba_device *sba = to_sba_device(dchan);
+	struct sba_request *first = NULL, *req;
+
+	/* Sanity checks */
+	if (unlikely(src_cnt > sba->max_xor_srcs))
+		return NULL;
+
+	/* Create chained requests where each request is upto hw_buf_size */
+	while (len) {
+		req_len = (len < sba->hw_buf_size) ? len : sba->hw_buf_size;
+
+		req = sba_prep_dma_xor_req(sba, off, dst, src, src_cnt,
+					   req_len, flags);
+		if (!req) {
+			if (first)
+				sba_free_chained_requests(first);
+			return NULL;
+		}
+
+		if (first)
+			sba_chain_request(first, req);
+		else
+			first = req;
+
+		off += req_len;
+		len -= req_len;
+	}
+
+	return (first) ? &first->tx : NULL;
+}
+
+static void sba_fillup_pq_msg(struct sba_request *req,
+				bool pq_continue,
+				struct brcm_sba_command *cmds,
+				struct brcm_message *msg,
+				dma_addr_t msg_offset, size_t msg_len,
+				dma_addr_t *dst_p, dma_addr_t *dst_q,
+				const u8 *scf, dma_addr_t *src, u32 src_cnt)
+{
+	u64 cmd;
+	u32 c_mdata;
+	unsigned int i;
+	struct brcm_sba_command *cmdsp = cmds;
+
+	if (pq_continue) {
+		/* Type-B command to load old P into buf0 */
+		if (dst_p) {
+			cmd = sba_cmd_enc(0x0, SBA_TYPE_B,
+				SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+			cmd = sba_cmd_enc(cmd, msg_len,
+				SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+			c_mdata = sba_cmd_load_c_mdata(0);
+			cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+				SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+			cmd = sba_cmd_enc(cmd, SBA_CMD_LOAD_BUFFER,
+				SBA_CMD_SHIFT, SBA_CMD_MASK);
+			cmdsp->cmd = cmd;
+			*cmdsp->cmd_dma = cpu_to_le64(cmd);
+			cmdsp->flags = BRCM_SBA_CMD_TYPE_B;
+			cmdsp->data = *dst_p + msg_offset;
+			cmdsp->data_len = msg_len;
+			cmdsp++;
+		}
+
+		/* Type-B command to load old Q into buf1 */
+		if (dst_q) {
+			cmd = sba_cmd_enc(0x0, SBA_TYPE_B,
+				SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+			cmd = sba_cmd_enc(cmd, msg_len,
+				SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+			c_mdata = sba_cmd_load_c_mdata(1);
+			cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+				SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+			cmd = sba_cmd_enc(cmd, SBA_CMD_LOAD_BUFFER,
+				SBA_CMD_SHIFT, SBA_CMD_MASK);
+			cmdsp->cmd = cmd;
+			*cmdsp->cmd_dma = cpu_to_le64(cmd);
+			cmdsp->flags = BRCM_SBA_CMD_TYPE_B;
+			cmdsp->data = *dst_q + msg_offset;
+			cmdsp->data_len = msg_len;
+			cmdsp++;
+		}
+	} else {
+		/* Type-A command to zero all buffers */
+		cmd = sba_cmd_enc(0x0, SBA_TYPE_A,
+				  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+		cmd = sba_cmd_enc(cmd, msg_len,
+				  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+		cmd = sba_cmd_enc(cmd, SBA_CMD_ZERO_ALL_BUFFERS,
+				  SBA_CMD_SHIFT, SBA_CMD_MASK);
+		cmdsp->cmd = cmd;
+		*cmdsp->cmd_dma = cpu_to_le64(cmd);
+		cmdsp->flags = BRCM_SBA_CMD_TYPE_A;
+		cmdsp++;
+	}
+
+	/* Type-B commands for generate P onto buf0 and Q onto buf1 */
+	for (i = 0; i < src_cnt; i++) {
+		cmd = sba_cmd_enc(0x0, SBA_TYPE_B,
+				  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+		cmd = sba_cmd_enc(cmd, msg_len,
+				  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+		c_mdata = sba_cmd_pq_c_mdata(raid6_gflog[scf[i]], 1, 0);
+		cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+				  SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+		cmd = sba_cmd_enc(cmd, SBA_C_MDATA_MS(c_mdata),
+				  SBA_C_MDATA_MS_SHIFT, SBA_C_MDATA_MS_MASK);
+		cmd = sba_cmd_enc(cmd, SBA_CMD_GALOIS_XOR,
+				  SBA_CMD_SHIFT, SBA_CMD_MASK);
+		cmdsp->cmd = cmd;
+		*cmdsp->cmd_dma = cpu_to_le64(cmd);
+		cmdsp->flags = BRCM_SBA_CMD_TYPE_B;
+		cmdsp->data = src[i] + msg_offset;
+		cmdsp->data_len = msg_len;
+		cmdsp++;
+	}
+
+	/* Type-A command to write buf0 */
+	if (dst_p) {
+		cmd = sba_cmd_enc(0x0, SBA_TYPE_A,
+				  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+		cmd = sba_cmd_enc(cmd, msg_len,
+				  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+		cmd = sba_cmd_enc(cmd, 0x1,
+				  SBA_RESP_SHIFT, SBA_RESP_MASK);
+		c_mdata = sba_cmd_write_c_mdata(0);
+		cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+				  SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+		cmd = sba_cmd_enc(cmd, SBA_CMD_WRITE_BUFFER,
+				  SBA_CMD_SHIFT, SBA_CMD_MASK);
+		cmdsp->cmd = cmd;
+		*cmdsp->cmd_dma = cpu_to_le64(cmd);
+		cmdsp->flags = BRCM_SBA_CMD_TYPE_A;
+		if (req->sba->hw_resp_size) {
+			cmdsp->flags |= BRCM_SBA_CMD_HAS_RESP;
+			cmdsp->resp = req->resp_dma;
+			cmdsp->resp_len = req->sba->hw_resp_size;
+		}
+		cmdsp->flags |= BRCM_SBA_CMD_HAS_OUTPUT;
+		cmdsp->data = *dst_p + msg_offset;
+		cmdsp->data_len = msg_len;
+		cmdsp++;
+	}
+
+	/* Type-A command to write buf1 */
+	if (dst_q) {
+		cmd = sba_cmd_enc(0x0, SBA_TYPE_A,
+				  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+		cmd = sba_cmd_enc(cmd, msg_len,
+				  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+		cmd = sba_cmd_enc(cmd, 0x1,
+				  SBA_RESP_SHIFT, SBA_RESP_MASK);
+		c_mdata = sba_cmd_write_c_mdata(1);
+		cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+				  SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+		cmd = sba_cmd_enc(cmd, SBA_CMD_WRITE_BUFFER,
+				  SBA_CMD_SHIFT, SBA_CMD_MASK);
+		cmdsp->cmd = cmd;
+		*cmdsp->cmd_dma = cpu_to_le64(cmd);
+		cmdsp->flags = BRCM_SBA_CMD_TYPE_A;
+		if (req->sba->hw_resp_size) {
+			cmdsp->flags |= BRCM_SBA_CMD_HAS_RESP;
+			cmdsp->resp = req->resp_dma;
+			cmdsp->resp_len = req->sba->hw_resp_size;
+		}
+		cmdsp->flags |= BRCM_SBA_CMD_HAS_OUTPUT;
+		cmdsp->data = *dst_q + msg_offset;
+		cmdsp->data_len = msg_len;
+		cmdsp++;
+	}
+
+	/* Fillup brcm_message */
+	msg->type = BRCM_MESSAGE_SBA;
+	msg->sba.cmds = cmds;
+	msg->sba.cmds_count = cmdsp - cmds;
+	msg->ctx = req;
+	msg->error = 0;
+}
+
+struct sba_request *
+sba_prep_dma_pq_req(struct sba_device *sba, dma_addr_t off,
+		    dma_addr_t *dst_p, dma_addr_t *dst_q, dma_addr_t *src,
+		    u32 src_cnt, const u8 *scf, size_t len, unsigned long flags)
+{
+	struct sba_request *req = NULL;
+
+	/* Alloc new request */
+	req = sba_alloc_request(sba);
+	if (!req)
+		return NULL;
+	req->fence = (flags & DMA_PREP_FENCE) ? true : false;
+
+	/* Fillup request messages */
+	sba_fillup_pq_msg(req, dmaf_continue(flags),
+			  req->cmds, &req->msg,
+			  off, len, dst_p, dst_q, scf, src, src_cnt);
+
+	/* Init async_tx descriptor */
+	req->tx.flags = flags;
+	req->tx.cookie = -EBUSY;
+
+	return req;
+}
+
+static void sba_fillup_pq_single_msg(struct sba_request *req,
+				bool pq_continue,
+				struct brcm_sba_command *cmds,
+				struct brcm_message *msg,
+				dma_addr_t msg_offset, size_t msg_len,
+				dma_addr_t *dst_p, dma_addr_t *dst_q,
+				dma_addr_t src, u8 scf)
+{
+	u64 cmd;
+	u32 c_mdata;
+	u8 pos, dpos = raid6_gflog[scf];
+	struct brcm_sba_command *cmdsp = cmds;
+
+	if (!dst_p)
+		goto skip_p;
+
+	if (pq_continue) {
+		/* Type-B command to load old P into buf0 */
+		cmd = sba_cmd_enc(0x0, SBA_TYPE_B,
+				  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+		cmd = sba_cmd_enc(cmd, msg_len,
+				  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+		c_mdata = sba_cmd_load_c_mdata(0);
+		cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+				  SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+		cmd = sba_cmd_enc(cmd, SBA_CMD_LOAD_BUFFER,
+				  SBA_CMD_SHIFT, SBA_CMD_MASK);
+		cmdsp->cmd = cmd;
+		*cmdsp->cmd_dma = cpu_to_le64(cmd);
+		cmdsp->flags = BRCM_SBA_CMD_TYPE_B;
+		cmdsp->data = *dst_p + msg_offset;
+		cmdsp->data_len = msg_len;
+		cmdsp++;
+
+		/*
+		 * Type-B commands to xor data with buf0 and put it
+		 * back in buf0
+		 */
+		cmd = sba_cmd_enc(0x0, SBA_TYPE_B,
+				  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+		cmd = sba_cmd_enc(cmd, msg_len,
+				  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+		c_mdata = sba_cmd_xor_c_mdata(0, 0);
+		cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+				  SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+		cmd = sba_cmd_enc(cmd, SBA_CMD_XOR,
+				  SBA_CMD_SHIFT, SBA_CMD_MASK);
+		cmdsp->cmd = cmd;
+		*cmdsp->cmd_dma = cpu_to_le64(cmd);
+		cmdsp->flags = BRCM_SBA_CMD_TYPE_B;
+		cmdsp->data = src + msg_offset;
+		cmdsp->data_len = msg_len;
+		cmdsp++;
+	} else {
+		/* Type-B command to load old P into buf0 */
+		cmd = sba_cmd_enc(0x0, SBA_TYPE_B,
+				  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+		cmd = sba_cmd_enc(cmd, msg_len,
+				  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+		c_mdata = sba_cmd_load_c_mdata(0);
+		cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+				  SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+		cmd = sba_cmd_enc(cmd, SBA_CMD_LOAD_BUFFER,
+				  SBA_CMD_SHIFT, SBA_CMD_MASK);
+		cmdsp->cmd = cmd;
+		*cmdsp->cmd_dma = cpu_to_le64(cmd);
+		cmdsp->flags = BRCM_SBA_CMD_TYPE_B;
+		cmdsp->data = src + msg_offset;
+		cmdsp->data_len = msg_len;
+		cmdsp++;
+	}
+
+	/* Type-A command to write buf0 */
+	cmd = sba_cmd_enc(0x0, SBA_TYPE_A,
+			  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+	cmd = sba_cmd_enc(cmd, msg_len,
+			  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+	cmd = sba_cmd_enc(cmd, 0x1,
+			  SBA_RESP_SHIFT, SBA_RESP_MASK);
+	c_mdata = sba_cmd_write_c_mdata(0);
+	cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+			  SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+	cmd = sba_cmd_enc(cmd, SBA_CMD_WRITE_BUFFER,
+			  SBA_CMD_SHIFT, SBA_CMD_MASK);
+	cmdsp->cmd = cmd;
+	*cmdsp->cmd_dma = cpu_to_le64(cmd);
+	cmdsp->flags = BRCM_SBA_CMD_TYPE_A;
+	if (req->sba->hw_resp_size) {
+		cmdsp->flags |= BRCM_SBA_CMD_HAS_RESP;
+		cmdsp->resp = req->resp_dma;
+		cmdsp->resp_len = req->sba->hw_resp_size;
+	}
+	cmdsp->flags |= BRCM_SBA_CMD_HAS_OUTPUT;
+	cmdsp->data = *dst_p + msg_offset;
+	cmdsp->data_len = msg_len;
+	cmdsp++;
+
+skip_p:
+	if (!dst_q)
+		goto skip_q;
+
+	/* Type-A command to zero all buffers */
+	cmd = sba_cmd_enc(0x0, SBA_TYPE_A,
+			  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+	cmd = sba_cmd_enc(cmd, msg_len,
+			  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+	cmd = sba_cmd_enc(cmd, SBA_CMD_ZERO_ALL_BUFFERS,
+			  SBA_CMD_SHIFT, SBA_CMD_MASK);
+	cmdsp->cmd = cmd;
+	*cmdsp->cmd_dma = cpu_to_le64(cmd);
+	cmdsp->flags = BRCM_SBA_CMD_TYPE_A;
+	cmdsp++;
+
+	if (dpos == 255)
+		goto skip_q_computation;
+	pos = (dpos < req->sba->max_pq_coefs) ?
+		dpos : (req->sba->max_pq_coefs - 1);
+
+	/*
+	 * Type-B command to generate initial Q from data
+	 * and store output into buf0
+	 */
+	cmd = sba_cmd_enc(0x0, SBA_TYPE_B,
+			  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+	cmd = sba_cmd_enc(cmd, msg_len,
+			  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+	c_mdata = sba_cmd_pq_c_mdata(pos, 0, 0);
+	cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+			  SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+	cmd = sba_cmd_enc(cmd, SBA_C_MDATA_MS(c_mdata),
+			  SBA_C_MDATA_MS_SHIFT, SBA_C_MDATA_MS_MASK);
+	cmd = sba_cmd_enc(cmd, SBA_CMD_GALOIS,
+			  SBA_CMD_SHIFT, SBA_CMD_MASK);
+	cmdsp->cmd = cmd;
+	*cmdsp->cmd_dma = cpu_to_le64(cmd);
+	cmdsp->flags = BRCM_SBA_CMD_TYPE_B;
+	cmdsp->data = src + msg_offset;
+	cmdsp->data_len = msg_len;
+	cmdsp++;
+
+	dpos -= pos;
+
+	/* Multiple Type-A command to generate final Q */
+	while (dpos) {
+		pos = (dpos < req->sba->max_pq_coefs) ?
+			dpos : (req->sba->max_pq_coefs - 1);
+
+		/*
+		 * Type-A command to generate Q with buf0 and
+		 * buf1 store result in buf0
+		 */
+		cmd = sba_cmd_enc(0x0, SBA_TYPE_A,
+				  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+		cmd = sba_cmd_enc(cmd, msg_len,
+				  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+		c_mdata = sba_cmd_pq_c_mdata(pos, 0, 1);
+		cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+				  SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+		cmd = sba_cmd_enc(cmd, SBA_C_MDATA_MS(c_mdata),
+				  SBA_C_MDATA_MS_SHIFT, SBA_C_MDATA_MS_MASK);
+		cmd = sba_cmd_enc(cmd, SBA_CMD_GALOIS,
+				  SBA_CMD_SHIFT, SBA_CMD_MASK);
+		cmdsp->cmd = cmd;
+		*cmdsp->cmd_dma = cpu_to_le64(cmd);
+		cmdsp->flags = BRCM_SBA_CMD_TYPE_A;
+		cmdsp++;
+
+		dpos -= pos;
+	}
+
+skip_q_computation:
+	if (pq_continue) {
+		/*
+		 * Type-B command to XOR previous output with
+		 * buf0 and write it into buf0
+		 */
+		cmd = sba_cmd_enc(0x0, SBA_TYPE_B,
+				  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+		cmd = sba_cmd_enc(cmd, msg_len,
+				  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+		c_mdata = sba_cmd_xor_c_mdata(0, 0);
+		cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+				  SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+		cmd = sba_cmd_enc(cmd, SBA_CMD_XOR,
+				  SBA_CMD_SHIFT, SBA_CMD_MASK);
+		cmdsp->cmd = cmd;
+		*cmdsp->cmd_dma = cpu_to_le64(cmd);
+		cmdsp->flags = BRCM_SBA_CMD_TYPE_B;
+		cmdsp->data = *dst_q + msg_offset;
+		cmdsp->data_len = msg_len;
+		cmdsp++;
+	}
+
+	/* Type-A command to write buf0 */
+	cmd = sba_cmd_enc(0x0, SBA_TYPE_A,
+			  SBA_TYPE_SHIFT, SBA_TYPE_MASK);
+	cmd = sba_cmd_enc(cmd, msg_len,
+			  SBA_USER_DEF_SHIFT, SBA_USER_DEF_MASK);
+	cmd = sba_cmd_enc(cmd, 0x1,
+			  SBA_RESP_SHIFT, SBA_RESP_MASK);
+	c_mdata = sba_cmd_write_c_mdata(0);
+	cmd = sba_cmd_enc(cmd, SBA_C_MDATA_LS(c_mdata),
+			  SBA_C_MDATA_SHIFT, SBA_C_MDATA_MASK);
+	cmd = sba_cmd_enc(cmd, SBA_CMD_WRITE_BUFFER,
+			  SBA_CMD_SHIFT, SBA_CMD_MASK);
+	cmdsp->cmd = cmd;
+	*cmdsp->cmd_dma = cpu_to_le64(cmd);
+	cmdsp->flags = BRCM_SBA_CMD_TYPE_A;
+	if (req->sba->hw_resp_size) {
+		cmdsp->flags |= BRCM_SBA_CMD_HAS_RESP;
+		cmdsp->resp = req->resp_dma;
+		cmdsp->resp_len = req->sba->hw_resp_size;
+	}
+	cmdsp->flags |= BRCM_SBA_CMD_HAS_OUTPUT;
+	cmdsp->data = *dst_q + msg_offset;
+	cmdsp->data_len = msg_len;
+	cmdsp++;
+
+skip_q:
+	/* Fillup brcm_message */
+	msg->type = BRCM_MESSAGE_SBA;
+	msg->sba.cmds = cmds;
+	msg->sba.cmds_count = cmdsp - cmds;
+	msg->ctx = req;
+	msg->error = 0;
+}
+
+struct sba_request *
+sba_prep_dma_pq_single_req(struct sba_device *sba, dma_addr_t off,
+			   dma_addr_t *dst_p, dma_addr_t *dst_q,
+			   dma_addr_t src, u8 scf, size_t len,
+			   unsigned long flags)
+{
+	struct sba_request *req = NULL;
+
+	/* Alloc new request */
+	req = sba_alloc_request(sba);
+	if (!req)
+		return NULL;
+	req->fence = (flags & DMA_PREP_FENCE) ? true : false;
+
+	/* Fillup request messages */
+	sba_fillup_pq_single_msg(req,  dmaf_continue(flags),
+				 req->cmds, &req->msg, off, len,
+				 dst_p, dst_q, src, scf);
+
+	/* Init async_tx descriptor */
+	req->tx.flags = flags;
+	req->tx.cookie = -EBUSY;
+
+	return req;
+}
+
+static struct dma_async_tx_descriptor *
+sba_prep_dma_pq(struct dma_chan *dchan, dma_addr_t *dst, dma_addr_t *src,
+		u32 src_cnt, const u8 *scf, size_t len, unsigned long flags)
+{
+	u32 i, dst_q_index;
+	size_t req_len;
+	bool slow = false;
+	dma_addr_t off = 0;
+	dma_addr_t *dst_p = NULL, *dst_q = NULL;
+	struct sba_device *sba = to_sba_device(dchan);
+	struct sba_request *first = NULL, *req;
+
+	/* Sanity checks */
+	if (unlikely(src_cnt > sba->max_pq_srcs))
+		return NULL;
+	for (i = 0; i < src_cnt; i++)
+		if (sba->max_pq_coefs <= raid6_gflog[scf[i]])
+			slow = true;
+
+	/* Figure-out P and Q destination addresses */
+	if (!(flags & DMA_PREP_PQ_DISABLE_P))
+		dst_p = &dst[0];
+	if (!(flags & DMA_PREP_PQ_DISABLE_Q))
+		dst_q = &dst[1];
+
+	/* Create chained requests where each request is upto hw_buf_size */
+	while (len) {
+		req_len = (len < sba->hw_buf_size) ? len : sba->hw_buf_size;
+
+		if (slow) {
+			dst_q_index = src_cnt;
+
+			if (dst_q) {
+				for (i = 0; i < src_cnt; i++) {
+					if (*dst_q == src[i]) {
+						dst_q_index = i;
+						break;
+					}
+				}
+			}
+
+			if (dst_q_index < src_cnt) {
+				i = dst_q_index;
+				req = sba_prep_dma_pq_single_req(sba,
+					off, dst_p, dst_q, src[i], scf[i],
+					req_len, flags | DMA_PREP_FENCE);
+				if (!req)
+					goto fail;
+
+				if (first)
+					sba_chain_request(first, req);
+				else
+					first = req;
+
+				flags |= DMA_PREP_CONTINUE;
+			}
+
+			for (i = 0; i < src_cnt; i++) {
+				if (dst_q_index == i)
+					continue;
+
+				req = sba_prep_dma_pq_single_req(sba,
+					off, dst_p, dst_q, src[i], scf[i],
+					req_len, flags | DMA_PREP_FENCE);
+				if (!req)
+					goto fail;
+
+				if (first)
+					sba_chain_request(first, req);
+				else
+					first = req;
+
+				flags |= DMA_PREP_CONTINUE;
+			}
+		} else {
+			req = sba_prep_dma_pq_req(sba, off,
+						  dst_p, dst_q, src, src_cnt,
+						  scf, req_len, flags);
+			if (!req)
+				goto fail;
+
+			if (first)
+				sba_chain_request(first, req);
+			else
+				first = req;
+		}
+
+		off += req_len;
+		len -= req_len;
+	}
+
+	return (first) ? &first->tx : NULL;
+
+fail:
+	if (first)
+		sba_free_chained_requests(first);
+	return NULL;
+}
+
+/* ====== Mailbox callbacks ===== */
+
+static void sba_dma_tx_actions(struct sba_request *req)
+{
+	struct dma_async_tx_descriptor *tx = &req->tx;
+
+	WARN_ON(tx->cookie < 0);
+
+	if (tx->cookie > 0) {
+		dma_cookie_complete(tx);
+
+		/*
+		 * Call the callback (must not sleep or submit new
+		 * operations to this channel)
+		 */
+		if (tx->callback)
+			tx->callback(tx->callback_param);
+
+		dma_descriptor_unmap(tx);
+	}
+
+	/* Run dependent operations */
+	dma_run_dependencies(tx);
+
+	/* If waiting for 'ack' then move to completed list */
+	if (!async_tx_test_ack(&req->tx))
+		sba_complete_chained_requests(req);
+	else
+		sba_free_chained_requests(req);
+}
+
+static void sba_receive_message(struct mbox_client *cl, void *msg)
+{
+	unsigned long flags;
+	struct brcm_message *m = msg;
+	struct sba_request *req = m->ctx, *req1;
+	struct sba_device *sba = req->sba;
+
+	/* Error count if message has error */
+	if (m->error < 0)
+		dev_err(sba->dev, "%s got message with error %d",
+			dma_chan_name(&sba->dma_chan), m->error);
+
+	/* Mark request as received */
+	sba_received_request(req);
+
+	/* Wait for all chained requests to be completed */
+	if (atomic_dec_return(&req->first->next_pending_count))
+		goto done;
+
+	/* Point to first request */
+	req = req->first;
+
+	/* Update request */
+	if (req->state == SBA_REQUEST_STATE_RECEIVED)
+		sba_dma_tx_actions(req);
+	else
+		sba_free_chained_requests(req);
+
+	spin_lock_irqsave(&sba->reqs_lock, flags);
+
+	/* Re-check all completed request waiting for 'ack' */
+	list_for_each_entry_safe(req, req1, &sba->reqs_completed_list, node) {
+		spin_unlock_irqrestore(&sba->reqs_lock, flags);
+		sba_dma_tx_actions(req);
+		spin_lock_irqsave(&sba->reqs_lock, flags);
+	}
+
+	spin_unlock_irqrestore(&sba->reqs_lock, flags);
+
+done:
+	/* Try to submit pending request */
+	sba_issue_pending(&sba->dma_chan);
+}
+
+/* ====== Platform driver routines ===== */
+
+static int sba_prealloc_channel_resources(struct sba_device *sba)
+{
+	int i, j, p, ret = 0;
+	struct sba_request *req = NULL;
+
+	sba->resp_base = dma_alloc_coherent(sba->dma_dev.dev,
+					    sba->max_resp_pool_size,
+					    &sba->resp_dma_base, GFP_KERNEL);
+	if (!sba->resp_base)
+		return -ENOMEM;
+
+	sba->cmds_base = dma_alloc_coherent(sba->dma_dev.dev,
+					    sba->max_cmds_pool_size,
+					    &sba->cmds_dma_base, GFP_KERNEL);
+	if (!sba->cmds_base) {
+		ret = -ENOMEM;
+		goto fail_free_resp_pool;
+	}
+
+	spin_lock_init(&sba->reqs_lock);
+	sba->reqs_fence = false;
+	INIT_LIST_HEAD(&sba->reqs_alloc_list);
+	INIT_LIST_HEAD(&sba->reqs_pending_list);
+	INIT_LIST_HEAD(&sba->reqs_active_list);
+	INIT_LIST_HEAD(&sba->reqs_received_list);
+	INIT_LIST_HEAD(&sba->reqs_completed_list);
+	INIT_LIST_HEAD(&sba->reqs_aborted_list);
+	INIT_LIST_HEAD(&sba->reqs_free_list);
+
+	sba->reqs = devm_kcalloc(sba->dev, sba->max_req,
+				 sizeof(*req), GFP_KERNEL);
+	if (!sba->reqs) {
+		ret = -ENOMEM;
+		goto fail_free_cmds_pool;
+	}
+
+	for (i = 0, p = 0; i < sba->max_req; i++) {
+		req = &sba->reqs[i];
+		INIT_LIST_HEAD(&req->node);
+		req->sba = sba;
+		req->state = SBA_REQUEST_STATE_FREE;
+		INIT_LIST_HEAD(&req->next);
+		req->next_count = 1;
+		atomic_set(&req->next_pending_count, 0);
+		req->fence = false;
+		req->resp = sba->resp_base + p;
+		req->resp_dma = sba->resp_dma_base + p;
+		p += sba->hw_resp_size;
+		req->cmds = devm_kcalloc(sba->dev, sba->max_cmd_per_req,
+					 sizeof(*req->cmds), GFP_KERNEL);
+		if (!req->cmds) {
+			ret = -ENOMEM;
+			goto fail_free_cmds_pool;
+		}
+		for (j = 0; j < sba->max_cmd_per_req; j++) {
+			req->cmds[j].cmd = 0;
+			req->cmds[j].cmd_dma = sba->cmds_base +
+				(i * sba->max_cmd_per_req + j) * sizeof(u64);
+			req->cmds[j].cmd_dma_addr = sba->cmds_dma_base +
+				(i * sba->max_cmd_per_req + j) * sizeof(u64);
+			req->cmds[j].flags = 0;
+		}
+		memset(&req->msg, 0, sizeof(req->msg));
+		dma_async_tx_descriptor_init(&req->tx, &sba->dma_chan);
+		req->tx.tx_submit = sba_tx_submit;
+		req->tx.phys = req->resp_dma;
+		list_add_tail(&req->node, &sba->reqs_free_list);
+	}
+
+	sba->reqs_free_count = sba->max_req;
+
+	return 0;
+
+fail_free_cmds_pool:
+	dma_free_coherent(sba->dma_dev.dev,
+			  sba->max_cmds_pool_size,
+			  sba->cmds_base, sba->cmds_dma_base);
+fail_free_resp_pool:
+	dma_free_coherent(sba->dma_dev.dev,
+			  sba->max_resp_pool_size,
+			  sba->resp_base, sba->resp_dma_base);
+	return ret;
+}
+
+static void sba_freeup_channel_resources(struct sba_device *sba)
+{
+	dmaengine_terminate_all(&sba->dma_chan);
+	dma_free_coherent(sba->dma_dev.dev, sba->max_cmds_pool_size,
+			  sba->cmds_base, sba->cmds_dma_base);
+	dma_free_coherent(sba->dma_dev.dev, sba->max_resp_pool_size,
+			  sba->resp_base, sba->resp_dma_base);
+	sba->resp_base = NULL;
+	sba->resp_dma_base = 0;
+}
+
+static int sba_async_register(struct sba_device *sba)
+{
+	int ret;
+	struct dma_device *dma_dev = &sba->dma_dev;
+
+	/* Initialize DMA channel cookie */
+	sba->dma_chan.device = dma_dev;
+	dma_cookie_init(&sba->dma_chan);
+
+	/* Initialize DMA device capability mask */
+	dma_cap_zero(dma_dev->cap_mask);
+	dma_cap_set(DMA_MEMCPY, dma_dev->cap_mask);
+	dma_cap_set(DMA_XOR, dma_dev->cap_mask);
+	dma_cap_set(DMA_PQ, dma_dev->cap_mask);
+
+	/*
+	 * Set mailbox channel device as the base device of
+	 * our dma_device because the actual memory accesses
+	 * will be done by mailbox controller
+	 */
+	dma_dev->dev = sba->mbox_dev;
+
+	/* Set base prep routines */
+	dma_dev->device_free_chan_resources = sba_free_chan_resources;
+	dma_dev->device_terminate_all = sba_device_terminate_all;
+	dma_dev->device_issue_pending = sba_issue_pending;
+	dma_dev->device_tx_status = sba_tx_status;
+
+	/* Set memcpy routines and capability */
+	if (dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask))
+		dma_dev->device_prep_dma_memcpy = sba_prep_dma_memcpy;
+
+	/* Set xor routines and capability */
+	if (dma_has_cap(DMA_XOR, dma_dev->cap_mask)) {
+		dma_dev->device_prep_dma_xor = sba_prep_dma_xor;
+		dma_dev->max_xor = sba->max_xor_srcs;
+	}
+
+	/* Set pq routines and capability */
+	if (dma_has_cap(DMA_PQ, dma_dev->cap_mask)) {
+		dma_dev->device_prep_dma_pq = sba_prep_dma_pq;
+		dma_set_maxpq(dma_dev, sba->max_pq_srcs, 0);
+	}
+
+	/* Initialize DMA device channel list */
+	INIT_LIST_HEAD(&dma_dev->channels);
+	list_add_tail(&sba->dma_chan.device_node, &dma_dev->channels);
+
+	/* Register with Linux async DMA framework*/
+	ret = dma_async_device_register(dma_dev);
+	if (ret) {
+		dev_err(sba->dev, "async device register error %d", ret);
+		return ret;
+	}
+
+	dev_info(sba->dev, "%s capabilities: %s%s%s\n",
+		 dma_chan_name(&sba->dma_chan),
+		 dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask) ? "memcpy " : "",
+		 dma_has_cap(DMA_XOR, dma_dev->cap_mask) ? "xor " : "",
+		 dma_has_cap(DMA_PQ, dma_dev->cap_mask) ? "pq " : "");
+
+	return 0;
+}
+
+static int sba_probe(struct platform_device *pdev)
+{
+	int i, ret = 0, mchans_count;
+	struct sba_device *sba;
+	struct platform_device *mbox_pdev;
+	struct of_phandle_args args;
+
+	/* Allocate main SBA struct */
+	sba = devm_kzalloc(&pdev->dev, sizeof(*sba), GFP_KERNEL);
+	if (!sba)
+		return -ENOMEM;
+
+	sba->dev = &pdev->dev;
+	platform_set_drvdata(pdev, sba);
+
+	/* Determine SBA version from DT compatible string */
+	if (of_device_is_compatible(sba->dev->of_node, "brcm,iproc-sba"))
+		sba->ver = SBA_VER_1;
+	else if (of_device_is_compatible(sba->dev->of_node,
+					 "brcm,iproc-sba-v2"))
+		sba->ver = SBA_VER_2;
+	else
+		return -ENODEV;
+
+	/* Derived Configuration parameters */
+	switch (sba->ver) {
+	case SBA_VER_1:
+		sba->max_req = 1024;
+		sba->hw_buf_size = 4096;
+		sba->hw_resp_size = 8;
+		sba->max_pq_coefs = 6;
+		sba->max_pq_srcs = 6;
+		break;
+	case SBA_VER_2:
+		sba->max_req = 1024;
+		sba->hw_buf_size = 4096;
+		sba->hw_resp_size = 8;
+		sba->max_pq_coefs = 30;
+		/*
+		 * We can support max_pq_srcs == max_pq_coefs because
+		 * we are limited by number of SBA commands that we can
+		 * fit in one message for underlying ring manager HW.
+		 */
+		sba->max_pq_srcs = 12;
+		break;
+	default:
+		return -EINVAL;
+	}
+	sba->max_cmd_per_req = sba->max_pq_srcs + 3;
+	sba->max_xor_srcs = sba->max_cmd_per_req - 1;
+	sba->max_resp_pool_size = sba->max_req * sba->hw_resp_size;
+	sba->max_cmds_pool_size = sba->max_req *
+				  sba->max_cmd_per_req * sizeof(u64);
+
+	/* Setup mailbox client */
+	sba->client.dev			= &pdev->dev;
+	sba->client.rx_callback		= sba_receive_message;
+	sba->client.tx_block		= false;
+	sba->client.knows_txdone	= false;
+	sba->client.tx_tout		= 0;
+
+	/* Number of channels equals number of mailbox channels */
+	ret = of_count_phandle_with_args(pdev->dev.of_node,
+					 "mboxes", "#mbox-cells");
+	if (ret <= 0)
+		return -ENODEV;
+	mchans_count = ret;
+	sba->mchans_count = 0;
+	atomic_set(&sba->mchans_current, 0);
+
+	/* Allocate mailbox channel array */
+	sba->mchans = devm_kcalloc(&pdev->dev, sba->mchans_count,
+				   sizeof(*sba->mchans), GFP_KERNEL);
+	if (!sba->mchans)
+		return -ENOMEM;
+
+	/* Request mailbox channels */
+	for (i = 0; i < mchans_count; i++) {
+		sba->mchans[i] = mbox_request_channel(&sba->client, i);
+		if (IS_ERR(sba->mchans[i])) {
+			ret = PTR_ERR(sba->mchans[i]);
+			goto fail_free_mchans;
+		}
+		sba->mchans_count++;
+	}
+
+	/* Find-out underlying mailbox device */
+	ret = of_parse_phandle_with_args(pdev->dev.of_node,
+					 "mboxes", "#mbox-cells", 0, &args);
+	if (ret)
+		goto fail_free_mchans;
+	mbox_pdev = of_find_device_by_node(args.np);
+	of_node_put(args.np);
+	if (!mbox_pdev) {
+		ret = -ENODEV;
+		goto fail_free_mchans;
+	}
+	sba->mbox_dev = &mbox_pdev->dev;
+
+	/* All mailbox channels should be of same ring manager device */
+	for (i = 1; i < mchans_count; i++) {
+		ret = of_parse_phandle_with_args(pdev->dev.of_node,
+					 "mboxes", "#mbox-cells", i, &args);
+		if (ret)
+			goto fail_free_mchans;
+		mbox_pdev = of_find_device_by_node(args.np);
+		of_node_put(args.np);
+		if (sba->mbox_dev != &mbox_pdev->dev) {
+			ret = -EINVAL;
+			goto fail_free_mchans;
+		}
+	}
+
+	/* Register DMA device with linux async framework */
+	ret = sba_async_register(sba);
+	if (ret)
+		goto fail_free_mchans;
+
+	/* Prealloc channel resource */
+	ret = sba_prealloc_channel_resources(sba);
+	if (ret)
+		goto fail_async_dev_unreg;
+
+	/* Print device info */
+	dev_info(sba->dev, "%s using SBAv%d and %d mailbox channels",
+		 dma_chan_name(&sba->dma_chan), sba->ver+1,
+		 sba->mchans_count);
+
+	return 0;
+
+fail_async_dev_unreg:
+	dma_async_device_unregister(&sba->dma_dev);
+fail_free_mchans:
+	for (i = 0; i < sba->mchans_count; i++)
+		mbox_free_channel(sba->mchans[i]);
+	return ret;
+}
+
+static int sba_remove(struct platform_device *pdev)
+{
+	int i;
+	struct sba_device *sba = platform_get_drvdata(pdev);
+
+	sba_freeup_channel_resources(sba);
+
+	dma_async_device_unregister(&sba->dma_dev);
+
+	for (i = 0; i < sba->mchans_count; i++)
+		mbox_free_channel(sba->mchans[i]);
+
+	return 0;
+}
+
+static const struct of_device_id sba_of_match[] = {
+	{ .compatible = "brcm,iproc-sba", },
+	{ .compatible = "brcm,iproc-sba-v2", },
+	{},
+};
+MODULE_DEVICE_TABLE(of, sba_of_match);
+
+static struct platform_driver sba_driver = {
+	.probe = sba_probe,
+	.remove = sba_remove,
+	.driver = {
+		.name = "bcm-sba-raid",
+		.of_match_table = sba_of_match,
+	},
+};
+module_platform_driver(sba_driver);
+
+MODULE_DESCRIPTION("Broadcom SBA RAID driver");
+MODULE_AUTHOR("Anup Patel <anup.patel@broadcom.com>");
+MODULE_LICENSE("GPL v2");
-- 
2.7.4


^ permalink raw reply related

* [PATCH v4 4/4] dt-bindings: Add DT bindings document for Broadcom SBA RAID driver
From: Anup Patel @ 2017-02-14  6:51 UTC (permalink / raw)
  To: Vinod Koul, Rob Herring, Mark Rutland, Herbert Xu,
	David S . Miller, Jassi Brar
  Cc: Dan Williams, Ray Jui, Scott Branden, Jon Mason, Rob Rice,
	bcm-kernel-feedback-list, dmaengine, devicetree, linux-arm-kernel,
	linux-kernel, linux-crypto, linux-raid, Anup Patel
In-Reply-To: <1487055112-5185-1-git-send-email-anup.patel@broadcom.com>

This patch adds the DT bindings document for newly added Broadcom
SBA RAID driver.

Signed-off-by: Anup Patel <anup.patel@broadcom.com>
Reviewed-by: Ray Jui <ray.jui@broadcom.com>
Reviewed-by: Scott Branden <scott.branden@broadcom.com>
---
 .../devicetree/bindings/dma/brcm,iproc-sba.txt     | 29 ++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/dma/brcm,iproc-sba.txt

diff --git a/Documentation/devicetree/bindings/dma/brcm,iproc-sba.txt b/Documentation/devicetree/bindings/dma/brcm,iproc-sba.txt
new file mode 100644
index 0000000..092913a
--- /dev/null
+++ b/Documentation/devicetree/bindings/dma/brcm,iproc-sba.txt
@@ -0,0 +1,29 @@
+* Broadcom SBA RAID engine
+
+Required properties:
+- compatible: Should be one of the following
+	      "brcm,iproc-sba"
+	      "brcm,iproc-sba-v2"
+  The "brcm,iproc-sba" has support for only 6 PQ coefficients
+  The "brcm,iproc-sba-v2" has support for only 30 PQ coefficients
+- mboxes: List of phandle and mailbox channel specifiers
+
+Example:
+
+raid_mbox: mbox@67400000 {
+	...
+	#mbox-cells = <3>;
+	...
+};
+
+raid0 {
+	compatible = "brcm,iproc-sba-v2";
+	mboxes = <&raid_mbox 0 0x1 0xffff>,
+		 <&raid_mbox 1 0x1 0xffff>,
+		 <&raid_mbox 2 0x1 0xffff>,
+		 <&raid_mbox 3 0x1 0xffff>,
+		 <&raid_mbox 4 0x1 0xffff>,
+		 <&raid_mbox 5 0x1 0xffff>,
+		 <&raid_mbox 6 0x1 0xffff>,
+		 <&raid_mbox 7 0x1 0xffff>;
+};
-- 
2.7.4

^ permalink raw reply related

* [PATCH] Monitor: dev should be a block file in waitclean
From: Zhilong Liu @ 2017-02-14  7:50 UTC (permalink / raw)
  To: Jes.Sorensen; +Cc: linux-raid, Zhilong Liu

    mdadm --wait-clean /dev/mdX, the dev should
    be a block file, otherwise fd2devnm returns
    NULL and triggers core dumped.

Signed-off-by: Zhilong Liu <zlliu@suse.com>

diff --git a/Monitor.c b/Monitor.c
index 802a9d9..d16ac49 100644
--- a/Monitor.c
+++ b/Monitor.c
@@ -1060,7 +1060,17 @@ int WaitClean(char *dev, int sock, int verbose)
 	struct mdinfo *mdi;
 	int rv = 1;
 	char devnm[32];
+	struct stat stb;
 
+	if (stat(dev, &stb) != 0) {
+		pr_err("Cannot find %s: %s\n", dev,
+			strerror(errno));
+		return 2;
+	}
+	if ((S_IFMT & stb.st_mode) != S_IFBLK) {
+		pr_err("%s is not a block device.\n", dev);
+		return 2;
+	}
 	fd = open(dev, O_RDONLY);
 	if (fd < 0) {
 		if (verbose)
-- 
2.6.6


^ permalink raw reply related

* Which API for md state monitoring - sysfs vs. /proc/mdstat vs. GET_DISK_INFO ioctl
From: Tim Small @ 2017-02-14 14:43 UTC (permalink / raw)
  To: linux-raid; +Cc: shli

Hello,

A design question for a user-space RAID monitoring tool.  The tool is
intended to export machine-readable md state so that the system owners
can creating automated alert policies etc.

Should it get its information on md device state from /proc/mdstat, or
/sys/block/*/md/ as a data source, or should it use the ioctl interface
that mdadm uses (on the basis that this is more complete / heavily tested).

I assume the issues are:

. usability (/proc/mdstat is intended to be human-readable so machine
parsing it seems bleugh - it's also incomplete?)

. stability (perhaps /sys/block/*/md/ is less stable than the ioctl or
proc interface?  Or perhaps it's just fine and won't break in
backward-incompatible ways?).

Any thoughts?

Should I see which way the recent "add bad block flag to disk state"
patch-set thread goes?

Tim.

^ permalink raw reply

* Re: Which API for md state monitoring - sysfs vs. /proc/mdstat vs. GET_DISK_INFO ioctl
From: Hannes Reinecke @ 2017-02-14 15:26 UTC (permalink / raw)
  To: Tim Small, linux-raid; +Cc: shli
In-Reply-To: <994ffd50-9041-a5e0-bbd9-1db704f6ad63@buttersideup.com>

On 02/14/2017 03:43 PM, Tim Small wrote:
> Hello,
> 
> A design question for a user-space RAID monitoring tool.  The tool is
> intended to export machine-readable md state so that the system owners
> can creating automated alert policies etc.
> 
> Should it get its information on md device state from /proc/mdstat, or
> /sys/block/*/md/ as a data source, or should it use the ioctl interface
> that mdadm uses (on the basis that this is more complete / heavily tested).
> 
> I assume the issues are:
> 
> . usability (/proc/mdstat is intended to be human-readable so machine
> parsing it seems bleugh - it's also incomplete?)
> 
Yes.

> . stability (perhaps /sys/block/*/md/ is less stable than the ioctl or
> proc interface?  Or perhaps it's just fine and won't break in
> backward-incompatible ways?).
> 
I'm not sure if there are any guarantees here; we _try_ to keep the
sysfs interface stable, but ...

> Any thoughts?
> 
> Should I see which way the recent "add bad block flag to disk state"
> patch-set thread goes?
> 
Hehe.

Unfortunately the only tool able to provide a consistent view of the MD
RAID status is called 'mdadm'.

Problem is that you each of the various interfaces (ioctl, /proc/mdstat,
/sys/block/*/md) provides you with different information. So your
userspace tool would need to parse all of these information sources to
get a consistent view.

I did a similar thing myself for my md_monitor program
(github.com:/hreinecke/md_monitor.git), so you could have a look there
for some further information.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare@suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)

^ permalink raw reply

* [PATCH v2 0/5] md: use bio_clone_fast()
From: Ming Lei @ 2017-02-14 15:28 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig, NeilBrown
  Cc: Ming Lei

Hi,

This patches replaces bio_clone() with bio_fast_clone() in
bio_clone_mddev() because:

1) bio_clone_mddev() is used in raid normal I/O and isn't in
resync I/O path, and all the direct access to bvec table in
raid happens on resync I/O only except for write behind of raid1.
Write behind is treated specially, so the replacement is safe.

2) for write behind, bio_clone() is kept, but this patchset
introduces bio_clone_bioset_partial() to just clone one specific 
bvecs range instead of whole table. Then write behind is improved
too.

V2:
	1) move 3rd patch into 2nd one
	2) kill  bio_clone_mddev() and use  bio_clone_fast() directly
	3) define local variable 'offset' as sector_t in raid1_write_request()

V1:
	1) don't introduce bio_clone_slow_mddev_partial()
	2) return failure if mddev->bio_set can't be created
	3) remove check in bio_clone_mddev() as suggested by
	Christoph Hellwig.
	4) rename bio_clone_mddev() as bio_clone_fast_mddev()

Ming Lei (5):
  block: introduce bio_clone_bioset_partial()
  md: fail if mddev->bio_set can't be created
  md/raid1: use bio_clone_bioset_partial() in case of write behind
  md: remove unnecessary check on mddev
  md: fast clone bio in bio_clone_mddev()

 block/bio.c         | 61 +++++++++++++++++++++++++++++++++++++++++------------
 drivers/md/faulty.c |  2 +-
 drivers/md/md.c     | 15 ++++---------
 drivers/md/md.h     |  2 --
 drivers/md/raid1.c  | 28 +++++++++++++++++-------
 drivers/md/raid10.c | 11 +++++-----
 drivers/md/raid5.c  |  4 ++--
 include/linux/bio.h | 11 ++++++++--
 8 files changed, 89 insertions(+), 45 deletions(-)

-- 
2.7.4

^ permalink raw reply

* [PATCH v2 1/5] block: introduce bio_clone_bioset_partial()
From: Ming Lei @ 2017-02-14 15:28 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig, NeilBrown
  Cc: Ming Lei
In-Reply-To: <1487086143-10255-1-git-send-email-tom.leiming@gmail.com>

md still need bio clone(not the fast version) for behind write,
and it is more efficient to use bio_clone_bioset_partial().

The idea is simple and just copy the bvecs range specified from
parameters.

Reviewed-by: Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 block/bio.c         | 61 +++++++++++++++++++++++++++++++++++++++++------------
 include/linux/bio.h | 11 ++++++++--
 2 files changed, 57 insertions(+), 15 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 4b564d0c3e29..5eec5e08417f 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -625,21 +625,20 @@ struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
 }
 EXPORT_SYMBOL(bio_clone_fast);
 
-/**
- * 	bio_clone_bioset - clone a bio
- * 	@bio_src: bio to clone
- *	@gfp_mask: allocation priority
- *	@bs: bio_set to allocate from
- *
- *	Clone bio. Caller will own the returned bio, but not the actual data it
- *	points to. Reference count of returned bio will be one.
- */
-struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
-			     struct bio_set *bs)
+static struct bio *__bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
+				      struct bio_set *bs, int offset,
+				      int size)
 {
 	struct bvec_iter iter;
 	struct bio_vec bv;
 	struct bio *bio;
+	struct bvec_iter iter_src = bio_src->bi_iter;
+
+	/* for supporting partial clone */
+	if (offset || size != bio_src->bi_iter.bi_size) {
+		bio_advance_iter(bio_src, &iter_src, offset);
+		iter_src.bi_size = size;
+	}
 
 	/*
 	 * Pre immutable biovecs, __bio_clone() used to just do a memcpy from
@@ -663,7 +662,8 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
 	 *    __bio_clone_fast() anyways.
 	 */
 
-	bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
+	bio = bio_alloc_bioset(gfp_mask, __bio_segments(bio_src,
+			       &iter_src), bs);
 	if (!bio)
 		return NULL;
 	bio->bi_bdev		= bio_src->bi_bdev;
@@ -680,7 +680,7 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
 		bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
 		break;
 	default:
-		bio_for_each_segment(bv, bio_src, iter)
+		__bio_for_each_segment(bv, bio_src, iter, iter_src)
 			bio->bi_io_vec[bio->bi_vcnt++] = bv;
 		break;
 	}
@@ -699,9 +699,44 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
 
 	return bio;
 }
+
+/**
+ * 	bio_clone_bioset - clone a bio
+ * 	@bio_src: bio to clone
+ *	@gfp_mask: allocation priority
+ *	@bs: bio_set to allocate from
+ *
+ *	Clone bio. Caller will own the returned bio, but not the actual data it
+ *	points to. Reference count of returned bio will be one.
+ */
+struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
+			     struct bio_set *bs)
+{
+	return __bio_clone_bioset(bio_src, gfp_mask, bs, 0,
+				  bio_src->bi_iter.bi_size);
+}
 EXPORT_SYMBOL(bio_clone_bioset);
 
 /**
+ * 	bio_clone_bioset_partial - clone a partial bio
+ * 	@bio_src: bio to clone
+ *	@gfp_mask: allocation priority
+ *	@bs: bio_set to allocate from
+ *	@offset: cloned starting from the offset
+ *	@size: size for the cloned bio
+ *
+ *	Clone bio. Caller will own the returned bio, but not the actual data it
+ *	points to. Reference count of returned bio will be one.
+ */
+struct bio *bio_clone_bioset_partial(struct bio *bio_src, gfp_t gfp_mask,
+				     struct bio_set *bs, int offset,
+				     int size)
+{
+	return __bio_clone_bioset(bio_src, gfp_mask, bs, offset, size);
+}
+EXPORT_SYMBOL(bio_clone_bioset_partial);
+
+/**
  *	bio_add_pc_page	-	attempt to add page to bio
  *	@q: the target queue
  *	@bio: destination bio
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 7cf8a6c70a3f..8e521194f6fc 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -183,7 +183,7 @@ static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter,
 
 #define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len)
 
-static inline unsigned bio_segments(struct bio *bio)
+static inline unsigned __bio_segments(struct bio *bio, struct bvec_iter *bvec)
 {
 	unsigned segs = 0;
 	struct bio_vec bv;
@@ -205,12 +205,17 @@ static inline unsigned bio_segments(struct bio *bio)
 		break;
 	}
 
-	bio_for_each_segment(bv, bio, iter)
+	__bio_for_each_segment(bv, bio, iter, *bvec)
 		segs++;
 
 	return segs;
 }
 
+static inline unsigned bio_segments(struct bio *bio)
+{
+	return __bio_segments(bio, &bio->bi_iter);
+}
+
 /*
  * get a reference to a bio, so it won't disappear. the intended use is
  * something like:
@@ -384,6 +389,8 @@ extern void bio_put(struct bio *);
 extern void __bio_clone_fast(struct bio *, struct bio *);
 extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *);
 extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs);
+extern struct bio *bio_clone_bioset_partial(struct bio *, gfp_t,
+					    struct bio_set *, int, int);
 
 extern struct bio_set *fs_bio_set;
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH v2 2/5] md: fail if mddev->bio_set can't be created
From: Ming Lei @ 2017-02-14 15:29 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig, NeilBrown
  Cc: Ming Lei
In-Reply-To: <1487086143-10255-1-git-send-email-tom.leiming@gmail.com>

The current behaviour is to fall back to allocate
bio from 'fs_bio_set', that isn't a correct way
because it might cause deadlock.

So this patch simply return failure if mddev->bio_set
can't be created.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/md.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 7e9a495e4160..b5e2adf3493b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5228,8 +5228,11 @@ int md_run(struct mddev *mddev)
 		sysfs_notify_dirent_safe(rdev->sysfs_state);
 	}
 
-	if (mddev->bio_set == NULL)
+	if (mddev->bio_set == NULL) {
 		mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0);
+		if (!mddev->bio_set)
+			return -ENOMEM;
+	}
 
 	spin_lock(&pers_lock);
 	pers = find_pers(mddev->level, mddev->clevel);
-- 
2.7.4

^ permalink raw reply related

* [PATCH v2 3/5] md/raid1: use bio_clone_bioset_partial() in case of write behind
From: Ming Lei @ 2017-02-14 15:29 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig, NeilBrown
  Cc: Ming Lei
In-Reply-To: <1487086143-10255-1-git-send-email-tom.leiming@gmail.com>

Write behind need to replace pages in bio's bvecs, and we have
to clone a fresh bio with new bvec table, so use the introduced
bio_clone_bioset_partial() for it.

For other bio_clone_mddev() cases, we will use fast clone since
they don't need to touch bvec table.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid1.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 830ff2b20346..b7548e0a9eca 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1341,13 +1341,12 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 
 	first_clone = 1;
 	for (i = 0; i < disks; i++) {
-		struct bio *mbio;
+		struct bio *mbio = NULL;
+		sector_t offset;
 		if (!r1_bio->bios[i])
 			continue;
 
-		mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-		bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector,
-			 max_sectors);
+		offset = r1_bio->sector - bio->bi_iter.bi_sector;
 
 		if (first_clone) {
 			/* do behind I/O ?
@@ -1357,8 +1356,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 			if (bitmap &&
 			    (atomic_read(&bitmap->behind_writes)
 			     < mddev->bitmap_info.max_write_behind) &&
-			    !waitqueue_active(&bitmap->behind_wait))
+			    !waitqueue_active(&bitmap->behind_wait)) {
+				mbio = bio_clone_bioset_partial(bio, GFP_NOIO,
+								mddev->bio_set,
+								offset,
+								max_sectors);
 				alloc_behind_pages(mbio, r1_bio);
+			}
 
 			bitmap_startwrite(bitmap, r1_bio->sector,
 					  r1_bio->sectors,
@@ -1366,6 +1370,12 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 						   &r1_bio->state));
 			first_clone = 0;
 		}
+
+		if (!mbio) {
+			mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+			bio_trim(mbio, offset, max_sectors);
+		}
+
 		if (r1_bio->behind_bvecs) {
 			struct bio_vec *bvec;
 			int j;
-- 
2.7.4


^ permalink raw reply related

* [PATCH v2 4/5] md: remove unnecessary check on mddev
From: Ming Lei @ 2017-02-14 15:29 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig, NeilBrown
  Cc: Ming Lei
In-Reply-To: <1487086143-10255-1-git-send-email-tom.leiming@gmail.com>

mddev is never NULL and neither is ->bio_set, so
remove the check.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/md.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index b5e2adf3493b..6cd96fde2a67 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -193,9 +193,6 @@ EXPORT_SYMBOL_GPL(bio_alloc_mddev);
 struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
 			    struct mddev *mddev)
 {
-	if (!mddev || !mddev->bio_set)
-		return bio_clone(bio, gfp_mask);
-
 	return bio_clone_bioset(bio, gfp_mask, mddev->bio_set);
 }
 EXPORT_SYMBOL_GPL(bio_clone_mddev);
-- 
2.7.4

^ permalink raw reply related

* [PATCH v2 5/5] md: fast clone bio in bio_clone_mddev()
From: Ming Lei @ 2017-02-14 15:29 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig, NeilBrown
  Cc: Ming Lei
In-Reply-To: <1487086143-10255-1-git-send-email-tom.leiming@gmail.com>

Firstly bio_clone_mddev() is used in raid normal I/O and isn't
in resync I/O path.

Secondly all the direct access to bvec table in raid happens on
resync I/O except for write behind of raid1, in which we still
use bio_clone() for allocating new bvec table.

So this patch replaces bio_clone() with bio_clone_fast()
in bio_clone_mddev().

Also kill bio_clone_mddev() and call bio_clone_fast() directly, as
suggested by Christoph Hellwig.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/faulty.c |  2 +-
 drivers/md/md.c     |  7 -------
 drivers/md/md.h     |  2 --
 drivers/md/raid1.c  | 10 ++++++----
 drivers/md/raid10.c | 11 +++++------
 drivers/md/raid5.c  |  4 ++--
 6 files changed, 14 insertions(+), 22 deletions(-)

diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 685aa2d77e25..b0536cfd8e17 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -214,7 +214,7 @@ static void faulty_make_request(struct mddev *mddev, struct bio *bio)
 		}
 	}
 	if (failit) {
-		struct bio *b = bio_clone_mddev(bio, GFP_NOIO, mddev);
+		struct bio *b = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
 
 		b->bi_bdev = conf->rdev->bdev;
 		b->bi_private = bio;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 6cd96fde2a67..985374f20e2e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -190,13 +190,6 @@ struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
 }
 EXPORT_SYMBOL_GPL(bio_alloc_mddev);
 
-struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
-			    struct mddev *mddev)
-{
-	return bio_clone_bioset(bio, gfp_mask, mddev->bio_set);
-}
-EXPORT_SYMBOL_GPL(bio_clone_mddev);
-
 /*
  * We have a system wide 'event count' that is incremented
  * on any 'interesting' event, and readers of /proc/mdstat
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 2a514036a83d..a86ad62079de 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -673,8 +673,6 @@ extern void md_rdev_clear(struct md_rdev *rdev);
 
 extern void mddev_suspend(struct mddev *mddev);
 extern void mddev_resume(struct mddev *mddev);
-extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
-				   struct mddev *mddev);
 extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
 				   struct mddev *mddev);
 
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index b7548e0a9eca..85f309836fd7 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1108,7 +1108,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
 	r1_bio->read_disk = rdisk;
 	r1_bio->start_next_window = 0;
 
-	read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+	read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
 	bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector,
 		 max_sectors);
 
@@ -1372,7 +1372,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 		}
 
 		if (!mbio) {
-			mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+			mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
 			bio_trim(mbio, offset, max_sectors);
 		}
 
@@ -2283,7 +2283,8 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
 
 			wbio->bi_vcnt = vcnt;
 		} else {
-			wbio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
+			wbio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO,
+					      mddev->bio_set);
 		}
 
 		bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
@@ -2421,7 +2422,8 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
 		const unsigned long do_sync
 			= r1_bio->master_bio->bi_opf & REQ_SYNC;
 		r1_bio->read_disk = disk;
-		bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
+		bio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO,
+				     mddev->bio_set);
 		bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector,
 			 max_sectors);
 		r1_bio->bios[r1_bio->read_disk] = bio;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 6bc5c2a85160..063c43d83b72 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1132,7 +1132,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
 	}
 	slot = r10_bio->read_slot;
 
-	read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+	read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
 	bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector,
 		 max_sectors);
 
@@ -1406,7 +1406,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 		int d = r10_bio->devs[i].devnum;
 		if (r10_bio->devs[i].bio) {
 			struct md_rdev *rdev = conf->mirrors[d].rdev;
-			mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+			mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
 			bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
 				 max_sectors);
 			r10_bio->devs[i].bio = mbio;
@@ -1457,7 +1457,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 				smp_mb();
 				rdev = conf->mirrors[d].rdev;
 			}
-			mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+			mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
 			bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
 				 max_sectors);
 			r10_bio->devs[i].repl_bio = mbio;
@@ -2565,7 +2565,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
 		if (sectors > sect_to_write)
 			sectors = sect_to_write;
 		/* Write at 'sector' for 'sectors' */
-		wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+		wbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
 		bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors);
 		wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector);
 		wbio->bi_iter.bi_sector = wsector +
@@ -2641,8 +2641,7 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
 			   mdname(mddev),
 			   bdevname(rdev->bdev, b),
 			   (unsigned long long)r10_bio->sector);
-	bio = bio_clone_mddev(r10_bio->master_bio,
-			      GFP_NOIO, mddev);
+	bio = bio_clone_fast(r10_bio->master_bio, GFP_NOIO, mddev->bio_set);
 	bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors);
 	r10_bio->devs[slot].bio = bio;
 	r10_bio->devs[slot].rdev = rdev;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a1b41184e850..c16d09614cf6 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5056,9 +5056,9 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 		return 0;
 	}
 	/*
-	 * use bio_clone_mddev to make a copy of the bio
+	 * use bio_clone_fast to make a copy of the bio
 	 */
-	align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev);
+	align_bi = bio_clone_fast(raid_bio, GFP_NOIO, mddev->bio_set);
 	if (!align_bi)
 		return 0;
 	/*
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH v2 0/5] md: use bio_clone_fast()
From: Jens Axboe @ 2017-02-14 15:31 UTC (permalink / raw)
  To: Ming Lei, Shaohua Li, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig, NeilBrown
In-Reply-To: <1487086143-10255-1-git-send-email-tom.leiming@gmail.com>

On 02/14/2017 08:28 AM, Ming Lei wrote:
> Hi,
> 
> This patches replaces bio_clone() with bio_fast_clone() in
> bio_clone_mddev() because:
> 
> 1) bio_clone_mddev() is used in raid normal I/O and isn't in
> resync I/O path, and all the direct access to bvec table in
> raid happens on resync I/O only except for write behind of raid1.
> Write behind is treated specially, so the replacement is safe.
> 
> 2) for write behind, bio_clone() is kept, but this patchset
> introduces bio_clone_bioset_partial() to just clone one specific 
> bvecs range instead of whole table. Then write behind is improved
> too.

You can add my reviewed-by to the first patch. Shaohua, I'm fine
with you carrying this in the md tree, that would be the easiest
way forward.

-- 
Jens Axboe

^ permalink raw reply

* Re: [PATCH v2 5/5] md: fast clone bio in bio_clone_mddev()
From: Christoph Hellwig @ 2017-02-14 16:01 UTC (permalink / raw)
  To: Ming Lei
  Cc: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig, NeilBrown
In-Reply-To: <1487086143-10255-6-git-send-email-tom.leiming@gmail.com>

On Tue, Feb 14, 2017 at 11:29:03PM +0800, Ming Lei wrote:
> Firstly bio_clone_mddev() is used in raid normal I/O and isn't
> in resync I/O path.
> 
> Secondly all the direct access to bvec table in raid happens on
> resync I/O except for write behind of raid1, in which we still
> use bio_clone() for allocating new bvec table.
> 
> So this patch replaces bio_clone() with bio_clone_fast()
> in bio_clone_mddev().
> 
> Also kill bio_clone_mddev() and call bio_clone_fast() directly, as
> suggested by Christoph Hellwig.
> 
> Signed-off-by: Ming Lei <tom.leiming@gmail.com>

Looks fine,

Reviewed-by: Christoph Hellwig <hch@lst.de>

Btw, can you also tack on another patch to kill bio_alloc_mddev
to be consistent?

^ permalink raw reply

* Re: [PATCH v1 1/5] block: introduce bio_clone_bioset_partial()
From: Christoph Hellwig @ 2017-02-14 16:01 UTC (permalink / raw)
  To: Ming Lei
  Cc: Christoph Hellwig, Shaohua Li, Jens Axboe,
	Linux Kernel Mailing List,
	open list:SOFTWARE RAID (Multiple Disks) SUPPORT, linux-block,
	NeilBrown
In-Reply-To: <CACVXFVNN3Gtm4Ut5KCmx8REbMYnh3Y2+sAZPto7F40fvGyZ=zw@mail.gmail.com>

On Tue, Feb 14, 2017 at 09:04:26AM +0800, Ming Lei wrote:
> On Mon, Feb 13, 2017 at 9:46 PM, Christoph Hellwig <hch@infradead.org> wrote:
> > On Fri, Feb 10, 2017 at 06:56:13PM +0800, Ming Lei wrote:
> >> md still need bio clone(not the fast version) for behind write,
> >> and it is more efficient to use bio_clone_bioset_partial().
> >>
> >> The idea is simple and just copy the bvecs range specified from
> >> parameters.
> >
> > Given how few users bio_clone_bioset has I wonder if we shouldn't
> > simply add the two new arguments to it instead of adding another
> > indirection.
> 
> For md write-behind, looks we have to provide the two arguments,
> could you explain a bit how we can do that by adding another indirection?

I meant to just pass the additional arguments that 
bio_clone_bioset_partial has to bio_clone_bioset.

^ permalink raw reply

* Re: [PATCH v4 3/4] dmaengine: Add Broadcom SBA RAID driver
From: Dan Williams @ 2017-02-14 16:34 UTC (permalink / raw)
  To: Anup Patel
  Cc: Vinod Koul, Rob Herring, Mark Rutland, Herbert Xu,
	David S . Miller, Jassi Brar, Ray Jui, Scott Branden, Jon Mason,
	Rob Rice, BCM Kernel Feedback,
	dmaengine-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Device Tree,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r@public.gmane.org,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-crypto-u79uwXL29TY76Z2rM5mHXA, linux-raid
In-Reply-To: <1487055112-5185-4-git-send-email-anup.patel-dY08KVG/lbpWk0Htik3J/w@public.gmane.org>

On Mon, Feb 13, 2017 at 10:51 PM, Anup Patel <anup.patel-dY08KVG/lbpWk0Htik3J/w@public.gmane.org> wrote:
> The Broadcom stream buffer accelerator (SBA) provides offloading
> capabilities for RAID operations. This SBA offload engine is
> accessible via Broadcom SoC specific ring manager.
>
> This patch adds Broadcom SBA RAID driver which provides one
> DMA device with RAID capabilities using one or more Broadcom
> SoC specific ring manager channels. The SBA RAID driver in its
> current shape implements memcpy, xor, and pq operations.
>
> Signed-off-by: Anup Patel <anup.patel-dY08KVG/lbpWk0Htik3J/w@public.gmane.org>
> Reviewed-by: Ray Jui <ray.jui-dY08KVG/lbpWk0Htik3J/w@public.gmane.org>
> ---
>  drivers/dma/Kconfig        |   13 +
>  drivers/dma/Makefile       |    1 +
>  drivers/dma/bcm-sba-raid.c | 1694 ++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 1708 insertions(+)
>  create mode 100644 drivers/dma/bcm-sba-raid.c
>
> diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
> index 263495d..bf8fb84 100644
> --- a/drivers/dma/Kconfig
> +++ b/drivers/dma/Kconfig
> @@ -99,6 +99,19 @@ config AXI_DMAC
>           controller is often used in Analog Device's reference designs for FPGA
>           platforms.
>
> +config BCM_SBA_RAID
> +       tristate "Broadcom SBA RAID engine support"
> +       depends on (ARM64 && MAILBOX && RAID6_PQ) || COMPILE_TEST
> +       select DMA_ENGINE
> +       select DMA_ENGINE_RAID
> +       select ASYNC_TX_ENABLE_CHANNEL_SWITCH

I thought you agreed to drop this. Its usage is broken.
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v4 1/1] DM: inplace compressed DM target (fwd)
From: Ram Pai @ 2017-02-14 17:56 UTC (permalink / raw)
  To: Julia Lawall
  Cc: linux-doc, linux-kernel, dm-devel, linux-raid, agk, snitzer,
	corbet, shli, hbabu
In-Reply-To: <alpine.DEB.2.20.1702140707530.2033@hadrien>

On Tue, Feb 14, 2017 at 07:09:04AM +0100, Julia Lawall wrote:
> On line 1759, since ret is unsigned it will not be less than 0.

Thanks fixed it. Infact noticed that dm_icomp_io_range_compress() had missed a case where
it was supposed to return error; a negative number.  Fixed that aswell.

RP

> 
> julia
> 
> ---------- Forwarded message ----------
> Date: Tue, 14 Feb 2017 09:00:39 +0800
> From: kbuild test robot <fengguang.wu@intel.com>
> To: kbuild@01.org
> Cc: Julia Lawall <julia.lawall@lip6.fr>
> Subject: Re: [PATCH v4 1/1] DM: inplace compressed DM target
> 
> CC: kbuild-all@01.org
> In-Reply-To: <1487018545-5061-2-git-send-email-linuxram@us.ibm.com>
> 
> Hi Ram,
> 
> [auto build test WARNING on dm/for-next]
> [also build test WARNING on v4.10-rc8 next-20170213]
> [if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
> 
> url:    https://github.com/0day-ci/linux/commits/Ram-Pai/DM-inplace-compressed-DM-target/20170214-055727
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm.git for-next
> :::::: branch date: 3 hours ago
> :::::: commit date: 3 hours ago
> 
> >> drivers/md/dm-inplace-compress.c:1759:5-8: WARNING: Unsigned expression compared with zero: ret < 0
> 
> git remote add linux-review https://github.com/0day-ci/linux
> git remote update linux-review
> git checkout e7924efaaba5efdcd28f32efbb949ed1153c932c
> vim +1759 drivers/md/dm-inplace-compress.c
> 
> e7924efa Ram Pai 2017-02-13  1743   * @io : io range
> e7924efa Ram Pai 2017-02-13  1744   * @sector_start : the sector on backing storage to which the
> e7924efa Ram Pai 2017-02-13  1745   *	compressed data needs to be written.
> e7924efa Ram Pai 2017-02-13  1746   * @meta_start: the page index of the bits corresponding to
> e7924efa Ram Pai 2017-02-13  1747   * @meta_end  : start and end blocks.
> e7924efa Ram Pai 2017-02-13  1748   */
> e7924efa Ram Pai 2017-02-13  1749  static int dm_icomp_compress_write(struct dm_icomp_io_range *io,
> e7924efa Ram Pai 2017-02-13  1750  		sector_t sector_start, u64 *meta_start, u64 *meta_end)
> e7924efa Ram Pai 2017-02-13  1751  {
> e7924efa Ram Pai 2017-02-13  1752  	struct dm_icomp_req *req = io->req;
> e7924efa Ram Pai 2017-02-13  1753  	sector_t count = DMCP_BYTES_TO_SECTOR(io->decomp_len);
> e7924efa Ram Pai 2017-02-13  1754  	unsigned int comp_len, ret;
> e7924efa Ram Pai 2017-02-13  1755  	u64 page_index;
> e7924efa Ram Pai 2017-02-13  1756
> e7924efa Ram Pai 2017-02-13  1757  	/* comp_data must be able to accommadate a larger compress buffer */
> e7924efa Ram Pai 2017-02-13  1758  	ret = dm_icomp_io_range_compress(req->info, io, &comp_len);
> e7924efa Ram Pai 2017-02-13 @1759  	if (ret < 0) {
> e7924efa Ram Pai 2017-02-13  1760  		req->result = -EIO;
> e7924efa Ram Pai 2017-02-13  1761  		return -EIO;
> e7924efa Ram Pai 2017-02-13  1762  	}
> e7924efa Ram Pai 2017-02-13  1763  	WARN_ON(comp_len > io->comp_len);
> e7924efa Ram Pai 2017-02-13  1764
> e7924efa Ram Pai 2017-02-13  1765  	dm_icomp_get_req(req);
> e7924efa Ram Pai 2017-02-13  1766
> e7924efa Ram Pai 2017-02-13  1767  	io->io_req.bi_op = REQ_OP_WRITE;
> 
> ---
> 0-DAY kernel test infrastructure                Open Source Technology Center
> https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

-- 
Ram Pai


^ permalink raw reply

* Re: Enable the skip_copy feature will results in data integrity issue in raid5 degraded mode.
From: Shaohua Li @ 2017-02-14 19:48 UTC (permalink / raw)
  To: Chien Lee; +Cc: linux-raid, NeilBrown, owner-linux-raid
In-Reply-To: <CAByoP04pSvxrb_ubNXjaneHvBMPVrMyMj5QgmBSfXsCa4D=5AA@mail.gmail.com>

On Mon, Feb 13, 2017 at 05:07:45PM +0800, Chien Lee wrote:
> Hello,
> 
> 
> Recently we find a bug about skip_copy feature in raid5 degraded mode.
> In the beginning, we enable the skip_copy feature to speed up system’s
> write performance. But when the system has database read/write I/O
> continually in raid5 degraded mode, the Mongo DB will detect the
> checksum error and generate related debug log. The following is the
> testing detail.
> 
> 
> a. Enable skip_copy
>     --> Checksum error logs from Mongo DB
> 
> 2017-02-06T11:54:56.537+0800 E STORAGE  [conn7] WiredTiger (0)
> [1486353296:537114][52:0x7f98396a4700],
> file:collection-110-3235234017846331078.wt,  WT_CURSOR.next: read
> checksum error for 4096B block at offset 61440: calculated block
> checksum of 1363526237 doesn't match expected checksum of 2969711960
> 
> 
> b. Disable skip_copy
>     --> Mongo DB has no checksum error.
> 
> 
> We've pretty sure that it must be a bug by our repeated database I/O
> testing. When skip_copy feature is enabled, the raid5/raid6 always
> causes the mongo DB checksum error in degraded mode less than one
> hour. On the contrary, it will never cause this abnormal situation
> when the skip_copy feature is disabled. Besides, because the skip_copy
> feature only affects the write action instead of read action, I think
> it should be the write action in degraded mode while skip_copy feature
> is enabled cause this bug.
> 
> 
> Please kindly provide us some help or idea about the root cause and solution.

Thanks for the reporting, I'll look at it. In the meaning time, do you have a
quick way which I can use to reproduce the issue?

Thanks,
Shaohua

^ permalink raw reply

* Re: Enable the skip_copy feature will results in data integrity issue in raid5 degraded mode.
From: Shaohua Li @ 2017-02-15  0:36 UTC (permalink / raw)
  To: Chien Lee; +Cc: linux-raid, NeilBrown, owner-linux-raid
In-Reply-To: <20170214194851.3txkw3nrcxczejyv@kernel.org>

On Tue, Feb 14, 2017 at 11:48:51AM -0800, Shaohua Li wrote:
> On Mon, Feb 13, 2017 at 05:07:45PM +0800, Chien Lee wrote:
> > Hello,
> > 
> > 
> > Recently we find a bug about skip_copy feature in raid5 degraded mode.
> > In the beginning, we enable the skip_copy feature to speed up system’s
> > write performance. But when the system has database read/write I/O
> > continually in raid5 degraded mode, the Mongo DB will detect the
> > checksum error and generate related debug log. The following is the
> > testing detail.
> > 
> > 
> > a. Enable skip_copy
> >     --> Checksum error logs from Mongo DB
> > 
> > 2017-02-06T11:54:56.537+0800 E STORAGE  [conn7] WiredTiger (0)
> > [1486353296:537114][52:0x7f98396a4700],
> > file:collection-110-3235234017846331078.wt,  WT_CURSOR.next: read
> > checksum error for 4096B block at offset 61440: calculated block
> > checksum of 1363526237 doesn't match expected checksum of 2969711960
> > 
> > 
> > b. Disable skip_copy
> >     --> Mongo DB has no checksum error.
> > 
> > 
> > We've pretty sure that it must be a bug by our repeated database I/O
> > testing. When skip_copy feature is enabled, the raid5/raid6 always
> > causes the mongo DB checksum error in degraded mode less than one
> > hour. On the contrary, it will never cause this abnormal situation
> > when the skip_copy feature is disabled. Besides, because the skip_copy
> > feature only affects the write action instead of read action, I think
> > it should be the write action in degraded mode while skip_copy feature
> > is enabled cause this bug.
> > 
> > 
> > Please kindly provide us some help or idea about the root cause and solution.
> 
> Thanks for the reporting, I'll look at it. In the meaning time, do you have a
> quick way which I can use to reproduce the issue?

Can't find anything suspicious after checking a while. Can you describe the
setup/test in detail? like if there is sync running, IO error?

^ permalink raw reply

* Re: [PATCH v1 1/5] block: introduce bio_clone_bioset_partial()
From: Ming Lei @ 2017-02-15  2:26 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Shaohua Li, Jens Axboe, Linux Kernel Mailing List,
	open list:SOFTWARE RAID (Multiple Disks) SUPPORT, linux-block,
	NeilBrown
In-Reply-To: <20170214160151.GB32705@infradead.org>

On Wed, Feb 15, 2017 at 12:01 AM, Christoph Hellwig <hch@infradead.org> wrote:
> On Tue, Feb 14, 2017 at 09:04:26AM +0800, Ming Lei wrote:
>> On Mon, Feb 13, 2017 at 9:46 PM, Christoph Hellwig <hch@infradead.org> wrote:
>> > On Fri, Feb 10, 2017 at 06:56:13PM +0800, Ming Lei wrote:
>> >> md still need bio clone(not the fast version) for behind write,
>> >> and it is more efficient to use bio_clone_bioset_partial().
>> >>
>> >> The idea is simple and just copy the bvecs range specified from
>> >> parameters.
>> >
>> > Given how few users bio_clone_bioset has I wonder if we shouldn't
>> > simply add the two new arguments to it instead of adding another
>> > indirection.
>>
>> For md write-behind, looks we have to provide the two arguments,
>> could you explain a bit how we can do that by adding another indirection?
>
> I meant to just pass the additional arguments that
> bio_clone_bioset_partial has to bio_clone_bioset.

That may cause more changes(fs, ...) into this patchset, so I suggest to
do that in another patchset, especially after we confirmed current
users of bio_clone is absolutely necessary, and I will check if other bio_clone
can be converted to fast clone.


Thanks,
Ming Lei

^ permalink raw reply

* Re: [PATCH v4 3/4] dmaengine: Add Broadcom SBA RAID driver
From: Anup Patel @ 2017-02-15  6:25 UTC (permalink / raw)
  To: Dan Williams
  Cc: Vinod Koul, Rob Herring, Mark Rutland, Herbert Xu,
	David S . Miller, Jassi Brar, Ray Jui, Scott Branden, Jon Mason,
	Rob Rice, BCM Kernel Feedback,
	dmaengine-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Device Tree,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r@public.gmane.org,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-crypto-u79uwXL29TY76Z2rM5mHXA, linux-raid
In-Reply-To: <CAPcyv4g5PU1EQWq-SCbt0QB=vSaMyafqa-ThJifDxEoQha-6Hw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

On Tue, Feb 14, 2017 at 10:04 PM, Dan Williams <dan.j.williams-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:
> On Mon, Feb 13, 2017 at 10:51 PM, Anup Patel <anup.patel-dY08KVG/lbpWk0Htik3J/w@public.gmane.org> wrote:
>> The Broadcom stream buffer accelerator (SBA) provides offloading
>> capabilities for RAID operations. This SBA offload engine is
>> accessible via Broadcom SoC specific ring manager.
>>
>> This patch adds Broadcom SBA RAID driver which provides one
>> DMA device with RAID capabilities using one or more Broadcom
>> SoC specific ring manager channels. The SBA RAID driver in its
>> current shape implements memcpy, xor, and pq operations.
>>
>> Signed-off-by: Anup Patel <anup.patel-dY08KVG/lbpWk0Htik3J/w@public.gmane.org>
>> Reviewed-by: Ray Jui <ray.jui-dY08KVG/lbpWk0Htik3J/w@public.gmane.org>
>> ---
>>  drivers/dma/Kconfig        |   13 +
>>  drivers/dma/Makefile       |    1 +
>>  drivers/dma/bcm-sba-raid.c | 1694 ++++++++++++++++++++++++++++++++++++++++++++
>>  3 files changed, 1708 insertions(+)
>>  create mode 100644 drivers/dma/bcm-sba-raid.c
>>
>> diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
>> index 263495d..bf8fb84 100644
>> --- a/drivers/dma/Kconfig
>> +++ b/drivers/dma/Kconfig
>> @@ -99,6 +99,19 @@ config AXI_DMAC
>>           controller is often used in Analog Device's reference designs for FPGA
>>           platforms.
>>
>> +config BCM_SBA_RAID
>> +       tristate "Broadcom SBA RAID engine support"
>> +       depends on (ARM64 && MAILBOX && RAID6_PQ) || COMPILE_TEST
>> +       select DMA_ENGINE
>> +       select DMA_ENGINE_RAID
>> +       select ASYNC_TX_ENABLE_CHANNEL_SWITCH
>
> I thought you agreed to drop this. Its usage is broken.

If ASYNC_TX_ENABLE_CHANNEL_SWITCH is not selected
then async_dma_find_channel() will only try to find channel
with DMA_ASYNC_TX capability.

The DMA_ASYNC_TX capability is set by
dma_async_device_register() when all Async Tx
capabilities are supported by a DMA devices namely
DMA_INTERRUPT, DMA_MEMCPY, DMA_XOR,
DMA_XOR_VAL, DMA_PQ, and DMA_PQ_VAL.

We only support DMA_MEMCPY, DMA_XOR, and
DMA_PQ capabilities in BCM-SBA-RAID driver so
DMA_ASYNC_TX capability is never set for the
DMA device registered by BCM-SBA-RAID driver.

Due to above, if ASYNC_TX_ENABLE_CHANNEL_SWITCH
is not selected then Async Tx APIs fail to find DMA
channel provided by BCM-SBA-RAID hence the
option ASYNC_TX_ENABLE_CHANNEL_SWITCH is
required for BCM-SBA-RAID.

The DMA mappings are violated by channel switching
only if we switch form DMA channel A to DMA channel
B and both these DMA channels have different underlying
"struct device". In most of the cases DMA mappings
are not violated because DMA channels having
Async Tx capabilities are provided using same
underlying "struct device".

Regards,
Anup
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v4 3/4] dmaengine: Add Broadcom SBA RAID driver
From: Dan Williams @ 2017-02-15  6:43 UTC (permalink / raw)
  To: Anup Patel
  Cc: Vinod Koul, Rob Herring, Mark Rutland, Herbert Xu,
	David S . Miller, Jassi Brar, Ray Jui, Scott Branden, Jon Mason,
	Rob Rice, BCM Kernel Feedback,
	dmaengine-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Device Tree,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r@public.gmane.org,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-crypto-u79uwXL29TY76Z2rM5mHXA, linux-raid
In-Reply-To: <CAALAos_=TN_ZfeJbRQEwuc+t5zaxKVgrgEMNPQQu8K8Q8+8F6A-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

On Tue, Feb 14, 2017 at 10:25 PM, Anup Patel <anup.patel-dY08KVG/lbpWk0Htik3J/w@public.gmane.org> wrote:
> On Tue, Feb 14, 2017 at 10:04 PM, Dan Williams <dan.j.williams-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:
>> On Mon, Feb 13, 2017 at 10:51 PM, Anup Patel <anup.patel-dY08KVG/lbpWk0Htik3J/w@public.gmane.org> wrote:
>>> The Broadcom stream buffer accelerator (SBA) provides offloading
>>> capabilities for RAID operations. This SBA offload engine is
>>> accessible via Broadcom SoC specific ring manager.
>>>
>>> This patch adds Broadcom SBA RAID driver which provides one
>>> DMA device with RAID capabilities using one or more Broadcom
>>> SoC specific ring manager channels. The SBA RAID driver in its
>>> current shape implements memcpy, xor, and pq operations.
>>>
>>> Signed-off-by: Anup Patel <anup.patel-dY08KVG/lbpWk0Htik3J/w@public.gmane.org>
>>> Reviewed-by: Ray Jui <ray.jui-dY08KVG/lbpWk0Htik3J/w@public.gmane.org>
>>> ---
>>>  drivers/dma/Kconfig        |   13 +
>>>  drivers/dma/Makefile       |    1 +
>>>  drivers/dma/bcm-sba-raid.c | 1694 ++++++++++++++++++++++++++++++++++++++++++++
>>>  3 files changed, 1708 insertions(+)
>>>  create mode 100644 drivers/dma/bcm-sba-raid.c
>>>
>>> diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
>>> index 263495d..bf8fb84 100644
>>> --- a/drivers/dma/Kconfig
>>> +++ b/drivers/dma/Kconfig
>>> @@ -99,6 +99,19 @@ config AXI_DMAC
>>>           controller is often used in Analog Device's reference designs for FPGA
>>>           platforms.
>>>
>>> +config BCM_SBA_RAID
>>> +       tristate "Broadcom SBA RAID engine support"
>>> +       depends on (ARM64 && MAILBOX && RAID6_PQ) || COMPILE_TEST
>>> +       select DMA_ENGINE
>>> +       select DMA_ENGINE_RAID
>>> +       select ASYNC_TX_ENABLE_CHANNEL_SWITCH
>>
>> I thought you agreed to drop this. Its usage is broken.
>
> If ASYNC_TX_ENABLE_CHANNEL_SWITCH is not selected
> then async_dma_find_channel() will only try to find channel
> with DMA_ASYNC_TX capability.
>
> The DMA_ASYNC_TX capability is set by
> dma_async_device_register() when all Async Tx
> capabilities are supported by a DMA devices namely
> DMA_INTERRUPT, DMA_MEMCPY, DMA_XOR,
> DMA_XOR_VAL, DMA_PQ, and DMA_PQ_VAL.
>
> We only support DMA_MEMCPY, DMA_XOR, and
> DMA_PQ capabilities in BCM-SBA-RAID driver so
> DMA_ASYNC_TX capability is never set for the
> DMA device registered by BCM-SBA-RAID driver.
>
> Due to above, if ASYNC_TX_ENABLE_CHANNEL_SWITCH
> is not selected then Async Tx APIs fail to find DMA
> channel provided by BCM-SBA-RAID hence the
> option ASYNC_TX_ENABLE_CHANNEL_SWITCH is
> required for BCM-SBA-RAID.
>
> The DMA mappings are violated by channel switching
> only if we switch form DMA channel A to DMA channel
> B and both these DMA channels have different underlying
> "struct device". In most of the cases DMA mappings
> are not violated because DMA channels having
> Async Tx capabilities are provided using same
> underlying "struct device".

No, fix the infrastructure. Do not put local hack in your driver for
this global problem [1].

[1]: https://lwn.net/Articles/443531/
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v4 3/4] dmaengine: Add Broadcom SBA RAID driver
From: Anup Patel @ 2017-02-15  7:03 UTC (permalink / raw)
  To: Dan Williams
  Cc: Vinod Koul, Rob Herring, Mark Rutland, Herbert Xu,
	David S . Miller, Jassi Brar, Ray Jui, Scott Branden, Jon Mason,
	Rob Rice, BCM Kernel Feedback, dmaengine@vger.kernel.org,
	Device Tree, linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org, linux-crypto, linux-raid
In-Reply-To: <CAPcyv4jckb=sbocr3W43NZ2YyUVLbDdhbCAhLfgQwh-bRuvjYQ@mail.gmail.com>

On Wed, Feb 15, 2017 at 12:13 PM, Dan Williams <dan.j.williams@intel.com> wrote:
> On Tue, Feb 14, 2017 at 10:25 PM, Anup Patel <anup.patel@broadcom.com> wrote:
>> On Tue, Feb 14, 2017 at 10:04 PM, Dan Williams <dan.j.williams@intel.com> wrote:
>>> On Mon, Feb 13, 2017 at 10:51 PM, Anup Patel <anup.patel@broadcom.com> wrote:
>>>> The Broadcom stream buffer accelerator (SBA) provides offloading
>>>> capabilities for RAID operations. This SBA offload engine is
>>>> accessible via Broadcom SoC specific ring manager.
>>>>
>>>> This patch adds Broadcom SBA RAID driver which provides one
>>>> DMA device with RAID capabilities using one or more Broadcom
>>>> SoC specific ring manager channels. The SBA RAID driver in its
>>>> current shape implements memcpy, xor, and pq operations.
>>>>
>>>> Signed-off-by: Anup Patel <anup.patel@broadcom.com>
>>>> Reviewed-by: Ray Jui <ray.jui@broadcom.com>
>>>> ---
>>>>  drivers/dma/Kconfig        |   13 +
>>>>  drivers/dma/Makefile       |    1 +
>>>>  drivers/dma/bcm-sba-raid.c | 1694 ++++++++++++++++++++++++++++++++++++++++++++
>>>>  3 files changed, 1708 insertions(+)
>>>>  create mode 100644 drivers/dma/bcm-sba-raid.c
>>>>
>>>> diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
>>>> index 263495d..bf8fb84 100644
>>>> --- a/drivers/dma/Kconfig
>>>> +++ b/drivers/dma/Kconfig
>>>> @@ -99,6 +99,19 @@ config AXI_DMAC
>>>>           controller is often used in Analog Device's reference designs for FPGA
>>>>           platforms.
>>>>
>>>> +config BCM_SBA_RAID
>>>> +       tristate "Broadcom SBA RAID engine support"
>>>> +       depends on (ARM64 && MAILBOX && RAID6_PQ) || COMPILE_TEST
>>>> +       select DMA_ENGINE
>>>> +       select DMA_ENGINE_RAID
>>>> +       select ASYNC_TX_ENABLE_CHANNEL_SWITCH
>>>
>>> I thought you agreed to drop this. Its usage is broken.
>>
>> If ASYNC_TX_ENABLE_CHANNEL_SWITCH is not selected
>> then async_dma_find_channel() will only try to find channel
>> with DMA_ASYNC_TX capability.
>>
>> The DMA_ASYNC_TX capability is set by
>> dma_async_device_register() when all Async Tx
>> capabilities are supported by a DMA devices namely
>> DMA_INTERRUPT, DMA_MEMCPY, DMA_XOR,
>> DMA_XOR_VAL, DMA_PQ, and DMA_PQ_VAL.
>>
>> We only support DMA_MEMCPY, DMA_XOR, and
>> DMA_PQ capabilities in BCM-SBA-RAID driver so
>> DMA_ASYNC_TX capability is never set for the
>> DMA device registered by BCM-SBA-RAID driver.
>>
>> Due to above, if ASYNC_TX_ENABLE_CHANNEL_SWITCH
>> is not selected then Async Tx APIs fail to find DMA
>> channel provided by BCM-SBA-RAID hence the
>> option ASYNC_TX_ENABLE_CHANNEL_SWITCH is
>> required for BCM-SBA-RAID.
>>
>> The DMA mappings are violated by channel switching
>> only if we switch form DMA channel A to DMA channel
>> B and both these DMA channels have different underlying
>> "struct device". In most of the cases DMA mappings
>> are not violated because DMA channels having
>> Async Tx capabilities are provided using same
>> underlying "struct device".
>
> No, fix the infrastructure. Do not put local hack in your driver for
> this global problem [1].

There is no hack in the driver. We need
ASYNC_TX_ENABLE_CHANNEL_SWITCH
based on current state of dmaengine framework.

The framework should be fixed as separate patchset.

We have other RAID drivers such as xgene-dma and
mv_xor_v2 who also require
ASYNC_TX_ENABLE_CHANNEL_SWITCH due
to same reason.

Fixing the framework and improving framework is
a ongoing process. I don't see why that should
stop this patchset.

Regards,
Anup

^ permalink raw reply

* Re: [PATCH v4 3/4] dmaengine: Add Broadcom SBA RAID driver
From: Dan Williams @ 2017-02-15  7:25 UTC (permalink / raw)
  To: Anup Patel
  Cc: Vinod Koul, Rob Herring, Mark Rutland, Herbert Xu,
	David S . Miller, Jassi Brar, Ray Jui, Scott Branden, Jon Mason,
	Rob Rice, BCM Kernel Feedback,
	dmaengine-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Device Tree,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r@public.gmane.org,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-crypto-u79uwXL29TY76Z2rM5mHXA, linux-raid
In-Reply-To: <CAALAos-txDCs3QZ4HGBNicOD8t49NPT6E8RpjVMccMn1D-UTgQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

On Tue, Feb 14, 2017 at 11:03 PM, Anup Patel <anup.patel-dY08KVG/lbpWk0Htik3J/w@public.gmane.org> wrote:
> On Wed, Feb 15, 2017 at 12:13 PM, Dan Williams <dan.j.williams-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:
>> On Tue, Feb 14, 2017 at 10:25 PM, Anup Patel <anup.patel-dY08KVG/lbpWk0Htik3J/w@public.gmane.org> wrote:
>>> On Tue, Feb 14, 2017 at 10:04 PM, Dan Williams <dan.j.williams-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:
>>>> On Mon, Feb 13, 2017 at 10:51 PM, Anup Patel <anup.patel-dY08KVG/lbpWk0Htik3J/w@public.gmane.org> wrote:
>>>>> The Broadcom stream buffer accelerator (SBA) provides offloading
>>>>> capabilities for RAID operations. This SBA offload engine is
>>>>> accessible via Broadcom SoC specific ring manager.
>>>>>
>>>>> This patch adds Broadcom SBA RAID driver which provides one
>>>>> DMA device with RAID capabilities using one or more Broadcom
>>>>> SoC specific ring manager channels. The SBA RAID driver in its
>>>>> current shape implements memcpy, xor, and pq operations.
>>>>>
>>>>> Signed-off-by: Anup Patel <anup.patel-dY08KVG/lbpWk0Htik3J/w@public.gmane.org>
>>>>> Reviewed-by: Ray Jui <ray.jui-dY08KVG/lbpWk0Htik3J/w@public.gmane.org>
>>>>> ---
>>>>>  drivers/dma/Kconfig        |   13 +
>>>>>  drivers/dma/Makefile       |    1 +
>>>>>  drivers/dma/bcm-sba-raid.c | 1694 ++++++++++++++++++++++++++++++++++++++++++++
>>>>>  3 files changed, 1708 insertions(+)
>>>>>  create mode 100644 drivers/dma/bcm-sba-raid.c
>>>>>
>>>>> diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
>>>>> index 263495d..bf8fb84 100644
>>>>> --- a/drivers/dma/Kconfig
>>>>> +++ b/drivers/dma/Kconfig
>>>>> @@ -99,6 +99,19 @@ config AXI_DMAC
>>>>>           controller is often used in Analog Device's reference designs for FPGA
>>>>>           platforms.
>>>>>
>>>>> +config BCM_SBA_RAID
>>>>> +       tristate "Broadcom SBA RAID engine support"
>>>>> +       depends on (ARM64 && MAILBOX && RAID6_PQ) || COMPILE_TEST
>>>>> +       select DMA_ENGINE
>>>>> +       select DMA_ENGINE_RAID
>>>>> +       select ASYNC_TX_ENABLE_CHANNEL_SWITCH
>>>>
>>>> I thought you agreed to drop this. Its usage is broken.
>>>
>>> If ASYNC_TX_ENABLE_CHANNEL_SWITCH is not selected
>>> then async_dma_find_channel() will only try to find channel
>>> with DMA_ASYNC_TX capability.
>>>
>>> The DMA_ASYNC_TX capability is set by
>>> dma_async_device_register() when all Async Tx
>>> capabilities are supported by a DMA devices namely
>>> DMA_INTERRUPT, DMA_MEMCPY, DMA_XOR,
>>> DMA_XOR_VAL, DMA_PQ, and DMA_PQ_VAL.
>>>
>>> We only support DMA_MEMCPY, DMA_XOR, and
>>> DMA_PQ capabilities in BCM-SBA-RAID driver so
>>> DMA_ASYNC_TX capability is never set for the
>>> DMA device registered by BCM-SBA-RAID driver.
>>>
>>> Due to above, if ASYNC_TX_ENABLE_CHANNEL_SWITCH
>>> is not selected then Async Tx APIs fail to find DMA
>>> channel provided by BCM-SBA-RAID hence the
>>> option ASYNC_TX_ENABLE_CHANNEL_SWITCH is
>>> required for BCM-SBA-RAID.
>>>
>>> The DMA mappings are violated by channel switching
>>> only if we switch form DMA channel A to DMA channel
>>> B and both these DMA channels have different underlying
>>> "struct device". In most of the cases DMA mappings
>>> are not violated because DMA channels having
>>> Async Tx capabilities are provided using same
>>> underlying "struct device".
>>
>> No, fix the infrastructure. Do not put local hack in your driver for
>> this global problem [1].
>
> There is no hack in the driver. We need
> ASYNC_TX_ENABLE_CHANNEL_SWITCH
> based on current state of dmaengine framework.
>
> The framework should be fixed as separate patchset.
>
> We have other RAID drivers such as xgene-dma and
> mv_xor_v2 who also require
> ASYNC_TX_ENABLE_CHANNEL_SWITCH due
> to same reason.
>
> Fixing the framework and improving framework is
> a ongoing process. I don't see why that should
> stop this patchset.
>

Because this driver is turning on a dangerous compile time option and
is not using the functionality. If this silicon IP block appears in
another product in the future paired with another DMA engine then the
assumptions about a safe/single dma-device is violated.

The realization of how async_tx was breaking DMA mapping api
assumptions came after some of these dma-drivers were added to the
kernel. We should stop making the problem worse.

I should have submitted a patch like the below at the time we
discovered this problem, but unfortunately it languished when I
stopped maintaining the iop-adma and ioat drivers.

diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 263495d0adbd..6b30eb9ad125 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -35,6 +35,7 @@ comment "DMA Devices"

 #core
 config ASYNC_TX_ENABLE_CHANNEL_SWITCH
+       depends on BROKEN
        bool

 config ARCH_HAS_ASYNC_TX_FIND_CHANNEL
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* Re: [PATCH v4 3/4] dmaengine: Add Broadcom SBA RAID driver
From: Anup Patel @ 2017-02-15  8:33 UTC (permalink / raw)
  To: Dan Williams
  Cc: Vinod Koul, Rob Herring, Mark Rutland, Herbert Xu,
	David S . Miller, Jassi Brar, Ray Jui, Scott Branden, Jon Mason,
	Rob Rice, BCM Kernel Feedback, dmaengine@vger.kernel.org,
	Device Tree, linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org, linux-crypto, linux-raid
In-Reply-To: <CAPcyv4h83vgeB82KmEEFV196MsWdteRqypY5_tcSyaMM_QwYMA@mail.gmail.com>

On Wed, Feb 15, 2017 at 12:55 PM, Dan Williams <dan.j.williams@intel.com> wrote:
> On Tue, Feb 14, 2017 at 11:03 PM, Anup Patel <anup.patel@broadcom.com> wrote:
>> On Wed, Feb 15, 2017 at 12:13 PM, Dan Williams <dan.j.williams@intel.com> wrote:
>>> On Tue, Feb 14, 2017 at 10:25 PM, Anup Patel <anup.patel@broadcom.com> wrote:
>>>> On Tue, Feb 14, 2017 at 10:04 PM, Dan Williams <dan.j.williams@intel.com> wrote:
>>>>> On Mon, Feb 13, 2017 at 10:51 PM, Anup Patel <anup.patel@broadcom.com> wrote:
>>>>>> The Broadcom stream buffer accelerator (SBA) provides offloading
>>>>>> capabilities for RAID operations. This SBA offload engine is
>>>>>> accessible via Broadcom SoC specific ring manager.
>>>>>>
>>>>>> This patch adds Broadcom SBA RAID driver which provides one
>>>>>> DMA device with RAID capabilities using one or more Broadcom
>>>>>> SoC specific ring manager channels. The SBA RAID driver in its
>>>>>> current shape implements memcpy, xor, and pq operations.
>>>>>>
>>>>>> Signed-off-by: Anup Patel <anup.patel@broadcom.com>
>>>>>> Reviewed-by: Ray Jui <ray.jui@broadcom.com>
>>>>>> ---
>>>>>>  drivers/dma/Kconfig        |   13 +
>>>>>>  drivers/dma/Makefile       |    1 +
>>>>>>  drivers/dma/bcm-sba-raid.c | 1694 ++++++++++++++++++++++++++++++++++++++++++++
>>>>>>  3 files changed, 1708 insertions(+)
>>>>>>  create mode 100644 drivers/dma/bcm-sba-raid.c
>>>>>>
>>>>>> diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
>>>>>> index 263495d..bf8fb84 100644
>>>>>> --- a/drivers/dma/Kconfig
>>>>>> +++ b/drivers/dma/Kconfig
>>>>>> @@ -99,6 +99,19 @@ config AXI_DMAC
>>>>>>           controller is often used in Analog Device's reference designs for FPGA
>>>>>>           platforms.
>>>>>>
>>>>>> +config BCM_SBA_RAID
>>>>>> +       tristate "Broadcom SBA RAID engine support"
>>>>>> +       depends on (ARM64 && MAILBOX && RAID6_PQ) || COMPILE_TEST
>>>>>> +       select DMA_ENGINE
>>>>>> +       select DMA_ENGINE_RAID
>>>>>> +       select ASYNC_TX_ENABLE_CHANNEL_SWITCH
>>>>>
>>>>> I thought you agreed to drop this. Its usage is broken.
>>>>
>>>> If ASYNC_TX_ENABLE_CHANNEL_SWITCH is not selected
>>>> then async_dma_find_channel() will only try to find channel
>>>> with DMA_ASYNC_TX capability.
>>>>
>>>> The DMA_ASYNC_TX capability is set by
>>>> dma_async_device_register() when all Async Tx
>>>> capabilities are supported by a DMA devices namely
>>>> DMA_INTERRUPT, DMA_MEMCPY, DMA_XOR,
>>>> DMA_XOR_VAL, DMA_PQ, and DMA_PQ_VAL.
>>>>
>>>> We only support DMA_MEMCPY, DMA_XOR, and
>>>> DMA_PQ capabilities in BCM-SBA-RAID driver so
>>>> DMA_ASYNC_TX capability is never set for the
>>>> DMA device registered by BCM-SBA-RAID driver.
>>>>
>>>> Due to above, if ASYNC_TX_ENABLE_CHANNEL_SWITCH
>>>> is not selected then Async Tx APIs fail to find DMA
>>>> channel provided by BCM-SBA-RAID hence the
>>>> option ASYNC_TX_ENABLE_CHANNEL_SWITCH is
>>>> required for BCM-SBA-RAID.
>>>>
>>>> The DMA mappings are violated by channel switching
>>>> only if we switch form DMA channel A to DMA channel
>>>> B and both these DMA channels have different underlying
>>>> "struct device". In most of the cases DMA mappings
>>>> are not violated because DMA channels having
>>>> Async Tx capabilities are provided using same
>>>> underlying "struct device".
>>>
>>> No, fix the infrastructure. Do not put local hack in your driver for
>>> this global problem [1].
>>
>> There is no hack in the driver. We need
>> ASYNC_TX_ENABLE_CHANNEL_SWITCH
>> based on current state of dmaengine framework.
>>
>> The framework should be fixed as separate patchset.
>>
>> We have other RAID drivers such as xgene-dma and
>> mv_xor_v2 who also require
>> ASYNC_TX_ENABLE_CHANNEL_SWITCH due
>> to same reason.
>>
>> Fixing the framework and improving framework is
>> a ongoing process. I don't see why that should
>> stop this patchset.
>>
>
> Because this driver is turning on a dangerous compile time option and
> is not using the functionality. If this silicon IP block appears in
> another product in the future paired with another DMA engine then the
> assumptions about a safe/single dma-device is violated.
>
> The realization of how async_tx was breaking DMA mapping api
> assumptions came after some of these dma-drivers were added to the
> kernel. We should stop making the problem worse.
>
> I should have submitted a patch like the below at the time we
> discovered this problem, but unfortunately it languished when I
> stopped maintaining the iop-adma and ioat drivers.
>
> diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
> index 263495d0adbd..6b30eb9ad125 100644
> --- a/drivers/dma/Kconfig
> +++ b/drivers/dma/Kconfig
> @@ -35,6 +35,7 @@ comment "DMA Devices"
>
>  #core
>  config ASYNC_TX_ENABLE_CHANNEL_SWITCH
> +       depends on BROKEN
>         bool
>
>  config ARCH_HAS_ASYNC_TX_FIND_CHANNEL

Instead of selecting
ASYNC_TX_ENABLE_CHANNEL_SWITCH,
we can select the following in BCM_SBA_RAID config
option:
1. ASYNC_TX_DISABLE_XOR_VAL
2. ASYNC_TX_DISABLE_PQ_VAL

This will satisfy the needs of
dma_async_device_register() when
ASYNC_TX_ENABLE_CHANNEL_SWITCH is
not selected.

Will this be acceptable ??

Regards,
Anup

^ permalink raw reply

* [PATCH V3 1/2] RAID1: a new I/O barrier implementation to remove resync window
From: colyli @ 2017-02-15 16:35 UTC (permalink / raw)
  To: linux-raid
  Cc: Coly Li, Shaohua Li, Neil Brown, Johannes Thumshirn,
	Guoqing Jiang

'Commit 79ef3a8aa1cb ("raid1: Rewrite the implementation of iobarrier.")'
introduces a sliding resync window for raid1 I/O barrier, this idea limits
I/O barriers to happen only inside a slidingresync window, for regular
I/Os out of this resync window they don't need to wait for barrier any
more. On large raid1 device, it helps a lot to improve parallel writing
I/O throughput when there are background resync I/Os performing at
same time.

The idea of sliding resync widow is awesome, but code complexity is a
challenge. Sliding resync window requires several veriables to work
collectively, this is complexed and very hard to make it work correctly.
Just grep "Fixes: 79ef3a8aa1" in kernel git log, there are 8 more patches
to fix the original resync window patch. This is not the end, any further
related modification may easily introduce more regreassion.

Therefore I decide to implement a much simpler raid1 I/O barrier, by
removing resync window code, I believe life will be much easier.

The brief idea of the simpler barrier is,
 - Do not maintain a logbal unique resync window
 - Use multiple hash buckets to reduce I/O barrier conflictions, regular
   I/O only has to wait for a resync I/O when both them have same barrier
   bucket index, vice versa.
 - I/O barrier can be recuded to an acceptable number if there are enought
   barrier buckets

Here I explain how the barrier buckets are designed,
 - BARRIER_UNIT_SECTOR_SIZE
   The whole LBA address space of a raid1 device is divided into multiple
   barrier units, by the size of BARRIER_UNIT_SECTOR_SIZE.
   Bio request won't go across border of barrier unit size, that means
   maximum bio size is BARRIER_UNIT_SECTOR_SIZE<<9 (64MB) in bytes.
   For random I/O 64MB is large enough for both read and write requests,
   for sequential I/O considering underlying block layer may merge them
   into larger requests, 64MB is still good enough.
   Neil also points out that for resync operation, "we want the resync to
   move from region to region fairly quickly so that the slowness caused
   by having to synchronize with the resync is averaged out over a fairly
   small time frame". For full speed resync, 64MB should take less then 1
   second. When resync is competing with other I/O, it could take up a few
   minutes. Therefore 64MB size is fairly good range for resync.

 - BARRIER_BUCKETS_NR
   There are BARRIER_BUCKETS_NR buckets in total, which is defined by,
        #define BARRIER_BUCKETS_NR_BITS   (PAGE_SHIFT - 2)
        #define BARRIER_BUCKETS_NR        (1<<BARRIER_BUCKETS_NR_BITS)
   this patch makes the bellowed members of struct r1conf from integer
   to array of integers,
        -       int                     nr_pending;
        -       int                     nr_waiting;
        -       int                     nr_queued;
        -       int                     barrier;
        +       int                     *nr_pending;
        +       int                     *nr_waiting;
        +       int                     *nr_queued;
        +       int                     *barrier;
   number of the array elements is defined as BARRIER_BUCKETS_NR. For 4KB
   kernel space page size, (PAGE_SHIFT - 2) indecates there are 1024 I/O
   barrier buckets, and each array of integers occupies single memory page.
   1024 means for a request which is smaller than the I/O barrier unit size
   has ~0.1% chance to wait for resync to pause, which is quite a small
   enough fraction. Also requesting single memory page is more friendly to
   kernel page allocator than larger memory size.

 - I/O barrier bucket is indexed by bio start sector
   If multiple I/O requests hit different I/O barrier units, they only need
   to compete I/O barrier with other I/Os which hit the same I/O barrier
   bucket index with each other. The index of a barrier bucket which a
   bio should look for is calculated by sector_to_idx() which is defined
   in raid1.h as an inline function,
        static inline int sector_to_idx(sector_t sector)
        {
                return hash_long(sector >> BARRIER_UNIT_SECTOR_BITS,
                                BARRIER_BUCKETS_NR_BITS);
        }
   Here sector_nr is the start sector number of a bio.

 - Single bio won't go across boundary of a I/O barrier unit
   If a request goes across boundary of barrier unit, it will be split. A
   bio may be split in raid1_make_request() or raid1_sync_request(), if
   sectors returned by align_to_barrier_unit_end() is small than original
   bio size.

Comparing to single sliding resync window,
 - Currently resync I/O grows linearly, therefore regular and resync I/O
   will have confliction within a single barrier units. So the I/O
   behavior is similar to single sliding resync window.
 - But a barrier unit bucket is shared by all barrier units with identical
   barrier uinit index, the probability of confliction might be higher
   than single sliding resync window, in condition that writing I/Os
   always hit barrier units which have identical barrier bucket indexs with
   the resync I/Os. This is a very rare condition in real I/O work loads,
   I cannot imagine how it could happen in practice.
 - Therefore we can achieve a good enough low confliction rate with much
   simpler barrier algorithm and implementation.

There are two changes should be noticed,
 - In raid1d(), I change the code to decrease conf->nr_pending[idx] into
   single loop, it looks like this,
        spin_lock_irqsave(&conf->device_lock, flags);
        conf->nr_queued[idx]--;
        spin_unlock_irqrestore(&conf->device_lock, flags);
   This change generates more spin lock operations, but in next patch of
   this patch set, it will be replaced by a single line code,
        atomic_dec(&conf->nr_queueud[idx]);
   So we don't need to worry about spin lock cost here.
 - Mainline raid1 code split original raid1_make_request() into
   raid1_read_request() and raid1_write_request(). If the original bio
   goes across an I/O barrier unit size, this bio will be split before
   calling raid1_read_request() or raid1_write_request(),  this change
   the code logic more simple and clear.
 - In this patch wait_barrier() is moved from raid1_make_request() to
   raid1_write_request(). In raid_read_request(), original wait_barrier()
   is replaced by raid1_read_request().
   The differnece is wait_read_barrier() only waits if array is frozen,
   using different barrier function in different code path makes the code
   more clean and easy to read.
Changelog
V3:
- Rebase the patch against latest upstream kernel code.
- Many fixes by review comments from Neil,
  - Back to use pointers to replace arraries in struct r1conf
  - Remove total_barriers from struct r1conf
  - Add more patch comments to explain how/why the values of
    BARRIER_UNIT_SECTOR_SIZE and BARRIER_BUCKETS_NR are decided.
  - Use get_unqueued_pending() to replace get_all_pendings() and
    get_all_queued()
  - Increase bucket number from 512 to 1024
- Change code comments format by review from Shaohua.
V2:
- Use bio_split() to split the orignal bio if it goes across barrier unit
  bounday, to make the code more simple, by suggestion from Shaohua and
  Neil.
- Use hash_long() to replace original linear hash, to avoid a possible
  confilict between resync I/O and sequential write I/O, by suggestion from
  Shaohua.
- Add conf->total_barriers to record barrier depth, which is used to
  control number of parallel sync I/O barriers, by suggestion from Shaohua.
- In V1 patch the bellowed barrier buckets related members in r1conf are
  allocated in memory page. To make the code more simple, V2 patch moves
  the memory space into struct r1conf, like this,
        -       int                     nr_pending;
        -       int                     nr_waiting;
        -       int                     nr_queued;
        -       int                     barrier;
        +       int                     nr_pending[BARRIER_BUCKETS_NR];
        +       int                     nr_waiting[BARRIER_BUCKETS_NR];
        +       int                     nr_queued[BARRIER_BUCKETS_NR];
        +       int                     barrier[BARRIER_BUCKETS_NR];
  This change is by the suggestion from Shaohua.
- Remove some inrelavent code comments, by suggestion from Guoqing.
- Add a missing wait_barrier() before jumping to retry_write, in
  raid1_make_write_request().
V1:
- Original RFC patch for comments

Signed-off-by: Coly Li <colyli@suse.de>
Cc: Shaohua Li <shli@fb.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Johannes Thumshirn <jthumshirn@suse.de>
Cc: Guoqing Jiang <gqjiang@suse.com>
---
 drivers/md/raid1.c | 447 ++++++++++++++++++++++++++++++-----------------------
 drivers/md/raid1.h |  42 +++--
 2 files changed, 275 insertions(+), 214 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 7b0f647..4234494 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -71,9 +71,8 @@
  */
 static int max_queued_requests = 1024;
 
-static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
-			  sector_t bi_sector);
-static void lower_barrier(struct r1conf *conf);
+static void allow_barrier(struct r1conf *conf, sector_t sector_nr);
+static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
 
 #define raid1_log(md, fmt, args...)				\
 	do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0)
@@ -100,7 +99,6 @@ static void r1bio_pool_free(void *r1_bio, void *data)
 #define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
 #define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
 #define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
-#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS)
 
 static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 {
@@ -215,7 +213,7 @@ static void put_buf(struct r1bio *r1_bio)
 
 	mempool_free(r1_bio, conf->r1buf_pool);
 
-	lower_barrier(conf);
+	lower_barrier(conf, r1_bio->sector);
 }
 
 static void reschedule_retry(struct r1bio *r1_bio)
@@ -223,10 +221,12 @@ static void reschedule_retry(struct r1bio *r1_bio)
 	unsigned long flags;
 	struct mddev *mddev = r1_bio->mddev;
 	struct r1conf *conf = mddev->private;
+	int idx;
 
+	idx = sector_to_idx(r1_bio->sector);
 	spin_lock_irqsave(&conf->device_lock, flags);
 	list_add(&r1_bio->retry_list, &conf->retry_list);
-	conf->nr_queued ++;
+	conf->nr_queued[idx]++;
 	spin_unlock_irqrestore(&conf->device_lock, flags);
 
 	wake_up(&conf->wait_barrier);
@@ -243,7 +243,6 @@ static void call_bio_endio(struct r1bio *r1_bio)
 	struct bio *bio = r1_bio->master_bio;
 	int done;
 	struct r1conf *conf = r1_bio->mddev->private;
-	sector_t start_next_window = r1_bio->start_next_window;
 	sector_t bi_sector = bio->bi_iter.bi_sector;
 
 	if (bio->bi_phys_segments) {
@@ -269,7 +268,7 @@ static void call_bio_endio(struct r1bio *r1_bio)
 		 * Wake up any possible resync thread that waits for the device
 		 * to go idle.
 		 */
-		allow_barrier(conf, start_next_window, bi_sector);
+		allow_barrier(conf, bi_sector);
 	}
 }
 
@@ -517,6 +516,25 @@ static void raid1_end_write_request(struct bio *bio)
 		bio_put(to_put);
 }
 
+static sector_t align_to_barrier_unit_end(sector_t start_sector,
+					  sector_t sectors)
+{
+	sector_t len;
+
+	WARN_ON(sectors == 0);
+	/*
+	 * len is the number of sectors from start_sector to end of the
+	 * barrier unit which start_sector belongs to.
+	 */
+	len = round_up(start_sector + 1, BARRIER_UNIT_SECTOR_SIZE) -
+	      start_sector;
+
+	if (len > sectors)
+		len = sectors;
+
+	return len;
+}
+
 /*
  * This routine returns the disk from which the requested read should
  * be done. There is a per-array 'next expected sequential IO' sector
@@ -813,168 +831,168 @@ static void flush_pending_writes(struct r1conf *conf)
  */
 static void raise_barrier(struct r1conf *conf, sector_t sector_nr)
 {
+	int idx = sector_to_idx(sector_nr);
+
 	spin_lock_irq(&conf->resync_lock);
 
 	/* Wait until no block IO is waiting */
-	wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
+	wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting[idx],
 			    conf->resync_lock);
 
 	/* block any new IO from starting */
-	conf->barrier++;
-	conf->next_resync = sector_nr;
+	conf->barrier[idx]++;
 
 	/* For these conditions we must wait:
 	 * A: while the array is in frozen state
-	 * B: while barrier >= RESYNC_DEPTH, meaning resync reach
-	 *    the max count which allowed.
-	 * C: next_resync + RESYNC_SECTORS > start_next_window, meaning
-	 *    next resync will reach to the window which normal bios are
-	 *    handling.
-	 * D: while there are any active requests in the current window.
+	 * B: while conf->nr_pending[idx] is not 0, meaning regular I/O
+	 *    existing in corresponding I/O barrier bucket.
+	 * C: while conf->barrier[idx] >= RESYNC_DEPTH, meaning reaches
+	 *    max resync count which allowed on current I/O barrier bucket.
 	 */
 	wait_event_lock_irq(conf->wait_barrier,
 			    !conf->array_frozen &&
-			    conf->barrier < RESYNC_DEPTH &&
-			    conf->current_window_requests == 0 &&
-			    (conf->start_next_window >=
-			     conf->next_resync + RESYNC_SECTORS),
+			     !conf->nr_pending[idx] &&
+			     conf->barrier[idx] < RESYNC_DEPTH,
 			    conf->resync_lock);
 
-	conf->nr_pending++;
+	conf->nr_pending[idx]++;
 	spin_unlock_irq(&conf->resync_lock);
 }
 
-static void lower_barrier(struct r1conf *conf)
+static void lower_barrier(struct r1conf *conf, sector_t sector_nr)
 {
 	unsigned long flags;
-	BUG_ON(conf->barrier <= 0);
+	int idx = sector_to_idx(sector_nr);
+
+	BUG_ON(conf->barrier[idx] <= 0);
+
 	spin_lock_irqsave(&conf->resync_lock, flags);
-	conf->barrier--;
-	conf->nr_pending--;
+	conf->barrier[idx]--;
+	conf->nr_pending[idx]--;
 	spin_unlock_irqrestore(&conf->resync_lock, flags);
 	wake_up(&conf->wait_barrier);
 }
 
-static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio)
+static void _wait_barrier(struct r1conf *conf, int idx)
 {
-	bool wait = false;
-
-	if (conf->array_frozen || !bio)
-		wait = true;
-	else if (conf->barrier && bio_data_dir(bio) == WRITE) {
-		if ((conf->mddev->curr_resync_completed
-		     >= bio_end_sector(bio)) ||
-		    (conf->start_next_window + NEXT_NORMALIO_DISTANCE
-		     <= bio->bi_iter.bi_sector))
-			wait = false;
-		else
-			wait = true;
+	spin_lock_irq(&conf->resync_lock);
+	if (conf->array_frozen || conf->barrier[idx]) {
+		conf->nr_waiting[idx]++;
+		/* Wait for the barrier to drop. */
+		wait_event_lock_irq(
+			conf->wait_barrier,
+			!conf->array_frozen && !conf->barrier[idx],
+			conf->resync_lock);
+		conf->nr_waiting[idx]--;
 	}
 
-	return wait;
+	conf->nr_pending[idx]++;
+	spin_unlock_irq(&conf->resync_lock);
 }
 
-static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
+static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
 {
-	sector_t sector = 0;
+	int idx = sector_to_idx(sector_nr);
 
 	spin_lock_irq(&conf->resync_lock);
-	if (need_to_wait_for_sync(conf, bio)) {
-		conf->nr_waiting++;
-		/* Wait for the barrier to drop.
-		 * However if there are already pending
-		 * requests (preventing the barrier from
-		 * rising completely), and the
-		 * per-process bio queue isn't empty,
-		 * then don't wait, as we need to empty
-		 * that queue to allow conf->start_next_window
-		 * to increase.
-		 */
-		raid1_log(conf->mddev, "wait barrier");
-		wait_event_lock_irq(conf->wait_barrier,
-				    !conf->array_frozen &&
-				    (!conf->barrier ||
-				     ((conf->start_next_window <
-				       conf->next_resync + RESYNC_SECTORS) &&
-				      current->bio_list &&
-				      !bio_list_empty(current->bio_list))),
-				    conf->resync_lock);
-		conf->nr_waiting--;
-	}
-
-	if (bio && bio_data_dir(bio) == WRITE) {
-		if (bio->bi_iter.bi_sector >= conf->next_resync) {
-			if (conf->start_next_window == MaxSector)
-				conf->start_next_window =
-					conf->next_resync +
-					NEXT_NORMALIO_DISTANCE;
-
-			if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE)
-			    <= bio->bi_iter.bi_sector)
-				conf->next_window_requests++;
-			else
-				conf->current_window_requests++;
-			sector = conf->start_next_window;
-		}
+	if (conf->array_frozen) {
+		conf->nr_waiting[idx]++;
+		/* Wait for array to unfreeze */
+		wait_event_lock_irq(
+			conf->wait_barrier,
+			!conf->array_frozen,
+			conf->resync_lock);
+		conf->nr_waiting[idx]--;
 	}
 
-	conf->nr_pending++;
+	conf->nr_pending[idx]++;
 	spin_unlock_irq(&conf->resync_lock);
-	return sector;
 }
 
-static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
-			  sector_t bi_sector)
+static void wait_barrier(struct r1conf *conf, sector_t sector_nr)
+{
+	int idx = sector_to_idx(sector_nr);
+
+	_wait_barrier(conf, idx);
+}
+
+static void wait_all_barriers(struct r1conf *conf)
+{
+	int idx;
+
+	for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
+		_wait_barrier(conf, idx);
+}
+
+static void _allow_barrier(struct r1conf *conf, int idx)
 {
 	unsigned long flags;
 
 	spin_lock_irqsave(&conf->resync_lock, flags);
-	conf->nr_pending--;
-	if (start_next_window) {
-		if (start_next_window == conf->start_next_window) {
-			if (conf->start_next_window + NEXT_NORMALIO_DISTANCE
-			    <= bi_sector)
-				conf->next_window_requests--;
-			else
-				conf->current_window_requests--;
-		} else
-			conf->current_window_requests--;
-
-		if (!conf->current_window_requests) {
-			if (conf->next_window_requests) {
-				conf->current_window_requests =
-					conf->next_window_requests;
-				conf->next_window_requests = 0;
-				conf->start_next_window +=
-					NEXT_NORMALIO_DISTANCE;
-			} else
-				conf->start_next_window = MaxSector;
-		}
-	}
+	conf->nr_pending[idx]--;
 	spin_unlock_irqrestore(&conf->resync_lock, flags);
 	wake_up(&conf->wait_barrier);
 }
 
+static void allow_barrier(struct r1conf *conf, sector_t sector_nr)
+{
+	int idx = sector_to_idx(sector_nr);
+
+	_allow_barrier(conf, idx);
+}
+
+static void allow_all_barriers(struct r1conf *conf)
+{
+	int idx;
+
+	for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
+		_allow_barrier(conf, idx);
+}
+
+/* conf->resync_lock should be held */
+static int get_unqueued_pending(struct r1conf *conf)
+{
+	int idx, ret;
+
+	for (ret = 0, idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
+		ret += conf->nr_pending[idx] - conf->nr_queued[idx];
+
+	return ret;
+}
+
 static void freeze_array(struct r1conf *conf, int extra)
 {
-	/* stop syncio and normal IO and wait for everything to
+	/* Stop sync I/O and normal I/O and wait for everything to
 	 * go quite.
-	 * We wait until nr_pending match nr_queued+extra
-	 * This is called in the context of one normal IO request
-	 * that has failed. Thus any sync request that might be pending
-	 * will be blocked by nr_pending, and we need to wait for
-	 * pending IO requests to complete or be queued for re-try.
-	 * Thus the number queued (nr_queued) plus this request (extra)
-	 * must match the number of pending IOs (nr_pending) before
-	 * we continue.
+	 * This is called in two situations:
+	 * 1) management command handlers (reshape, remove disk, quiesce).
+	 * 2) one normal I/O request failed.
+
+	 * After array_frozen is set to 1, new sync IO will be blocked at
+	 * raise_barrier(), and new normal I/O will blocked at _wait_barrier()
+	 * or wait_read_barrier(). The flying I/Os will either complete or be
+	 * queued. When everything goes quite, there are only queued I/Os left.
+
+	 * Every flying I/O contributes to a conf->nr_pending[idx], idx is the
+	 * barrier bucket index which this I/O request hits. When all sync and
+	 * normal I/O are queued, sum of all conf->nr_pending[] will match sum
+	 * of all conf->nr_queued[]. But normal I/O failure is an exception,
+	 * in handle_read_error(), we may call freeze_array() before trying to
+	 * fix the read error. In this case, the error read I/O is not queued,
+	 * so get_unqueued_pending() == 1.
+	 *
+	 * Therefore before this function returns, we need to wait until
+	 * get_unqueued_pendings(conf) gets equal to extra. For
+	 * normal I/O context, extra is 1, in rested situations extra is 0.
 	 */
 	spin_lock_irq(&conf->resync_lock);
 	conf->array_frozen = 1;
 	raid1_log(conf->mddev, "wait freeze");
-	wait_event_lock_irq_cmd(conf->wait_barrier,
-				conf->nr_pending == conf->nr_queued+extra,
-				conf->resync_lock,
-				flush_pending_writes(conf));
+	wait_event_lock_irq_cmd(
+		conf->wait_barrier,
+		get_unqueued_pending(conf) == extra,
+		conf->resync_lock,
+		flush_pending_writes(conf));
 	spin_unlock_irq(&conf->resync_lock);
 }
 static void unfreeze_array(struct r1conf *conf)
@@ -1070,11 +1088,11 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
 	kfree(plug);
 }
 
-static void raid1_read_request(struct mddev *mddev, struct bio *bio,
-				 struct r1bio *r1_bio)
+static void raid1_read_request(struct mddev *mddev, struct bio *bio)
 {
 	struct r1conf *conf = mddev->private;
 	struct raid1_info *mirror;
+	struct r1bio *r1_bio;
 	struct bio *read_bio;
 	struct bitmap *bitmap = mddev->bitmap;
 	const int op = bio_op(bio);
@@ -1083,7 +1101,34 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
 	int max_sectors;
 	int rdisk;
 
-	wait_barrier(conf, bio);
+	/*
+	 * Still need barrier for READ in case that whole
+	 * array is frozen.
+	 */
+	wait_read_barrier(conf, bio->bi_iter.bi_sector);
+	bitmap = mddev->bitmap;
+
+	/*
+	 * make_request() can abort the operation when read-ahead is being
+	 * used and no empty request is available.
+	 *
+	 */
+	r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+	r1_bio->master_bio = bio;
+	r1_bio->sectors = bio_sectors(bio);
+	r1_bio->state = 0;
+	r1_bio->mddev = mddev;
+	r1_bio->sector = bio->bi_iter.bi_sector;
+	/*
+	 * We might need to issue multiple reads to different
+	 * devices if there are bad blocks around, so we keep
+	 * track of the number of reads in bio->bi_phys_segments.
+	 * If this is 0, there is only one r1_bio and no locking
+	 * will be needed when requests complete.  If it is
+	 * non-zero, then it is the number of not-completed requests.
+	 */
+	bio->bi_phys_segments = 0;
+	bio_clear_flag(bio, BIO_SEG_VALID);
 
 read_again:
 	rdisk = read_balance(conf, r1_bio, &max_sectors);
@@ -1106,7 +1151,6 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
 			   atomic_read(&bitmap->behind_writes) == 0);
 	}
 	r1_bio->read_disk = rdisk;
-	r1_bio->start_next_window = 0;
 
 	read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
 	bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector,
@@ -1163,10 +1207,10 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
 		generic_make_request(read_bio);
 }
 
-static void raid1_write_request(struct mddev *mddev, struct bio *bio,
-				struct r1bio *r1_bio)
+static void raid1_write_request(struct mddev *mddev, struct bio *bio)
 {
 	struct r1conf *conf = mddev->private;
+	struct r1bio *r1_bio;
 	int i, disks;
 	struct bitmap *bitmap = mddev->bitmap;
 	unsigned long flags;
@@ -1180,7 +1224,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 	int first_clone;
 	int sectors_handled;
 	int max_sectors;
-	sector_t start_next_window;
 
 	/*
 	 * Register the new request and wait if the reconstruction
@@ -1216,7 +1259,29 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 		}
 		finish_wait(&conf->wait_barrier, &w);
 	}
-	start_next_window = wait_barrier(conf, bio);
+	wait_barrier(conf, bio->bi_iter.bi_sector);
+
+	/*
+	 * make_request() can abort the operation when read-ahead is being
+	 * used and no empty request is available.
+	 *
+	 */
+	r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+	r1_bio->master_bio = bio;
+	r1_bio->sectors = bio_sectors(bio);
+	r1_bio->state = 0;
+	r1_bio->mddev = mddev;
+	r1_bio->sector = bio->bi_iter.bi_sector;
+	/*
+	 * We might need to issue multiple writes to different
+	 * devices if there are bad blocks around, so we keep
+	 * track of the number of writes in bio->bi_phys_segments.
+	 * If this is 0, there is only one r1_bio and no locking
+	 * will be needed when requests complete.  If it is
+	 * non-zero, then it is the number of not-completed requests.
+	 */
+	bio->bi_phys_segments = 0;
+	bio_clear_flag(bio, BIO_SEG_VALID);
 
 	if (conf->pending_count >= max_queued_requests) {
 		md_wakeup_thread(mddev->thread);
@@ -1237,7 +1302,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 
 	disks = conf->raid_disks * 2;
  retry_write:
-	r1_bio->start_next_window = start_next_window;
 	blocked_rdev = NULL;
 	rcu_read_lock();
 	max_sectors = r1_bio->sectors;
@@ -1304,25 +1368,15 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 	if (unlikely(blocked_rdev)) {
 		/* Wait for this device to become unblocked */
 		int j;
-		sector_t old = start_next_window;
 
 		for (j = 0; j < i; j++)
 			if (r1_bio->bios[j])
 				rdev_dec_pending(conf->mirrors[j].rdev, mddev);
 		r1_bio->state = 0;
-		allow_barrier(conf, start_next_window, bio->bi_iter.bi_sector);
+		allow_barrier(conf, bio->bi_iter.bi_sector);
 		raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
 		md_wait_for_blocked_rdev(blocked_rdev, mddev);
-		start_next_window = wait_barrier(conf, bio);
-		/*
-		 * We must make sure the multi r1bios of bio have
-		 * the same value of bi_phys_segments
-		 */
-		if (bio->bi_phys_segments && old &&
-		    old != start_next_window)
-			/* Wait for the former r1bio(s) to complete */
-			wait_event(conf->wait_barrier,
-				   bio->bi_phys_segments == 1);
+		wait_barrier(conf, bio->bi_iter.bi_sector);
 		goto retry_write;
 	}
 
@@ -1447,36 +1501,26 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 
 static void raid1_make_request(struct mddev *mddev, struct bio *bio)
 {
-	struct r1conf *conf = mddev->private;
-	struct r1bio *r1_bio;
+	void (*make_request_fn)(struct mddev *mddev, struct bio *bio);
+	struct bio *split;
+	sector_t sectors;
 
-	/*
-	 * make_request() can abort the operation when read-ahead is being
-	 * used and no empty request is available.
-	 *
-	 */
-	r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+	make_request_fn = (bio_data_dir(bio) == READ) ?
+			  raid1_read_request : raid1_write_request;
 
-	r1_bio->master_bio = bio;
-	r1_bio->sectors = bio_sectors(bio);
-	r1_bio->state = 0;
-	r1_bio->mddev = mddev;
-	r1_bio->sector = bio->bi_iter.bi_sector;
-
-	/*
-	 * We might need to issue multiple reads to different devices if there
-	 * are bad blocks around, so we keep track of the number of reads in
-	 * bio->bi_phys_segments.  If this is 0, there is only one r1_bio and
-	 * no locking will be needed when requests complete.  If it is
-	 * non-zero, then it is the number of not-completed requests.
-	 */
-	bio->bi_phys_segments = 0;
-	bio_clear_flag(bio, BIO_SEG_VALID);
+	/* if bio exceeds barrier unit boundary, split it */
+	do {
+		sectors = align_to_barrier_unit_end(
+				bio->bi_iter.bi_sector, bio_sectors(bio));
+		if (sectors < bio_sectors(bio)) {
+			split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set);
+			bio_chain(split, bio);
+		} else {
+			split = bio;
+		}
 
-	if (bio_data_dir(bio) == READ)
-		raid1_read_request(mddev, bio, r1_bio);
-	else
-		raid1_write_request(mddev, bio, r1_bio);
+		make_request_fn(mddev, split);
+	} while (split != bio);
 }
 
 static void raid1_status(struct seq_file *seq, struct mddev *mddev)
@@ -1567,19 +1611,11 @@ static void print_conf(struct r1conf *conf)
 
 static void close_sync(struct r1conf *conf)
 {
-	wait_barrier(conf, NULL);
-	allow_barrier(conf, 0, 0);
+	wait_all_barriers(conf);
+	allow_all_barriers(conf);
 
 	mempool_destroy(conf->r1buf_pool);
 	conf->r1buf_pool = NULL;
-
-	spin_lock_irq(&conf->resync_lock);
-	conf->next_resync = MaxSector - 2 * NEXT_NORMALIO_DISTANCE;
-	conf->start_next_window = MaxSector;
-	conf->current_window_requests +=
-		conf->next_window_requests;
-	conf->next_window_requests = 0;
-	spin_unlock_irq(&conf->resync_lock);
 }
 
 static int raid1_spare_active(struct mddev *mddev)
@@ -2326,8 +2362,9 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
 
 static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
 {
-	int m;
+	int m, idx;
 	bool fail = false;
+
 	for (m = 0; m < conf->raid_disks * 2 ; m++)
 		if (r1_bio->bios[m] == IO_MADE_GOOD) {
 			struct md_rdev *rdev = conf->mirrors[m].rdev;
@@ -2353,7 +2390,8 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
 	if (fail) {
 		spin_lock_irq(&conf->device_lock);
 		list_add(&r1_bio->retry_list, &conf->bio_end_io_list);
-		conf->nr_queued++;
+		idx = sector_to_idx(r1_bio->sector);
+		conf->nr_queued[idx]++;
 		spin_unlock_irq(&conf->device_lock);
 		md_wakeup_thread(conf->mddev->thread);
 	} else {
@@ -2475,6 +2513,7 @@ static void raid1d(struct md_thread *thread)
 	struct r1conf *conf = mddev->private;
 	struct list_head *head = &conf->retry_list;
 	struct blk_plug plug;
+	int idx;
 
 	md_check_recovery(mddev);
 
@@ -2482,17 +2521,17 @@ static void raid1d(struct md_thread *thread)
 	    !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
 		LIST_HEAD(tmp);
 		spin_lock_irqsave(&conf->device_lock, flags);
-		if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
-			while (!list_empty(&conf->bio_end_io_list)) {
-				list_move(conf->bio_end_io_list.prev, &tmp);
-				conf->nr_queued--;
-			}
-		}
+		if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
+			list_splice_init(&conf->bio_end_io_list, &tmp);
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 		while (!list_empty(&tmp)) {
 			r1_bio = list_first_entry(&tmp, struct r1bio,
 						  retry_list);
 			list_del(&r1_bio->retry_list);
+			idx = sector_to_idx(r1_bio->sector);
+			spin_lock_irqsave(&conf->device_lock, flags);
+			conf->nr_queued[idx]--;
+			spin_unlock_irqrestore(&conf->device_lock, flags);
 			if (mddev->degraded)
 				set_bit(R1BIO_Degraded, &r1_bio->state);
 			if (test_bit(R1BIO_WriteError, &r1_bio->state))
@@ -2513,7 +2552,8 @@ static void raid1d(struct md_thread *thread)
 		}
 		r1_bio = list_entry(head->prev, struct r1bio, retry_list);
 		list_del(head->prev);
-		conf->nr_queued--;
+		idx = sector_to_idx(r1_bio->sector);
+		conf->nr_queued[idx]--;
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 
 		mddev = r1_bio->mddev;
@@ -2552,7 +2592,6 @@ static int init_resync(struct r1conf *conf)
 					  conf->poolinfo);
 	if (!conf->r1buf_pool)
 		return -ENOMEM;
-	conf->next_resync = 0;
 	return 0;
 }
 
@@ -2581,6 +2620,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 	int still_degraded = 0;
 	int good_sectors = RESYNC_SECTORS;
 	int min_bad = 0; /* number of sectors that are bad in all devices */
+	int idx = sector_to_idx(sector_nr);
 
 	if (!conf->r1buf_pool)
 		if (init_resync(conf))
@@ -2630,7 +2670,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 	 * If there is non-resync activity waiting for a turn, then let it
 	 * though before starting on this new sync request.
 	 */
-	if (conf->nr_waiting)
+	if (conf->nr_waiting[idx])
 		schedule_timeout_uninterruptible(1);
 
 	/* we are incrementing sector_nr below. To be safe, we check against
@@ -2657,6 +2697,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 	r1_bio->sector = sector_nr;
 	r1_bio->state = 0;
 	set_bit(R1BIO_IsSync, &r1_bio->state);
+	/* make sure good_sectors won't go across barrier unit boundary */
+	good_sectors = align_to_barrier_unit_end(sector_nr, good_sectors);
 
 	for (i = 0; i < conf->raid_disks * 2; i++) {
 		struct md_rdev *rdev;
@@ -2887,6 +2929,26 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	if (!conf)
 		goto abort;
 
+	conf->nr_pending = kcalloc(BARRIER_BUCKETS_NR,
+				   sizeof(int), GFP_KERNEL);
+	if (!conf->nr_pending)
+		goto abort;
+
+	conf->nr_waiting = kcalloc(BARRIER_BUCKETS_NR,
+				   sizeof(int), GFP_KERNEL);
+	if (!conf->nr_waiting)
+		goto abort;
+
+	conf->nr_queued = kcalloc(BARRIER_BUCKETS_NR,
+				  sizeof(int), GFP_KERNEL);
+	if (!conf->nr_queued)
+		goto abort;
+
+	conf->barrier = kcalloc(BARRIER_BUCKETS_NR,
+				sizeof(int), GFP_KERNEL);
+	if (!conf->barrier)
+		goto abort;
+
 	conf->mirrors = kzalloc(sizeof(struct raid1_info)
 				* mddev->raid_disks * 2,
 				 GFP_KERNEL);
@@ -2942,9 +3004,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	conf->pending_count = 0;
 	conf->recovery_disabled = mddev->recovery_disabled - 1;
 
-	conf->start_next_window = MaxSector;
-	conf->current_window_requests = conf->next_window_requests = 0;
-
 	err = -EIO;
 	for (i = 0; i < conf->raid_disks * 2; i++) {
 
@@ -2987,6 +3046,10 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 		kfree(conf->mirrors);
 		safe_put_page(conf->tmppage);
 		kfree(conf->poolinfo);
+		kfree(conf->nr_pending);
+		kfree(conf->nr_waiting);
+		kfree(conf->nr_queued);
+		kfree(conf->barrier);
 		kfree(conf);
 	}
 	return ERR_PTR(err);
@@ -3088,6 +3151,10 @@ static void raid1_free(struct mddev *mddev, void *priv)
 	kfree(conf->mirrors);
 	safe_put_page(conf->tmppage);
 	kfree(conf->poolinfo);
+	kfree(conf->nr_pending);
+	kfree(conf->nr_waiting);
+	kfree(conf->nr_queued);
+	kfree(conf->barrier);
 	kfree(conf);
 }
 
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index c52ef42..d3faf30 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -1,6 +1,14 @@
 #ifndef _RAID1_H
 #define _RAID1_H
 
+/* each barrier unit size is 64MB fow now
+ * note: it must be larger than RESYNC_DEPTH
+ */
+#define BARRIER_UNIT_SECTOR_BITS	17
+#define BARRIER_UNIT_SECTOR_SIZE	(1<<17)
+#define BARRIER_BUCKETS_NR_BITS		(PAGE_SHIFT - 2)
+#define BARRIER_BUCKETS_NR		(1<<BARRIER_BUCKETS_NR_BITS)
+
 struct raid1_info {
 	struct md_rdev	*rdev;
 	sector_t	head_position;
@@ -35,25 +43,6 @@ struct r1conf {
 						 */
 	int			raid_disks;
 
-	/* During resync, read_balancing is only allowed on the part
-	 * of the array that has been resynced.  'next_resync' tells us
-	 * where that is.
-	 */
-	sector_t		next_resync;
-
-	/* When raid1 starts resync, we divide array into four partitions
-	 * |---------|--------------|---------------------|-------------|
-	 *        next_resync   start_next_window       end_window
-	 * start_next_window = next_resync + NEXT_NORMALIO_DISTANCE
-	 * end_window = start_next_window + NEXT_NORMALIO_DISTANCE
-	 * current_window_requests means the count of normalIO between
-	 *   start_next_window and end_window.
-	 * next_window_requests means the count of normalIO after end_window.
-	 * */
-	sector_t		start_next_window;
-	int			current_window_requests;
-	int			next_window_requests;
-
 	spinlock_t		device_lock;
 
 	/* list of 'struct r1bio' that need to be processed by raid1d,
@@ -79,10 +68,10 @@ struct r1conf {
 	 */
 	wait_queue_head_t	wait_barrier;
 	spinlock_t		resync_lock;
-	int			nr_pending;
-	int			nr_waiting;
-	int			nr_queued;
-	int			barrier;
+	int			*nr_pending;
+	int			*nr_waiting;
+	int			*nr_queued;
+	int			*barrier;
 	int			array_frozen;
 
 	/* Set to 1 if a full sync is needed, (fresh device added).
@@ -135,7 +124,6 @@ struct r1bio {
 						 * in this BehindIO request
 						 */
 	sector_t		sector;
-	sector_t		start_next_window;
 	int			sectors;
 	unsigned long		state;
 	struct mddev		*mddev;
@@ -185,4 +173,10 @@ enum r1bio_state {
 	R1BIO_WriteError,
 	R1BIO_FailFast,
 };
+
+static inline int sector_to_idx(sector_t sector)
+{
+	return hash_long(sector >> BARRIER_UNIT_SECTOR_BITS,
+			 BARRIER_BUCKETS_NR_BITS);
+}
 #endif
-- 
2.6.6


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox