* [PATCH 02/11][v2] async_tx: add support for asynchronous GF multiplication
@ 2008-12-08 21:55 Yuri Tikhonov
  2008-12-17 18:34 ` Dan Williams
From: Yuri Tikhonov @ 2008-12-08 21:55 UTC (permalink / raw)
  To: linux-raid; +Cc: linuxppc-dev, dan.j.williams, wd, dzu, yanok

This adds support for doing asynchronous GF multiplication by adding
four additional functions to async_tx API:

 async_pq() does simultaneous XOR of sources and XOR of sources
  GF-multiplied by given coefficients.

 async_pq_zero_sum() checks whether the results of the calculations
  match the given ones.

 async_gen_syndrome() does simultaneous XOR and Reed-Solomon syndrome
  computation over the sources.

 async_syndrome_zero_sum() checks whether the results of the XOR/syndrome
  calculation match the given ones.

The latter two functions just use async_pq() with the appropriate
coefficients in the asynchronous case, but have significant optimizations
in the synchronous case.
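
For illustration, a minimal client-side sketch (here `blocks', `src_cnt',
`done' and `ctx' are caller-provided, and error handling is omitted):

	/* blocks[0..src_cnt-1] hold the data; blocks[src_cnt] is P and
	 * blocks[src_cnt+1] is Q; either destination may be NULL
	 */
	struct dma_async_tx_descriptor *tx;

	tx = async_gen_syndrome(blocks, 0, src_cnt, PAGE_SIZE,
				ASYNC_TX_ACK, NULL, done, ctx);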

To support this API a dmaengine driver should set the DMA_PQ and
DMA_PQ_ZERO_SUM capabilities and provide the device_prep_dma_pq and
device_prep_dma_pqzero_sum methods in the dma_device structure.
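
For example (a sketch only; my_prep_dma_pq and my_prep_dma_pqzero_sum
stand in for the driver's real callbacks):

	dma_cap_set(DMA_PQ, dma_dev->cap_mask);
	dma_cap_set(DMA_PQ_ZERO_SUM, dma_dev->cap_mask);
	dma_dev->max_pq = 16;	/* engine-specific limit on PQ sources */
	dma_dev->device_prep_dma_pq = my_prep_dma_pq;
	dma_dev->device_prep_dma_pqzero_sum = my_prep_dma_pqzero_sum;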

Signed-off-by: Yuri Tikhonov <yur@emcraft.com>
Signed-off-by: Ilya Yanok <yanok@emcraft.com>
---
 crypto/async_tx/Kconfig    |    4 +
 crypto/async_tx/Makefile   |    1 +
 crypto/async_tx/async_pq.c |  586 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/async_tx.h   |   45 ++++-
 include/linux/dmaengine.h  |   16 ++-
 5 files changed, 648 insertions(+), 4 deletions(-)
 create mode 100644 crypto/async_tx/async_pq.c

diff --git a/crypto/async_tx/Kconfig b/crypto/async_tx/Kconfig
index d8fb391..cb6d731 100644
--- a/crypto/async_tx/Kconfig
+++ b/crypto/async_tx/Kconfig
@@ -14,3 +14,7 @@ config ASYNC_MEMSET
 	tristate
 	select ASYNC_CORE
 
+config ASYNC_PQ
+	tristate
+	select ASYNC_CORE
+
diff --git a/crypto/async_tx/Makefile b/crypto/async_tx/Makefile
index 27baa7d..1b99265 100644
--- a/crypto/async_tx/Makefile
+++ b/crypto/async_tx/Makefile
@@ -2,3 +2,4 @@ obj-$(CONFIG_ASYNC_CORE) += async_tx.o
 obj-$(CONFIG_ASYNC_MEMCPY) += async_memcpy.o
 obj-$(CONFIG_ASYNC_MEMSET) += async_memset.o
 obj-$(CONFIG_ASYNC_XOR) += async_xor.o
+obj-$(CONFIG_ASYNC_PQ) += async_pq.o
diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c
new file mode 100644
index 0000000..439338f
--- /dev/null
+++ b/crypto/async_tx/async_pq.c
@@ -0,0 +1,586 @@
+/*
+ *	Copyright(c) 2007 Yuri Tikhonov <yur@emcraft.com>
+ *
+ *	Developed for DENX Software Engineering GmbH
+ *
+ *	Asynchronous GF-XOR calculations ASYNC_TX API.
+ *
+ *	based on async_xor.c code written by:
+ *		Dan Williams <dan.j.williams@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/dma-mapping.h>
+#include <linux/raid/xor.h>
+#include <linux/async_tx.h>
+
+#include "../drivers/md/raid6.h"
+
+/**
+ * The following static variables are used in the synchronous zero-sum
+ * case to save the values to check. Two pages are used for the zero-sum
+ * checks and the third is a dummy P destination for gen_syndrome()
+ */
+static spinlock_t spare_lock;
+struct page *spare_pages[3];
+
+/**
+ * do_async_pq - asynchronously calculate P and/or Q
+ */
+static struct dma_async_tx_descriptor *
+do_async_pq(struct dma_chan *chan, struct page **blocks,
+	unsigned char *scf_list, unsigned int offset, int src_cnt, size_t len,
+	enum async_tx_flags flags, struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_param)
+{
+	struct dma_device *dma = chan->device;
+	dma_addr_t dma_dest[2], dma_src[src_cnt];
+	struct dma_async_tx_descriptor *tx = NULL;
+	dma_async_tx_callback _cb_fn;
+	void *_cb_param;
+	int i, pq_src_cnt, src_off = 0;
+	enum async_tx_flags async_flags;
+	enum dma_ctrl_flags dma_flags = 0;
+
+	/* If we can't handle src_cnt in one shot, then the following
+	 * flag(s) will be set only on the first pass of prep_dma
+	 */
+	if (flags & ASYNC_TX_PQ_ZERO_P)
+		dma_flags |= DMA_PREP_ZERO_P;
+	if (flags & ASYNC_TX_PQ_ZERO_Q)
+		dma_flags |= DMA_PREP_ZERO_Q;
+
+	/* DMAs use destinations as sources, so use BIDIRECTIONAL mapping */
+	dma_dest[0] = !blocks[src_cnt] ? 0 :
+				dma_map_page(dma->dev, blocks[src_cnt],
+					     offset, len, DMA_BIDIRECTIONAL);
+	dma_dest[1] = !blocks[src_cnt+1] ? 0 :
+				dma_map_page(dma->dev, blocks[src_cnt+1],
+					     offset, len, DMA_BIDIRECTIONAL);
+
+	for (i = 0; i < src_cnt; i++)
+		dma_src[i] = dma_map_page(dma->dev, blocks[i],
+					  offset, len, DMA_TO_DEVICE);
+
+	while (src_cnt) {
+		async_flags = flags;
+		pq_src_cnt = min(src_cnt, dma->max_pq);
+		/* if we are submitting additional pqs, leave the chain open,
+		 * clear the callback parameters, and leave the destination
+		 * buffers mapped
+		 */
+		if (src_cnt > pq_src_cnt) {
+			async_flags &= ~ASYNC_TX_ACK;
+			dma_flags |= DMA_COMPL_SKIP_DEST_UNMAP;
+			_cb_fn = NULL;
+			_cb_param = NULL;
+		} else {
+			_cb_fn = cb_fn;
+			_cb_param = cb_param;
+		}
+		if (_cb_fn)
+			dma_flags |= DMA_PREP_INTERRUPT;
+
+		/* Since we have clobbered the src_list we are committed
+		 * to doing this asynchronously.  Drivers force forward
+		 * progress in case they can not provide a descriptor
+		 */
+		tx = dma->device_prep_dma_pq(chan, dma_dest,
+					     &dma_src[src_off], pq_src_cnt,
+					     scf_list ? &scf_list[src_off] :
+							NULL,
+					     len, dma_flags);
+		if (unlikely(!tx))
+			async_tx_quiesce(&depend_tx);
+
+		/* spin wait for the preceding transactions to complete */
+		while (unlikely(!tx)) {
+			dma_async_issue_pending(chan);
+			tx = dma->device_prep_dma_pq(chan, dma_dest,
+					&dma_src[src_off], pq_src_cnt,
+					scf_list ? &scf_list[src_off] : NULL,
+					len, dma_flags);
+		}
+
+		async_tx_submit(chan, tx, async_flags, depend_tx,
+				_cb_fn, _cb_param);
+
+		depend_tx = tx;
+		flags |= ASYNC_TX_DEP_ACK;
+
+		if (src_cnt > pq_src_cnt) {
+			/* drop completed sources */
+			src_cnt -= pq_src_cnt;
+			src_off += pq_src_cnt;
+
+			/* use the intermediate result as a source; we
+			 * clear DMA_PREP_ZERO, so prep_dma_pq will
+			 * include destination(s) into calculations
+			 */
+			dma_flags = 0;
+		} else
+			break;
+	}
+
+	return tx;
+}
+
+/**
+ * do_sync_pq - synchronously calculate P and Q
+ */
+static void
+do_sync_pq(struct page **blocks, unsigned char *scf, unsigned int offset,
+	int src_cnt, size_t len, enum async_tx_flags flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_param)
+{
+	int i, pos;
+	uint8_t *p, *q, *src;
+
+	/* set destination addresses */
+	p = blocks[src_cnt] ?
+		(uint8_t *)(page_address(blocks[src_cnt]) + offset) :
+		NULL;
+	q = blocks[src_cnt+1] ?
+		(uint8_t *)(page_address(blocks[src_cnt+1]) + offset) :
+		NULL;
+
+	if (flags & ASYNC_TX_PQ_ZERO_P) {
+		BUG_ON(!p);
+		memset(p, 0, len);
+	}
+
+	if (flags & ASYNC_TX_PQ_ZERO_Q) {
+		BUG_ON(!q);
+		memset(q, 0, len);
+	}
+
+	for (i = 0; i < src_cnt; i++) {
+		src = (uint8_t *)(page_address(blocks[i]) + offset);
+		for (pos = 0; pos < len; pos++) {
+			if (p)
+				p[pos] ^= src[pos];
+			if (q)
+				q[pos] ^= raid6_gfmul[scf[i]][src[pos]];
+		}
+	}
+	async_tx_sync_epilog(cb_fn, cb_param);
+}
+
+/**
+ * async_pq - attempt to do XOR and Galois calculations in parallel using
+ *	a dma engine.
+ * @blocks: source block array from 0 to (src_cnt-1) with the p destination
+ *	at blocks[src_cnt] and q at blocks[src_cnt + 1]. Only one of two
+ *	destinations may be present (the other must then be set to NULL).
+ *	By default, the result of the calculation is XOR-ed with the initial
+ *	content of the destination buffers. Use ASYNC_TX_PQ_ZERO_x flags
+ *	to avoid this.
+ *	NOTE: client code must assume the contents of this array are destroyed
+ * @scf: array of source coefficients used in GF-multiplication
+ * @offset: offset in pages to start transaction
+ * @src_cnt: number of source pages
+ * @len: length in bytes
+ * @flags: ASYNC_TX_PQ_ZERO_P, ASYNC_TX_PQ_ZERO_Q, ASYNC_TX_ASSUME_COHERENT,
+ *	ASYNC_TX_ACK, ASYNC_TX_DEP_ACK, ASYNC_TX_ASYNC_ONLY
+ * @depend_tx: depends on the result of this transaction.
+ * @cb_fn: function to call when the operation completes
+ * @cb_param: parameter to pass to the callback routine
+ */
+struct dma_async_tx_descriptor *
+async_pq(struct page **blocks, unsigned char *scf,
+	unsigned int offset, int src_cnt, size_t len,
+	enum async_tx_flags flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_param)
+{
+	struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_PQ,
+					&blocks[src_cnt], 2,
+					blocks, src_cnt, len);
+	struct dma_device *device = chan ? chan->device : NULL;
+	struct dma_async_tx_descriptor *tx = NULL;
+
+	if (!device && (flags & ASYNC_TX_ASYNC_ONLY))
+		return NULL;
+
+	if (device) {
+		/* run pq asynchronously */
+		tx = do_async_pq(chan, blocks, scf, offset, src_cnt,
+			len, flags, depend_tx, cb_fn,cb_param);
+	} else {
+		/* run pq synchronously */
+		if (!blocks[src_cnt+1]) {
+			struct page *pdst = blocks[src_cnt];
+			int i;
+
+			/* Calculate P-parity only.
+			 * In contrast to async_xor(), async_pq() assumes
+			 * that destinations are included in the calculations,
+			 * so we should re-arrange the xor src list to
+			 * achieve similar behavior.
+			 */
+			if (!(flags & ASYNC_TX_PQ_ZERO_P)) {
+				/* If the async_pq() user doesn't set the
+				 * ZERO flag, the destination is assumed to
+				 * hold meaningful data for the calculations.
+				 * The destination must be at position 0, so
+				 * shift the sources and put pdst at the
+				 * beginning of the list.
+				 */
+				for (i = src_cnt - 1; i >= 0; i--)
+					blocks[i+1] = blocks[i];
+				blocks[0] = pdst;
+				src_cnt++;
+				flags |= ASYNC_TX_XOR_DROP_DST;
+			} else {
+				/* If the async_pq() user wants to clear P,
+				 * this is done automatically in the async
+				 * case, and via XOR_ZERO_DST in the sync
+				 * one.
+				 */
+				flags &= ~ASYNC_TX_PQ_ZERO_P;
+				flags |= ASYNC_TX_XOR_ZERO_DST;
+			}
+
+
+			return async_xor(pdst, blocks, offset,
+					 src_cnt, len, flags, depend_tx,
+					 cb_fn, cb_param);
+		}
+
+		/* wait for any prerequisite operations */
+		async_tx_quiesce(&depend_tx);
+
+		do_sync_pq(blocks, scf, offset, src_cnt, len, flags,
+			depend_tx, cb_fn, cb_param);
+	}
+
+	return tx;
+}
+EXPORT_SYMBOL_GPL(async_pq);
+
+/**
+ * do_sync_gen_syndrome - synchronously calculate P (xor) and Q (Reed-Solomon
+ *	code)
+ */
+static void
+do_sync_gen_syndrome(struct page **blocks, unsigned int offset,
+	int src_cnt, size_t len, enum async_tx_flags flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_param)
+{
+	int i;
+	void *tsrc[src_cnt+2];
+
+	for (i = 0; i < src_cnt + 2; i++)
+		tsrc[i] = page_address(blocks[i]) + offset;
+
+	raid6_call.gen_syndrome(i, len, tsrc);
+
+	async_tx_sync_epilog(cb_fn, cb_param);
+}
+
+/**
+ * async_gen_syndrome - attempt to generate P (xor) and Q (Reed-Solomon code)
+ *	with a dma engine for a given set of blocks.  This routine assumes a
+ *	field of GF(2^8) with a primitive polynomial of 0x11d and a generator
+ *	of {02}.
+ * @blocks: source block array ordered from 0..src_cnt-1 with the P destination
+ *	at blocks[src_cnt] and Q at blocks[src_cnt + 1]. Only one of two
+ *	destinations may be present (the other must then be set to NULL).
+ *	NOTE: client code must assume the contents of this array are destroyed
+ * @offset: offset in pages to start transaction
+ * @src_cnt: number of source pages: 2 < src_cnt <= 255
+ * @len: length of blocks in bytes
+ * @flags: ASYNC_TX_ACK, ASYNC_TX_DEP_ACK, ASYNC_TX_ASYNC_ONLY
+ * @depend_tx: P+Q operation depends on the result of this transaction.
+ * @cb_fn: function to call when P+Q generation completes
+ * @cb_param: parameter to pass to the callback routine
+ */
+struct dma_async_tx_descriptor *
+async_gen_syndrome(struct page **blocks, unsigned int offset, int src_cnt,
+	size_t len, enum async_tx_flags flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_param)
+{
+	struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_PQ,
+						     &blocks[src_cnt], 2,
+						     blocks, src_cnt, len);
+	struct dma_device *device = chan ? chan->device : NULL;
+	struct dma_async_tx_descriptor *tx = NULL;
+
+	BUG_ON(src_cnt > 255 || (!blocks[src_cnt] && !blocks[src_cnt+1]));
+
+	if (!device && (flags & ASYNC_TX_ASYNC_ONLY))
+		return NULL;
+
+	/* The synchronous gen_syndrome() ignores the destinations' current
+	 * contents, but the asynchronous one treats them as sources; so,
+	 * when generating syndromes, request clearing them explicitly
+	 */
+	if (blocks[src_cnt])
+		flags |= ASYNC_TX_PQ_ZERO_P;
+	if (blocks[src_cnt+1])
+		flags |= ASYNC_TX_PQ_ZERO_Q;
+
+	if (device) {
+		/* run the xor asynchronously */
+		tx = do_async_pq(chan, blocks, (uint8_t *)raid6_gfexp,
+				 offset, src_cnt, len, flags, depend_tx,
+				 cb_fn, cb_param);
+	} else {
+		/* run the pq synchronously */
+		/* wait for any prerequisite operations */
+		async_tx_quiesce(&depend_tx);
+
+		if (!blocks[src_cnt])
+			blocks[src_cnt] = spare_pages[2];
+		if (!blocks[src_cnt+1])
+			blocks[src_cnt+1] = spare_pages[2];
+		do_sync_gen_syndrome(blocks, offset, src_cnt, len, flags,
+				     depend_tx, cb_fn, cb_param);
+	}
+
+	return tx;
+}
+EXPORT_SYMBOL_GPL(async_gen_syndrome);
+
+/**
+ * async_pq_zero_sum - attempt a PQ parities check with a dma engine.
+ * @blocks: array of source pages. The 0..src_cnt-1 are the sources, the
+ *	src_cnt and src_cnt+1 are the P and Q destinations to check, resp.
+ *	Only one of two destinations may be present.
+ *	NOTE: client code must assume the contents of this array are destroyed
+ * @scf: coefficients to use in GF-multiplications
+ * @offset: offset in pages to start transaction
+ * @src_cnt: number of source pages
+ * @len: length in bytes
+ * @presult: where to store the result of the P-check, which is 0 if the
+ *	P-parity is OK, and non-zero otherwise.
+ * @qresult: where to store the result of the Q-check, which is 0 if the
+ *	Q-parity is OK, and non-zero otherwise.
+ * @flags: ASYNC_TX_ASSUME_COHERENT, ASYNC_TX_ACK, ASYNC_TX_DEP_ACK
+ * @depend_tx: depends on the result of this transaction.
+ * @cb_fn: function to call when the xor completes
+ * @cb_param: parameter to pass to the callback routine
+ */
+struct dma_async_tx_descriptor *
+async_pq_zero_sum(struct page **blocks, unsigned char *scf,
+	unsigned int offset, int src_cnt, size_t len,
+	u32 *presult, u32 *qresult, enum async_tx_flags flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_param)
+{
+	struct dma_chan *chan = async_tx_find_channel(depend_tx,
+						      DMA_PQ_ZERO_SUM,
+						      &blocks[src_cnt], 2,
+						      blocks, src_cnt, len);
+	struct dma_device *device = chan ? chan->device : NULL;
+	struct dma_async_tx_descriptor *tx = NULL;
+
+	BUG_ON(src_cnt < 2);
+
+	if (device && src_cnt <= device->max_pq) {
+		dma_addr_t dma_src[src_cnt + 2];
+		enum dma_ctrl_flags dma_flags = cb_fn ? DMA_PREP_INTERRUPT : 0;
+		int i;
+
+		for (i = 0; i < src_cnt + 2; i++)
+			dma_src[i] = blocks[i] ? dma_map_page(device->dev,
+					blocks[i], offset, len,
+					DMA_TO_DEVICE) : 0;
+
+		tx = device->device_prep_dma_pqzero_sum(chan, dma_src, src_cnt,
+						      scf, len,
+						      presult, qresult,
+						      dma_flags);
+
+		if (unlikely(!tx)) {
+			async_tx_quiesce(&depend_tx);
+
+			while (unlikely(!tx)) {
+				dma_async_issue_pending(chan);
+				tx = device->device_prep_dma_pqzero_sum(chan,
+						dma_src, src_cnt, scf, len,
+						presult, qresult,
+						dma_flags);
+			}
+		}
+
+		async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param);
+	} else {
+		struct page *pdest = blocks[src_cnt];
+		struct page *qdest = blocks[src_cnt + 1];
+		enum async_tx_flags lflags = flags;
+
+		lflags &= ~ASYNC_TX_ACK;
+		lflags |= ASYNC_TX_PQ_ZERO_P | ASYNC_TX_PQ_ZERO_Q;
+
+		spin_lock(&spare_lock);
+		blocks[src_cnt] = spare_pages[0];
+		blocks[src_cnt + 1] = spare_pages[1];
+		tx = async_pq(blocks, scf, offset, src_cnt, len, lflags,
+			      depend_tx, NULL, NULL);
+
+		async_tx_quiesce(&tx);
+
+		if (presult && pdest)
+			*presult = memcmp(page_address(pdest) + offset,
+					  page_address(spare_pages[0]) +
+						   offset, len) == 0 ? 0 : 1;
+		if (qresult && qdest)
+			*qresult = memcmp(page_address(qdest) + offset,
+					  page_address(spare_pages[1]) +
+						   offset, len) == 0 ? 0 : 1;
+		spin_unlock(&spare_lock);
+	}
+
+	return tx;
+}
+EXPORT_SYMBOL_GPL(async_pq_zero_sum);
+
+/**
+ * async_syndrome_zero_sum - attempt a P (xor) and Q (Reed-Solomon code)
+ *	parities check with a dma engine. This routine assumes a field of
+ *	GF(2^8) with a primitive polynomial of 0x11d and a generator of {02}.
+ * @blocks: array of source pages. The 0..src_cnt-1 are the sources, the
+ *	src_cnt and src_cnt+1 are the P and Q destinations to check, resp.
+ *	Only one of two destinations may be present.
+ *	NOTE: client code must assume the contents of this array are destroyed
+ * @offset: offset in pages to start transaction
+ * @src_cnt: number of source pages
+ * @len: length in bytes
+ * @presult: where to store the result of the P-check: 0 if P-parity is OK,
+ *	and non-zero otherwise.
+ * @qresult: where to store the result of the Q-check: 0 if Q-parity is OK,
+ *	and non-zero otherwise.
+ * @flags: ASYNC_TX_ASSUME_COHERENT, ASYNC_TX_ACK, ASYNC_TX_DEP_ACK
+ * @depend_tx: depends on the result of this transaction.
+ * @cb_fn: function to call when the xor completes
+ * @cb_param: parameter to pass to the callback routine
+ */
+struct dma_async_tx_descriptor *
+async_syndrome_zero_sum(struct page **blocks, unsigned int offset,
+	int src_cnt, size_t len, u32 *presult, u32 *qresult,
+	enum async_tx_flags flags, struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_param)
+{
+	struct dma_chan *chan = async_tx_find_channel(depend_tx,
+						      DMA_PQ_ZERO_SUM,
+						      &blocks[src_cnt], 2,
+						      blocks, src_cnt, len);
+	struct dma_device *device = chan ? chan->device : NULL;
+	struct dma_async_tx_descriptor *tx = NULL;
+
+	BUG_ON(src_cnt < 2);
+
+	if (device && src_cnt <= device->max_pq) {
+		dma_addr_t dma_src[src_cnt + 2];
+		enum dma_ctrl_flags dma_flags = cb_fn ? DMA_PREP_INTERRUPT : 0;
+		int i;
+
+		for (i = 0; i < src_cnt + 2; i++)
+			dma_src[i] = blocks[i] ? dma_map_page(device->dev,
+					blocks[i], offset, len,
+					DMA_TO_DEVICE) : 0;
+
+		tx = device->device_prep_dma_pqzero_sum(chan, dma_src, src_cnt,
+						      (uint8_t *)raid6_gfexp,
+						      len, presult, qresult,
+						      dma_flags);
+
+		if (unlikely(!tx)) {
+			async_tx_quiesce(&depend_tx);
+			while (unlikely(!tx)) {
+				dma_async_issue_pending(chan);
+				tx = device->device_prep_dma_pqzero_sum(chan,
+						dma_src, src_cnt,
+						(uint8_t *)raid6_gfexp, len,
+						presult, qresult,
+						dma_flags);
+			}
+		}
+
+		async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param);
+	} else {
+		struct page *pdest = blocks[src_cnt];
+		struct page *qdest = blocks[src_cnt + 1];
+		enum async_tx_flags lflags = flags;
+
+		lflags &= ~ASYNC_TX_ACK;
+
+		spin_lock(&spare_lock);
+		blocks[src_cnt] = spare_pages[0];
+		blocks[src_cnt + 1] = spare_pages[1];
+		tx = async_gen_syndrome(blocks, offset,
+					src_cnt, len, lflags,
+					depend_tx, NULL, NULL);
+		async_tx_quiesce(&tx);
+
+		if (presult && pdest)
+			*presult = memcmp(page_address(pdest) + offset,
+					  page_address(spare_pages[0]) +
+						   offset, len) == 0 ? 0 : 1;
+		if (qresult && qdest)
+			*qresult = memcmp(page_address(qdest) + offset,
+					  page_address(spare_pages[1]) +
+						   offset, len) == 0 ? 0 : 1;
+		spin_unlock(&spare_lock);
+	}
+
+	return tx;
+}
+EXPORT_SYMBOL_GPL(async_syndrome_zero_sum);
+
+static int __init async_pq_init(void)
+{
+	spin_lock_init(&spare_lock);
+
+	spare_pages[0] = alloc_page(GFP_KERNEL);
+	if (!spare_pages[0])
+		goto abort;
+	spare_pages[1] = alloc_page(GFP_KERNEL);
+	if (!spare_pages[1])
+		goto abort;
+	spare_pages[2] = alloc_page(GFP_KERNEL);
+	if (!spare_pages[2])
+		goto abort;
+	return 0;
+abort:
+	safe_put_page(spare_pages[2]);
+	safe_put_page(spare_pages[1]);
+	safe_put_page(spare_pages[0]);
+	printk(KERN_ERR "%s: cannot allocate spare!\n", __func__);
+	return -ENOMEM;
+}
+
+static void __exit async_pq_exit(void)
+{
+	safe_put_page(spare_pages[2]);
+	safe_put_page(spare_pages[1]);
+	safe_put_page(spare_pages[0]);
+}
+
+module_init(async_pq_init);
+module_exit(async_pq_exit);
+
+MODULE_AUTHOR("Yuri Tikhonov <yur@emcraft.com>");
+MODULE_DESCRIPTION("asynchronous pq/pq-zero-sum api");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h
index 0f50d4c..5d6b639 100644
--- a/include/linux/async_tx.h
+++ b/include/linux/async_tx.h
@@ -42,6 +42,12 @@ struct dma_chan_ref {
  * @ASYNC_TX_XOR_ZERO_DST: this flag must be used for xor operations where the
  * the destination address is not a source.  The asynchronous case handles this
  * implicitly, the synchronous case needs to zero the destination block.
+ * @ASYNC_TX_PQ_ZERO_P: set to zero the P destination before the operation;
+ * in async_pq the destination is always also a source, so without this
+ * flag the result is xor-ed with the previous content of the P block.
+ * @ASYNC_TX_PQ_ZERO_Q: set to zero the Q destination before the operation;
+ * in async_pq the destination is always also a source, so without this
+ * flag the result is xor-ed with the previous content of the Q block.
  * @ASYNC_TX_XOR_DROP_DST: this flag must be used if the destination address is
  * also one of the source addresses.  In the synchronous case the destination
  * address is an implied source, whereas the asynchronous case it must be listed
@@ -50,12 +56,17 @@ struct dma_chan_ref {
  * @ASYNC_TX_ACK: immediately ack the descriptor, precludes setting up a
  * dependency chain
  * @ASYNC_TX_DEP_ACK: ack the dependency descriptor.  Useful for chaining.
+ * @ASYNC_TX_ASYNC_ONLY: if set, attempt the requested operation only in
+ * the asynchronous mode.
  */
 enum async_tx_flags {
 	ASYNC_TX_XOR_ZERO_DST	 = (1 << 0),
-	ASYNC_TX_XOR_DROP_DST	 = (1 << 1),
-	ASYNC_TX_ACK		 = (1 << 3),
-	ASYNC_TX_DEP_ACK	 = (1 << 4),
+	ASYNC_TX_PQ_ZERO_P	 = (1 << 1),
+	ASYNC_TX_PQ_ZERO_Q	 = (1 << 2),
+	ASYNC_TX_XOR_DROP_DST	 = (1 << 3),
+	ASYNC_TX_ACK		 = (1 << 4),
+	ASYNC_TX_DEP_ACK	 = (1 << 5),
+	ASYNC_TX_ASYNC_ONLY	 = (1 << 6),
 };
 
 #ifdef CONFIG_DMA_ENGINE
@@ -146,5 +157,33 @@ async_trigger_callback(enum async_tx_flags flags,
 	struct dma_async_tx_descriptor *depend_tx,
 	dma_async_tx_callback cb_fn, void *cb_fn_param);
 
+struct dma_async_tx_descriptor *
+async_pqxor(struct page *pdest, struct page *qdest,
+	struct page **src_list, unsigned char *scoef_list,
+	unsigned int offset, int src_cnt, size_t len, enum async_tx_flags flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback callback, void *callback_param);
+
+struct dma_async_tx_descriptor *
+async_gen_syndrome(struct page *pdest, struct page *qdest,
+	struct page **src_list, unsigned int offset, int src_cnt, size_t len,
+	enum async_tx_flags flags, struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback callback, void *callback_param);
+
+struct dma_async_tx_descriptor *
+async_pqxor_zero_sum(struct page *pdest, struct page *qdest,
+	struct page **src_list, unsigned char *scoef_list,
+	unsigned int offset, int src_cnt, size_t len,
+	u32 *presult, u32 *qresult, enum async_tx_flags flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback callback, void *callback_param);
+
+struct dma_async_tx_descriptor *
+async_syndrome_zero_sum(struct page *pdest, struct page *qdest,
+	struct page **src_list, unsigned int offset, int src_cnt, size_t len,
+	u32 *presult, u32 *qresult, enum async_tx_flags flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback callback, void *callback_param);
+
 void async_tx_quiesce(struct dma_async_tx_descriptor **tx);
 #endif /* _ASYNC_TX_H_ */
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index adb0b08..84525c3 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -81,7 +81,7 @@ enum dma_status {
 enum dma_transaction_type {
 	DMA_MEMCPY,
 	DMA_XOR,
-	DMA_PQ_XOR,
+	DMA_PQ,
 	DMA_DUAL_XOR,
 	DMA_PQ_UPDATE,
 	DMA_ZERO_SUM,
@@ -123,6 +123,8 @@ enum dma_ctrl_flags {
 	DMA_CTRL_ACK = (1 << 1),
 	DMA_COMPL_SKIP_SRC_UNMAP = (1 << 2),
 	DMA_COMPL_SKIP_DEST_UNMAP = (1 << 3),
+	DMA_PREP_ZERO_P = (1 << 4),
+	DMA_PREP_ZERO_Q = (1 << 5),
 };
 
 /**
@@ -299,6 +301,7 @@ struct dma_async_tx_descriptor {
  * @global_node: list_head for global dma_device_list
  * @cap_mask: one or more dma_capability flags
  * @max_xor: maximum number of xor sources, 0 if no capability
+ * @max_pq: maximum number of PQ sources, 0 if no capability
  * @refcount: reference count
  * @done: IO completion struct
  * @dev_id: unique device ID
@@ -308,7 +311,9 @@ struct dma_async_tx_descriptor {
  * @device_free_chan_resources: release DMA channel's resources
  * @device_prep_dma_memcpy: prepares a memcpy operation
  * @device_prep_dma_xor: prepares a xor operation
+ * @device_prep_dma_pq: prepares a pq operation
  * @device_prep_dma_zero_sum: prepares a zero_sum operation
+ * @device_prep_dma_pqzero_sum: prepares a pqzero_sum operation
  * @device_prep_dma_memset: prepares a memset operation
  * @device_prep_dma_interrupt: prepares an end of chain interrupt operation
  * @device_prep_slave_sg: prepares a slave dma operation
@@ -322,6 +327,7 @@ struct dma_device {
 	struct list_head global_node;
 	dma_cap_mask_t  cap_mask;
 	int max_xor;
+	int max_pq;
 
 	struct kref refcount;
 	struct completion done;
@@ -339,9 +345,17 @@ struct dma_device {
 	struct dma_async_tx_descriptor *(*device_prep_dma_xor)(
 		struct dma_chan *chan, dma_addr_t dest, dma_addr_t *src,
 		unsigned int src_cnt, size_t len, unsigned long flags);
+	struct dma_async_tx_descriptor *(*device_prep_dma_pq)(
+		struct dma_chan *chan, dma_addr_t *dst, dma_addr_t *src,
+		unsigned int src_cnt, unsigned char *scf,
+		size_t len, unsigned long flags);
 	struct dma_async_tx_descriptor *(*device_prep_dma_zero_sum)(
 		struct dma_chan *chan, dma_addr_t *src,	unsigned int src_cnt,
 		size_t len, u32 *result, unsigned long flags);
+	struct dma_async_tx_descriptor *(*device_prep_dma_pqzero_sum)(
+		struct dma_chan *chan, dma_addr_t *src, unsigned int src_cnt,
+		unsigned char *scf,
+		size_t len, u32 *presult, u32 *qresult, unsigned long flags);
 	struct dma_async_tx_descriptor *(*device_prep_dma_memset)(
 		struct dma_chan *chan, dma_addr_t dest, int value, size_t len,
 		unsigned long flags);
-- 
1.5.6.1


* Re: [PATCH 02/11][v2] async_tx: add support for asynchronous GF multiplication
  2008-12-08 21:55 [PATCH 02/11][v2] async_tx: add support for asynchronous GF multiplication Yuri Tikhonov
@ 2008-12-17 18:34 ` Dan Williams
  2008-12-19  7:43   ` Re[2]: " Yuri Tikhonov
From: Dan Williams @ 2008-12-17 18:34 UTC (permalink / raw)
  To: Yuri Tikhonov; +Cc: linux-raid, linuxppc-dev, wd, dzu, yanok

Hi Yuri,

On Mon, Dec 8, 2008 at 2:55 PM, Yuri Tikhonov <yur@emcraft.com> wrote:
> This adds support for doing asynchronous GF multiplication by adding
> four additional functions to async_tx API:
>
>  async_pq() does simultaneous XOR of sources and XOR of sources
>  GF-multiplied by given coefficients.
>
>  async_pq_zero_sum() checks whether the results of the calculations
>  match the given ones.
>
>  async_gen_syndrome() does simultaneous XOR and Reed-Solomon syndrome
>  computation over the sources.
>
>  async_syndrome_zero_sum() checks whether the results of the XOR/syndrome
>  calculation match the given ones.
>
> The latter two functions just use async_pq() with the appropriate
> coefficients in the asynchronous case, but have significant optimizations
> in the synchronous case.
>

I like this separation of gen_syndrome and generic pq.

[..]
> +       /* DMAs use destinations as sources, so use BIDIRECTIONAL mapping */
> +       dma_dest[0] = !blocks[src_cnt] ? 0 :
> +                               dma_map_page(dma->dev, blocks[src_cnt],
> +                                            offset, len, DMA_BIDIRECTIONAL);

"0" could be a valid dma address on some architectures.
DMA_ERROR_CODE looks like the closest fit for what we are trying to do
here, but that only exists on sparc and powerpc.  We could add a
"dest_mask" parameter to device_prep_dma_pq where the mask is  1 =
p-only, 2 = q-only, and 3 = p and q.
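
One possible shape of that proposal (a sketch, not a final signature):

	struct dma_async_tx_descriptor *(*device_prep_dma_pq)(
		struct dma_chan *chan, dma_addr_t *dst, dma_addr_t *src,
		unsigned int src_cnt, unsigned char *scf, size_t len,
		unsigned char dest_mask, unsigned long flags);

	/* dest_mask: 1 = p-only, 2 = q-only, 3 = p and q */
	tx = dma->device_prep_dma_pq(chan, dma_dest, &dma_src[src_off],
				     pq_src_cnt, scf, len, 3, dma_flags);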

> +       dma_dest[1] = !blocks[src_cnt+1] ? 0 :
> +                               dma_map_page(dma->dev, blocks[src_cnt+1],
> +                                            offset, len, DMA_BIDIRECTIONAL);
> +
> +       for (i = 0; i < src_cnt; i++)
> +               dma_src[i] = dma_map_page(dma->dev, blocks[i],
> +                                         offset, len, DMA_TO_DEVICE);
> +
> +       while (src_cnt) {
> +               async_flags = flags;
> +               pq_src_cnt = min(src_cnt, dma->max_pq);
> +               /* if we are submitting additional pqs, leave the chain open,
> +                * clear the callback parameters, and leave the destination
> +                * buffers mapped
> +                */
> +               if (src_cnt > pq_src_cnt) {
> +                       async_flags &= ~ASYNC_TX_ACK;
> +                       dma_flags |= DMA_COMPL_SKIP_DEST_UNMAP;
> +                       _cb_fn = NULL;
> +                       _cb_param = NULL;
> +               } else {
> +                       _cb_fn = cb_fn;
> +                       _cb_param = cb_param;
> +               }
> +               if (_cb_fn)
> +                       dma_flags |= DMA_PREP_INTERRUPT;
> +
> +               /* Since we have clobbered the src_list we are committed
> +                * to doing this asynchronously.  Drivers force forward
> +                * progress in case they can not provide a descriptor
> +                */
> +               tx = dma->device_prep_dma_pq(chan, dma_dest,
> +                                            &dma_src[src_off], pq_src_cnt,
> +                                            scf_list ? &scf_list[src_off] :
> +                                                       NULL,
> +                                            len, dma_flags);

...one nit: for readability, can we replace these ternary conditionals
with proper if-else statements?  i.e.

                if (scf_list)
                        scf = &scf_list[src_off];
                else
                        scf = NULL;
                tx = dma->device_prep_dma_pq(chan, dma_dest,
                                             &dma_src[src_off], pq_src_cnt,
                                             scf, len, dma_flags);

> +               if (unlikely(!tx))
> +                       async_tx_quiesce(&depend_tx);
> +
> +               /* spin wait for the preceding transactions to complete */
> +               while (unlikely(!tx)) {
> +                       dma_async_issue_pending(chan);
> +                       tx = dma->device_prep_dma_pq(chan, dma_dest,
> +                                       &dma_src[src_off], pq_src_cnt,
> +                                       scf_list ? &scf_list[src_off] : NULL,
> +                                       len, dma_flags);
> +               }
> +
> +               async_tx_submit(chan, tx, async_flags, depend_tx,
> +                               _cb_fn, _cb_param);
> +
> +               depend_tx = tx;
> +               flags |= ASYNC_TX_DEP_ACK;
> +
> +               if (src_cnt > pq_src_cnt) {
> +                       /* drop completed sources */
> +                       src_cnt -= pq_src_cnt;
> +                       src_off += pq_src_cnt;
> +
> +                       /* use the intermediate result as a source; we
> +                        * clear DMA_PREP_ZERO, so prep_dma_pq will
> +                        * include destination(s) into calculations
> +                        */
> +                       dma_flags = 0;
> +               } else
> +                       break;
> +       }
> +
> +       return tx;
> +}
> +
> +/**
> + * do_sync_pq - synchronously calculate P and Q
> + */
> +static void
> +do_sync_pq(struct page **blocks, unsigned char *scf, unsigned int offset,
> +       int src_cnt, size_t len, enum async_tx_flags flags,
> +       struct dma_async_tx_descriptor *depend_tx,
> +       dma_async_tx_callback cb_fn, void *cb_param)
> +{
> +       int i, pos;
> +       uint8_t *p, *q, *src;
> +
> +       /* set destination addresses */
> +       p = blocks[src_cnt] ?
> +               (uint8_t *)(page_address(blocks[src_cnt]) + offset) :
> +               NULL;
> +       q = blocks[src_cnt+1] ?
> +               (uint8_t *)(page_address(blocks[src_cnt+1]) + offset) :
> +               NULL;
> +

...more ternary conditional to if-else conversion

> +       if (flags & ASYNC_TX_PQ_ZERO_P) {
> +               BUG_ON(!p);
> +               memset(p, 0, len);
> +       }
> +
> +       if (flags & ASYNC_TX_PQ_ZERO_Q) {
> +               BUG_ON(!q);
> +               memset(q, 0, len);
> +       }
> +
> +       for (i = 0; i < src_cnt; i++) {
> +               src = (uint8_t *)(page_address(blocks[i]) + offset);
> +               for (pos = 0; pos < len; pos++) {
> +                       if (p)
> +                               p[pos] ^= src[pos];
> +                       if (q)
> +                               q[pos] ^= raid6_gfmul[scf[i]][src[pos]];
> +               }
> +       }
> +       async_tx_sync_epilog(cb_fn, cb_param);
> +}
> +
> +/**
> + * async_pq - attempt to do XOR and Galois calculations in parallel using
> + *     a dma engine.
> + * @blocks: source block array from 0 to (src_cnt-1) with the p destination
> + *     at blocks[src_cnt] and q at blocks[src_cnt + 1]. Only one of two
> + *     destinations may be present (another then has to be set to NULL).
> + *     By default, the result of calculations is XOR-ed with the initial
> + *     content of the destinationa buffers. Use ASYNC_TX_PQ_ZERO_x flags
> + *     to avoid this.
> + *     NOTE: client code must assume the contents of this array are destroyed
> + * @scf: array of source coefficients used in GF-multiplication
> + * @offset: offset in pages to start transaction
> + * @src_cnt: number of source pages
> + * @len: length in bytes
> + * @flags: ASYNC_TX_PQ_ZERO_P, ASYNC_TX_PQ_ZERO_Q, ASYNC_TX_ASSUME_COHERENT,
> + *     ASYNC_TX_ACK, ASYNC_TX_DEP_ACK, ASYNC_TX_ASYNC_ONLY
> + * @depend_tx: depends on the result of this transaction.
> + * @cb_fn: function to call when the operation completes
> + * @cb_param: parameter to pass to the callback routine
> + */
> +struct dma_async_tx_descriptor *
> +async_pq(struct page **blocks, unsigned char *scf,
> +       unsigned int offset, int src_cnt, size_t len,
> +       enum async_tx_flags flags,
> +       struct dma_async_tx_descriptor *depend_tx,
> +       dma_async_tx_callback cb_fn, void *cb_param)
> +{
> +       struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_PQ,
> +                                       &blocks[src_cnt], 2,
> +                                       blocks, src_cnt, len);
> +       struct dma_device *device = chan ? chan->device : NULL;
> +       struct dma_async_tx_descriptor *tx = NULL;
> +
> +       if (!device && (flags & ASYNC_TX_ASYNC_ONLY))
> +               return NULL;
> +
> +       if (device) {
> +               /* run pq asynchronously */
> +               tx = do_async_pq(chan, blocks, scf, offset, src_cnt,
> +                       len, flags, depend_tx, cb_fn,cb_param);
> +       } else {
> +               /* run pq synchronously */
> +               if (!blocks[src_cnt+1]) {
> +                       struct page *pdst = blocks[src_cnt];
> +                       int i;
> +
> +                       /* Calculate P-parity only.
> +                        * In contrast to async_xor(), async_pq() assumes
> +                        * that destinations are included in the calculations,
> +                        * so we should re-arrange the xor src list to
> +                        * achieve similar behavior.
> +                        */
> +                       if (!(flags & ASYNC_TX_PQ_ZERO_P)) {
> +                               /* If the async_pq() user doesn't set the
> +                                * ZERO flag, the destination is assumed to
> +                                * hold meaningful data for the calculations.
> +                                * The destination must be at position 0, so
> +                                * shift the sources and put pdst at the
> +                                * beginning of the list.
> +                                */
> +                               for (i = src_cnt - 1; i >= 0; i--)
> +                                       blocks[i+1] = blocks[i];
> +                               blocks[0] = pdst;
> +                               src_cnt++;
> +                               flags |= ASYNC_TX_XOR_DROP_DST;
> +                       } else {
> +                               /* If the async_pq() user wants to clear P,
> +                                * this is done automatically in the async
> +                                * case, and via XOR_ZERO_DST in the sync
> +                                * one.
> +                                */
> +                               flags &= ~ASYNC_TX_PQ_ZERO_P;
> +                               flags |= ASYNC_TX_XOR_ZERO_DST;
> +                       }
> +
> +
> +                       return async_xor(pdst, blocks, offset,
> +                                        src_cnt, len, flags, depend_tx,
> +                                        cb_fn, cb_param);
> +               }
> +
> +               /* wait for any prerequisite operations */
> +               async_tx_quiesce(&depend_tx);
> +
> +               do_sync_pq(blocks, scf, offset, src_cnt, len, flags,
> +                       depend_tx, cb_fn, cb_param);
> +       }
> +
> +       return tx;
> +}
> +EXPORT_SYMBOL_GPL(async_pq);
> +
> +/**
> + * do_sync_gen_syndrome - synchronously calculate P (xor) and Q (Reed-Solomon
> + *     code)
> + */
> +static void
> +do_sync_gen_syndrome(struct page **blocks, unsigned int offset,
> +       int src_cnt, size_t len, enum async_tx_flags flags,
> +       struct dma_async_tx_descriptor *depend_tx,
> +       dma_async_tx_callback cb_fn, void *cb_param)
> +{
> +       int i;
> +       void *tsrc[src_cnt+2];
> +
> +       for (i = 0; i < src_cnt + 2; i++)
> +               tsrc[i] = page_address(blocks[i]) + offset;
> +
> +       raid6_call.gen_syndrome(i, len, tsrc);
> +
> +       async_tx_sync_epilog(cb_fn, cb_param);
> +}
> +
> +/**
> + * async_gen_syndrome - attempt to generate P (xor) and Q (Reed-Solomon code)
> + *     with a dma engine for a given set of blocks.  This routine assumes a
> + *     field of GF(2^8) with a primitive polynomial of 0x11d and a generator
> + *     of {02}.
> + * @blocks: source block array ordered from 0..src_cnt-1 with the P destination
> + *     at blocks[src_cnt] and Q at blocks[src_cnt + 1]. Only one of two
> + *     destinations may be present (the other must then be set to NULL).
> + *     NOTE: client code must assume the contents of this array are destroyed
> + * @offset: offset in pages to start transaction
> + * @src_cnt: number of source pages: 2 < src_cnt <= 255
> + * @len: length of blocks in bytes
> + * @flags: ASYNC_TX_ACK, ASYNC_TX_DEP_ACK, ASYNC_TX_ASYNC_ONLY
> + * @depend_tx: P+Q operation depends on the result of this transaction.
> + * @cb_fn: function to call when P+Q generation completes
> + * @cb_param: parameter to pass to the callback routine
> + */
> +struct dma_async_tx_descriptor *
> +async_gen_syndrome(struct page **blocks, unsigned int offset, int src_cnt,
> +       size_t len, enum async_tx_flags flags,
> +       struct dma_async_tx_descriptor *depend_tx,
> +       dma_async_tx_callback cb_fn, void *cb_param)
> +{
> +       struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_PQ,
> +                                                    &blocks[src_cnt], 2,
> +                                                    blocks, src_cnt, len);
> +       struct dma_device *device = chan ? chan->device : NULL;
> +       struct dma_async_tx_descriptor *tx = NULL;
> +
> +       BUG_ON(src_cnt > 255 || (!blocks[src_cnt] && !blocks[src_cnt+1]));
> +
> +       if (!device && (flags & ASYNC_TX_ASYNC_ONLY))
> +               return NULL;
> +
> +       /* The synchronous gen_syndrome() ignores the destinations' current
> +        * contents, but the asynchronous one treats them as sources; so,
> +        * when generating syndromes, request clearing them explicitly
> +        */
> +       if (blocks[src_cnt])
> +               flags |= ASYNC_TX_PQ_ZERO_P;
> +       if (blocks[src_cnt+1])
> +               flags |= ASYNC_TX_PQ_ZERO_Q;
> +
> +       if (device) {
> +               /* run the xor asynchronously */
> +               tx = do_async_pq(chan, blocks, (uint8_t *)raid6_gfexp,
> +                                offset, src_cnt, len, flags, depend_tx,
> +                                cb_fn, cb_param);
> +       } else {
> +               /* run the pq synchronously */
> +               /* wait for any prerequisite operations */
> +               async_tx_quiesce(&depend_tx);
> +
> +               if (!blocks[src_cnt])
> +                       blocks[src_cnt] = spare_pages[2];
> +               if (!blocks[src_cnt+1])
> +                       blocks[src_cnt+1] = spare_pages[2];
> +               do_sync_gen_syndrome(blocks, offset, src_cnt, len, flags,
> +                                    depend_tx, cb_fn, cb_param);
> +       }
> +
> +       return tx;
> +}
> +EXPORT_SYMBOL_GPL(async_gen_syndrome);
> +
> +/**
> + * async_pq_zero_sum - attempt a PQ parities check with a dma engine.
> + * @blocks: array of source pages. The 0..src_cnt-1 are the sources, the
> + *     src_cnt and src_cnt+1 are the P and Q destinations to check, resp.
> + *     Only one of two destinations may be present.
> + *     NOTE: client code must assume the contents of this array are destroyed
> + * @scf: coefficients to use in GF-multiplications
> + * @offset: offset in pages to start transaction
> + * @src_cnt: number of source pages
> + * @len: length in bytes
> + * @presult: where to store the result of the P-check, which is 0 if the
> + *     P-parity is OK, and non-zero otherwise.
> + * @qresult: where to store the result of the Q-check, which is 0 if the
> + *     Q-parity is OK, and non-zero otherwise.
> + * @flags: ASYNC_TX_ASSUME_COHERENT, ASYNC_TX_ACK, ASYNC_TX_DEP_ACK
> + * @depend_tx: depends on the result of this transaction.
> + * @cb_fn: function to call when the xor completes
> + * @cb_param: parameter to pass to the callback routine
> + */
> +struct dma_async_tx_descriptor *
> +async_pq_zero_sum(struct page **blocks, unsigned char *scf,
> +       unsigned int offset, int src_cnt, size_t len,
> +       u32 *presult, u32 *qresult, enum async_tx_flags flags,
> +       struct dma_async_tx_descriptor *depend_tx,
> +       dma_async_tx_callback cb_fn, void *cb_param)
> +{
> +       struct dma_chan *chan = async_tx_find_channel(depend_tx,
> +                                                     DMA_PQ_ZERO_SUM,
> +                                                     &blocks[src_cnt], 2,
> +                                                     blocks, src_cnt, len);
> +       struct dma_device *device = chan ? chan->device : NULL;
> +       struct dma_async_tx_descriptor *tx = NULL;
> +
> +       BUG_ON(src_cnt < 2);
> +
> +       if (device && src_cnt <= device->max_pq) {
> +               dma_addr_t dma_src[src_cnt + 2];
> +               enum dma_ctrl_flags dma_flags = cb_fn ? DMA_PREP_INTERRUPT : 0;
> +               int i;
> +
> +               for (i = 0; i < src_cnt + 2; i++)
> +                       dma_src[i] = blocks[i] ? dma_map_page(device->dev,
> +                                       blocks[i], offset, len,
> +                                       DMA_TO_DEVICE) : 0;

If we go with the "dest_mask" approach to specifying p and q then we
need to separate them into their own parameter here... although in
this case it would be a "src_mask" to select p or q.
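
i.e. something along these lines (illustrative only, with src_mask
encoded as bit 0 = p present, bit 1 = q present):

	tx = device->device_prep_dma_pqzero_sum(chan, dma_src, src_cnt,
						scf, len, src_mask,
						presult, qresult,
						dma_flags);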

> +
> +               tx = device->device_prep_dma_pqzero_sum(chan, dma_src, src_cnt,
> +                                                     scf, len,
> +                                                     presult, qresult,
> +                                                     dma_flags);
> +
> +               if (unlikely(!tx)) {
> +                       async_tx_quiesce(&depend_tx);
> +
> +                       while (unlikely(!tx)) {
> +                               dma_async_issue_pending(chan);
> +                               tx = device->device_prep_dma_pqzero_sum(chan,
> +                                               dma_src, src_cnt, scf, len,
> +                                               presult, qresult,
> +                                               dma_flags);
> +                       }
> +               }
> +
> +               async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param);
> +       } else {
> +               struct page *pdest = blocks[src_cnt];
> +               struct page *qdest = blocks[src_cnt + 1];
> +               enum async_tx_flags lflags = flags;
> +
> +               lflags &= ~ASYNC_TX_ACK;
> +               lflags |= ASYNC_TX_PQ_ZERO_P | ASYNC_TX_PQ_ZERO_Q;
> +
> +               spin_lock(&spare_lock);
> +               blocks[src_cnt] = spare_pages[0];
> +               blocks[src_cnt + 1] = spare_pages[1];
> +               tx = async_pq(blocks, scf, offset, src_cnt, len, lflags,
> +                             depend_tx, NULL, NULL);
> +
> +               async_tx_quiesce(&tx);
> +
> +               if (presult && pdest)
> +                       *presult = memcmp(page_address(pdest) + offset,
> +                                         page_address(spare_pages[0]) +
> +                                                  offset, len) == 0 ? 0 : 1;
> +               if (qresult && qdest)
> +                       *qresult = memcmp(page_address(qdest) + offset,
> +                                         page_address(spare_pages[1]) +
> +                                                  offset, len) == 0 ? 0 : 1;
> +               spin_unlock(&spare_lock);
> +       }
> +
> +       return tx;
> +}
> +EXPORT_SYMBOL_GPL(async_pq_zero_sum);
> +
> +/**
> + * async_syndrome_zero_sum - attempt a P (xor) and Q (Reed-Solomon code)
> + *     parities check with a dma engine. This routine assumes a field of
> + *     GF(2^8) with a primitive polynomial of 0x11d and a generator of {02}.
> + * @blocks: array of source pages. The 0..src_cnt-1 are the sources, the
> + *     src_cnt and src_cnt+1 are the P and Q destinations to check, resp.
> + *     Only one of two destinations may be present.
> + *     NOTE: client code must assume the contents of this array are destroyed
> + * @offset: offset in pages to start transaction
> + * @src_cnt: number of source pages
> + * @len: length in bytes
> + * @presult: where to store the result of the P-check: 0 if P-parity is OK,
> + *     and non-zero otherwise.
> + * @qresult: where to store the result of the Q-check: 0 if Q-parity is OK,
> + *     and non-zero otherwise.
> + * @flags: ASYNC_TX_ASSUME_COHERENT, ASYNC_TX_ACK, ASYNC_TX_DEP_ACK
> + * @depend_tx: depends on the result of this transaction.
> + * @cb_fn: function to call when the xor completes
> + * @cb_param: parameter to pass to the callback routine
> + */
> +struct dma_async_tx_descriptor *
> +async_syndrome_zero_sum(struct page **blocks, unsigned int offset,
> +       int src_cnt, size_t len, u32 *presult, u32 *qresult,
> +       enum async_tx_flags flags, struct dma_async_tx_descriptor *depend_tx,
> +       dma_async_tx_callback cb_fn, void *cb_param)
> +{
> +       struct dma_chan *chan = async_tx_find_channel(depend_tx,
> +                                                     DMA_PQ_ZERO_SUM,
> +                                                     &blocks[src_cnt], 2,
> +                                                     blocks, src_cnt, len);
> +       struct dma_device *device = chan ? chan->device : NULL;
> +       struct dma_async_tx_descriptor *tx = NULL;
> +
> +       BUG_ON(src_cnt < 2);
> +
> +       if (device && src_cnt <= device->max_pq) {
> +               dma_addr_t dma_src[src_cnt + 2];
> +               enum dma_ctrl_flags dma_flags = cb_fn ? DMA_PREP_INTERRUPT : 0;
> +               int i;
> +
> +               for (i = 0; i < src_cnt + 2; i++)
> +                       dma_src[i] = blocks[i] ? dma_map_page(device->dev,
> +                                       blocks[i], offset, len,
> +                                       DMA_TO_DEVICE) : 0;
> +
> +               tx = device->device_prep_dma_pqzero_sum(chan, dma_src, src_cnt,
> +                                                     (uint8_t *)raid6_gfexp,
> +                                                     len, presult, qresult,
> +                                                     dma_flags);
> +
> +               if (unlikely(!tx)) {
> +                       async_tx_quiesce(&depend_tx);
> +                       while (unlikely(!tx)) {
> +                               dma_async_issue_pending(chan);
> +                               tx = device->device_prep_dma_pqzero_sum(chan,
> +                                               dma_src, src_cnt,
> +                                               (uint8_t *)raid6_gfexp, len,
> +                                               presult, qresult,
> +                                               dma_flags);
> +                       }
> +               }
> +
> +               async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param);
> +       } else {
> +               struct page *pdest = blocks[src_cnt];
> +               struct page *qdest = blocks[src_cnt + 1];
> +               enum async_tx_flags lflags = flags;
> +
> +               lflags &= ~ASYNC_TX_ACK;
> +
> +               spin_lock(&spare_lock);
> +               blocks[src_cnt] = spare_pages[0];
> +               blocks[src_cnt + 1] = spare_pages[1];
> +               tx = async_gen_syndrome(blocks, offset,
> +                                       src_cnt, len, lflags,
> +                                       depend_tx, NULL, NULL);
> +               async_tx_quiesce(&tx);
> +
> +               if (presult && pdest)
> +                       *presult = memcmp(page_address(pdest) + offset,
> +                                         page_address(spare_pages[0]) +
> +                                                  offset, len) == 0 ? 0 : 1;
> +               if (qresult && qdest)
> +                       *qresult = memcmp(page_address(qdest) + offset,
> +                                         page_address(spare_pages[1]) +
> +                                                  offset, len) == 0 ? 0 : 1;
> +               spin_unlock(&spare_lock);
> +       }
> +
> +       return tx;
> +}
> +EXPORT_SYMBOL_GPL(async_syndrome_zero_sum);
> +
> +static int __init async_pq_init(void)
> +{
> +       spin_lock_init(&spare_lock);
> +
> +       spare_pages[0] = alloc_page(GFP_KERNEL);
> +       if (!spare_pages[0])
> +               goto abort;
> +       spare_pages[1] = alloc_page(GFP_KERNEL);
> +       if (!spare_pages[1])
> +               goto abort;
> +       spare_pages[2] = alloc_page(GFP_KERNEL);
> +       if (!spare_pages[2])
> +               goto abort;
> +       return 0;
> +abort:
> +       safe_put_page(spare_pages[2]);
> +       safe_put_page(spare_pages[1]);
> +       safe_put_page(spare_pages[0]);
> +       printk(KERN_ERR "%s: cannot allocate spare!\n", __func__);
> +       return -ENOMEM;
> +}
> +
> +static void __exit async_pq_exit(void)
> +{
> +       safe_put_page(spare_pages[2]);
> +       safe_put_page(spare_pages[1]);
> +       safe_put_page(spare_pages[0]);
> +}
> +
> +module_init(async_pq_init);
> +module_exit(async_pq_exit);
> +
> +MODULE_AUTHOR("Yuri Tikhonov <yur@emcraft.com>");
> +MODULE_DESCRIPTION("asynchronous pq/pq-zero-sum api");
> +MODULE_LICENSE("GPL");
> diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h
> index 0f50d4c..5d6b639 100644
> --- a/include/linux/async_tx.h
> +++ b/include/linux/async_tx.h
> @@ -42,6 +42,12 @@ struct dma_chan_ref {
>  * @ASYNC_TX_XOR_ZERO_DST: this flag must be used for xor operations where the
>  * the destination address is not a source.  The asynchronous case handles this
>  * implicitly, the synchronous case needs to zero the destination block.
> + * @ASYNC_TX_PQ_ZERO_P: set to zero the P destination before the operation;
> + * in async_pq the destination is always also a source, so without this
> + * flag the result is xor-ed with the previous content of the P block.
> + * @ASYNC_TX_PQ_ZERO_Q: set to zero the Q destination before the operation;
> + * in async_pq the destination is always also a source, so without this
> + * flag the result is xor-ed with the previous content of the Q block.
>  * @ASYNC_TX_XOR_DROP_DST: this flag must be used if the destination address is
>  * also one of the source addresses.  In the synchronous case the destination
>  * address is an implied source, whereas the asynchronous case it must be listed
> @@ -50,12 +56,17 @@ struct dma_chan_ref {
>  * @ASYNC_TX_ACK: immediately ack the descriptor, precludes setting up a
>  * dependency chain
>  * @ASYNC_TX_DEP_ACK: ack the dependency descriptor.  Useful for chaining.
> + * @ASYNC_TX_ASYNC_ONLY: if set, attempt the requested operation only in
> + * the asynchronous mode.
>  */
>  enum async_tx_flags {
>        ASYNC_TX_XOR_ZERO_DST    = (1 << 0),
> -       ASYNC_TX_XOR_DROP_DST    = (1 << 1),
> -       ASYNC_TX_ACK             = (1 << 3),
> -       ASYNC_TX_DEP_ACK         = (1 << 4),
> +       ASYNC_TX_PQ_ZERO_P       = (1 << 1),
> +       ASYNC_TX_PQ_ZERO_Q       = (1 << 2),
> +       ASYNC_TX_XOR_DROP_DST    = (1 << 3),
> +       ASYNC_TX_ACK             = (1 << 4),
> +       ASYNC_TX_DEP_ACK         = (1 << 5),
> +       ASYNC_TX_ASYNC_ONLY      = (1 << 6),
>  };
>
>  #ifdef CONFIG_DMA_ENGINE
> @@ -146,5 +157,33 @@ async_trigger_callback(enum async_tx_flags flags,
>        struct dma_async_tx_descriptor *depend_tx,
>        dma_async_tx_callback cb_fn, void *cb_fn_param);
>
> +struct dma_async_tx_descriptor *
> +async_pqxor(struct page *pdest, struct page *qdest,
> +       struct page **src_list, unsigned char *scoef_list,
> +       unsigned int offset, int src_cnt, size_t len, enum async_tx_flags flags,
> +       struct dma_async_tx_descriptor *depend_tx,
> +       dma_async_tx_callback callback, void *callback_param);
> +

...forgot to update the declaration.

In this case async_pq() can be declared static since nothing outside
of async_pq.c calls it.

> +struct dma_async_tx_descriptor *
> +async_gen_syndrome(struct page *pdest, struct page *qdest,
> +       struct page **src_list, unsigned int offset, int src_cnt, size_t len,
> +       enum async_tx_flags flags, struct dma_async_tx_descriptor *depend_tx,
> +       dma_async_tx_callback callback, void *callback_param);
> +

...forgot to update the declaration.

> +struct dma_async_tx_descriptor *
> +async_pqxor_zero_sum(struct page *pdest, struct page *qdest,
> +       struct page **src_list, unsigned char *scoef_list,
> +       unsigned int offset, int src_cnt, size_t len,
> +       u32 *presult, u32 *qresult, enum async_tx_flags flags,
> +       struct dma_async_tx_descriptor *depend_tx,
> +       dma_async_tx_callback callback, void *callback_param);
> +

...ditto

> +struct dma_async_tx_descriptor *
> +async_syndrome_zero_sum(struct page *pdest, struct page *qdest,
> +       struct page **src_list, unsigned int offset, int src_cnt, size_t len,
> +       u32 *presult, u32 *qresult, enum async_tx_flags flags,
> +       struct dma_async_tx_descriptor *depend_tx,
> +       dma_async_tx_callback callback, void *callback_param);
> +

...ditto again.

>  void async_tx_quiesce(struct dma_async_tx_descriptor **tx);
>  #endif /* _ASYNC_TX_H_ */
> diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
> index adb0b08..84525c3 100644
> --- a/include/linux/dmaengine.h
> +++ b/include/linux/dmaengine.h
> @@ -81,7 +81,7 @@ enum dma_status {
>  enum dma_transaction_type {
>        DMA_MEMCPY,
>        DMA_XOR,
> -       DMA_PQ_XOR,
> +       DMA_PQ,
>        DMA_DUAL_XOR,
>        DMA_PQ_UPDATE,
>        DMA_ZERO_SUM,
> @@ -123,6 +123,8 @@ enum dma_ctrl_flags {
>        DMA_CTRL_ACK = (1 << 1),
>        DMA_COMPL_SKIP_SRC_UNMAP = (1 << 2),
>        DMA_COMPL_SKIP_DEST_UNMAP = (1 << 3),
> +       DMA_PREP_ZERO_P = (1 << 4),
> +       DMA_PREP_ZERO_Q = (1 << 5),
>  };

I would rather not add operation-type-specific flags to
dma_ctrl_flags.  In this case can we set up a dependency chain with
async_memset()?
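
[ For illustration, a minimal sketch of such a dependency chain, assuming
the existing async_memset() helper and the async_pq() parameter order
implied by the kernel-doc quoted later in this thread; p_dest, q_dest,
blocks and scf are placeholder variables, not names from the patch: ]

	/* clear P, then Q, then run pq as a dependent descriptor */
	tx = async_memset(p_dest, 0, offset, len, 0, depend_tx, NULL, NULL);
	tx = async_memset(q_dest, 0, offset, len, ASYNC_TX_DEP_ACK, tx,
			  NULL, NULL);
	tx = async_pq(blocks, scf, offset, src_cnt, len,
		      flags | ASYNC_TX_DEP_ACK, tx, cb_fn, cb_param);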

>
>  /**
> @@ -299,6 +301,7 @@ struct dma_async_tx_descriptor {
>  * @global_node: list_head for global dma_device_list
>  * @cap_mask: one or more dma_capability flags
>  * @max_xor: maximum number of xor sources, 0 if no capability
> + * @max_pq: maximum number of PQ sources, 0 if no capability
>  * @refcount: reference count
>  * @done: IO completion struct
>  * @dev_id: unique device ID
> @@ -308,7 +311,9 @@ struct dma_async_tx_descriptor {
>  * @device_free_chan_resources: release DMA channel's resources
>  * @device_prep_dma_memcpy: prepares a memcpy operation
>  * @device_prep_dma_xor: prepares a xor operation
> + * @device_prep_dma_pq: prepares a pq operation
>  * @device_prep_dma_zero_sum: prepares a zero_sum operation
> + * @device_prep_dma_pqzero_sum: prepares a pqzero_sum operation
>  * @device_prep_dma_memset: prepares a memset operation
>  * @device_prep_dma_interrupt: prepares an end of chain interrupt operation
>  * @device_prep_slave_sg: prepares a slave dma operation
> @@ -322,6 +327,7 @@ struct dma_device {
>        struct list_head global_node;
>        dma_cap_mask_t  cap_mask;
>        int max_xor;
> +       int max_pq;
>

max_xor and max_pq can be changed to unsigned shorts to keep the size
of the struct the same.

>        struct kref refcount;
>        struct completion done;
> @@ -339,9 +345,17 @@ struct dma_device {
>        struct dma_async_tx_descriptor *(*device_prep_dma_xor)(
>                struct dma_chan *chan, dma_addr_t dest, dma_addr_t *src,
>                unsigned int src_cnt, size_t len, unsigned long flags);
> +       struct dma_async_tx_descriptor *(*device_prep_dma_pq)(
> +               struct dma_chan *chan, dma_addr_t *dst, dma_addr_t *src,
> +               unsigned int src_cnt, unsigned char *scf,
> +               size_t len, unsigned long flags);
>        struct dma_async_tx_descriptor *(*device_prep_dma_zero_sum)(
>                struct dma_chan *chan, dma_addr_t *src, unsigned int src_cnt,
>                size_t len, u32 *result, unsigned long flags);
> +       struct dma_async_tx_descriptor *(*device_prep_dma_pqzero_sum)(
> +               struct dma_chan *chan, dma_addr_t *src, unsigned int src_cnt,
> +               unsigned char *scf,
> +               size_t len, u32 *presult, u32 *qresult, unsigned long flags);

I would rather we turn the 'result' parameter into a pointer to flags
where bit 0 is the xor/p result and bit 1 is the q result.
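
[ For illustration, a minimal sketch of that encoding; the names
pq_check_flags, PQ_CHECK_*_FAIL and pqres are made up here, not taken
from the patch: ]

	enum pq_check_flags {
		PQ_CHECK_P_FAIL = (1 << 0),	/* xor/P parity mismatched */
		PQ_CHECK_Q_FAIL = (1 << 1),	/* Q parity mismatched */
	};

	/* completion path of a pqzero_sum descriptor: one result word */
	if (p_mismatch)
		*pqres |= PQ_CHECK_P_FAIL;
	if (q_mismatch)
		*pqres |= PQ_CHECK_Q_FAIL;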

Thanks,
Dan


* Re[2]: [PATCH 02/11][v2] async_tx: add support for asynchronous GF multiplication
  2008-12-17 18:34 ` Dan Williams
@ 2008-12-19  7:43   ` Yuri Tikhonov
  0 siblings, 0 replies; 3+ messages in thread
From: Yuri Tikhonov @ 2008-12-19  7:43 UTC (permalink / raw)
  To: Dan Williams; +Cc: linux-raid, linuxppc-dev, wd, dzu, yanok

Hello Dan,

On Wednesday, December 17, 2008 you wrote:

 [..]

>> +       /* DMAs use destinations as sources, so use BIDIRECTIONAL mapping */
>> +       dma_dest[0] = !blocks[src_cnt] ? 0 :
>> +                               dma_map_page(dma->dev, blocks[src_cnt],
>> +                                            offset, len, DMA_BIDIRECTIONAL);

> "0" could be a valid dma address on some architectures.
> DMA_ERROR_CODE looks like the closest fit for what we are trying to do
> here, but that only exists on sparc and powerpc.  We could add a
> "dest_mask" parameter to device_prep_dma_pq where the mask is  1 =3D
> p-only, 2 =3D q-only, and 3 =3D p and q.

 Understood. We can simply introduce new DMA_xxx flags and pass them
along with the other ones given to device_prep_dma_pq() to the ADMA
driver, instead of introducing a new "dest_mask" parameter. Though I
guess you meant exactly the same thing.

>> +       dma_dest[1] = !blocks[src_cnt+1] ? 0 :
>> +                               dma_map_page(dma->dev, blocks[src_cnt+1],
>> +                                            offset, len, DMA_BIDIRECTIONAL);
>> +
>> +       for (i = 0; i < src_cnt; i++)
>> +               dma_src[i] = dma_map_page(dma->dev, blocks[i],
>> +                                         offset, len, DMA_TO_DEVICE);
>> +
>> +       while (src_cnt) {
>> +               async_flags = flags;
>> +               pq_src_cnt = min(src_cnt, dma->max_pq);
>> +               /* if we are submitting additional pqs, leave the chain open,
>> +                * clear the callback parameters, and leave the destination
>> +                * buffers mapped
>> +                */
>> +               if (src_cnt > pq_src_cnt) {
>> +                       async_flags &= ~ASYNC_TX_ACK;
>> +                       dma_flags |= DMA_COMPL_SKIP_DEST_UNMAP;
>> +                       _cb_fn = NULL;
>> +                       _cb_param = NULL;
>> +               } else {
>> +                       _cb_fn = cb_fn;
>> +                       _cb_param = cb_param;
>> +               }
>> +               if (_cb_fn)
>> +                       dma_flags |= DMA_PREP_INTERRUPT;
>> +
>> +               /* Since we have clobbered the src_list we are committed
>> +                * to doing this asynchronously.  Drivers force forward
>> +                * progress in case they can not provide a descriptor
>> +                */
>> +               tx = dma->device_prep_dma_pq(chan, dma_dest,
>> +                                            &dma_src[src_off], pq_src_cnt,
>> +                                            scf_list ? &scf_list[src_off] :
>> +                                                       NULL,
>> +                                            len, dma_flags);

> ...one nit for readability: can we replace these ternary conditionals
> with proper if-else statements?  i.e.

>                 if (scf_list)
>                         scf = &scf_list[src_off];
>                 else
>                         scf = NULL;
>                 tx = dma->device_prep_dma_pq(chan, dma_dest,
>                                              &dma_src[src_off], pq_src_cnt,
>                                              scf, len, dma_flags);

 Thanks for pointing this out. Sure. Furthermore, it's also a question
of performance: e.g. in do_async_pq() we evaluate this "? :" inside a
loop, although there is absolutely no reason to think the result changes.
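
[ For illustration, a simplified sketch of hoisting that test so the
coefficient pointer just advances in step with the loop; the bookkeeping
variables and the callback handling are assumed from the surrounding
do_async_pq() context and elided here: ]

	unsigned char *scf = scf_list;	/* NULL stays NULL across iterations */
	int src_off = 0;

	while (src_cnt) {
		pq_src_cnt = min(src_cnt, dma->max_pq);

		tx = dma->device_prep_dma_pq(chan, dma_dest,
					     &dma_src[src_off], pq_src_cnt,
					     scf, len, dma_flags);

		/* keep scf pointing at &scf_list[src_off] without retesting */
		if (scf)
			scf += pq_src_cnt;
		src_off += pq_src_cnt;
		src_cnt -= pq_src_cnt;
	}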

 [..]

>> +/**
>> + * async_pq_zero_sum - attempt a PQ parity check with a dma engine.
>> + * @blocks: array of source pages. The 0..src_cnt-1 are the sources, the
>> + *     src_cnt and src_cnt+1 are the P and Q destinations to check, resp.
>> + *     Only one of the two destinations may be present.
>> + *     NOTE: client code must assume the contents of this array are destroyed
>> + * @scf: coefficients to use in GF-multiplications
>> + * @offset: offset in pages to start transaction
>> + * @src_cnt: number of source pages
>> + * @len: length in bytes
>> + * @presult: where to store the result of the P-check, which is 0 if the
>> + *     P parity is OK, and non-zero otherwise.
>> + * @qresult: where to store the result of the Q-check, which is 0 if the
>> + *     Q parity is OK, and non-zero otherwise.
>> + * @flags: ASYNC_TX_ASSUME_COHERENT, ASYNC_TX_ACK, ASYNC_TX_DEP_ACK
>> + * @depend_tx: depends on the result of this transaction.
>> + * @cb_fn: function to call when the xor completes
>> + * @cb_param: parameter to pass to the callback routine
>> + */
>> +struct dma_async_tx_descriptor *
>> +async_pq_zero_sum(struct page **blocks, unsigned char *scf,
>> +       unsigned int offset, int src_cnt, size_t len,
>> +       u32 *presult, u32 *qresult, enum async_tx_flags flags,
>> +       struct dma_async_tx_descriptor *depend_tx,
>> +       dma_async_tx_callback cb_fn, void *cb_param)
>> +{
>> +       struct dma_chan *chan = async_tx_find_channel(depend_tx,
>> +                                                     DMA_PQ_ZERO_SUM,
>> +                                                     &blocks[src_cnt], 2,
>> +                                                     blocks, src_cnt, len);
>> +       struct dma_device *device = chan ? chan->device : NULL;
>> +       struct dma_async_tx_descriptor *tx = NULL;
>> +
>> +       BUG_ON(src_cnt < 2);
>> +
>> +       if (device && src_cnt <= device->max_pq) {
>> +               dma_addr_t dma_src[src_cnt + 2];
>> +               enum dma_ctrl_flags dma_flags = cb_fn ? DMA_PREP_INTERRUPT : 0;
>> +               int i;
>> +
>> +               for (i = 0; i < src_cnt + 2; i++)
>> +                       dma_src[i] = blocks[i] ? dma_map_page(device->dev,
>> +                                       blocks[i], offset, len,
>> +                                       DMA_TO_DEVICE) : 0;

> If we go with the "dest_mask" approach to specifying p and q then we
> need to separate them into their own parameter here... although in
> this case it would be a "src_mask" to select p or q.

 We wouldn't need to do this if we enhanced 'enum dma_ctrl_flags' with,
say, DMA_PREP_P_PRESENT and DMA_PREP_Q_PRESENT. An ADMA driver which
supports device_prep_dma_pqzero_sum() would then use (or skip) the
first dma_src entries (which are the destinations) depending on the
dma_flags set.
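
[ For illustration, a rough sketch of that variant; DMA_PREP_P_PRESENT
and DMA_PREP_Q_PRESENT are the proposed, not yet existing, bits: ]

	enum dma_ctrl_flags dma_flags = cb_fn ? DMA_PREP_INTERRUPT : 0;

	if (blocks[src_cnt])		/* caller supplied a P block */
		dma_flags |= DMA_PREP_P_PRESENT;
	if (blocks[src_cnt + 1])	/* caller supplied a Q block */
		dma_flags |= DMA_PREP_Q_PRESENT;

	/* the driver now knows which of the leading dma_src entries hold
	 * valid P/Q addresses, with no need for a magic "0" address
	 */
	tx = device->device_prep_dma_pqzero_sum(chan, dma_src, src_cnt,
						scf, len, presult, qresult,
						dma_flags);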

 [..]

>> diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h
>> index 0f50d4c..5d6b639 100644
>> --- a/include/linux/async_tx.h
>> +++ b/include/linux/async_tx.h
>> @@ -42,6 +42,12 @@ struct dma_chan_ref {
>>  * @ASYNC_TX_XOR_ZERO_DST: this flag must be used for xor operations where
>>  * the destination address is not a source.  The asynchronous case handles this
>>  * implicitly, the synchronous case needs to zero the destination block.
>> + * @ASYNC_TX_PQ_ZERO_P: this flag must be used for async_pq operations since the
>> + * destination there is always the source (the result of P after async_pq is
>> + * xor-ed with the previous content of P block if this flag isn't set).
>> + * @ASYNC_TX_PQ_ZERO_Q: this flag must be used for async_pq operations since the
>> + * destination there is always the source (the result of Q after async_pq is
>> + * xor-ed with the previous content of Q block if this flag isn't set).
>>  * @ASYNC_TX_XOR_DROP_DST: this flag must be used if the destination address is
>>  * also one of the source addresses.  In the synchronous case the destination
>>  * address is an implied source, whereas in the asynchronous case it must be listed
>> @@ -50,12 +56,17 @@ struct dma_chan_ref {
>>  * @ASYNC_TX_ACK: immediately ack the descriptor, precludes setting up a
>>  * dependency chain
>>  * @ASYNC_TX_DEP_ACK: ack the dependency descriptor.  Useful for chaining.
>> + * @ASYNC_TX_ASYNC_ONLY: if set then try to perform the requested operation
>> + * only in asynchronous mode.
>>  */
>>  enum async_tx_flags {
>>        ASYNC_TX_XOR_ZERO_DST    = (1 << 0),
>> -       ASYNC_TX_XOR_DROP_DST    = (1 << 1),
>> -       ASYNC_TX_ACK             = (1 << 3),
>> -       ASYNC_TX_DEP_ACK         = (1 << 4),
>> +       ASYNC_TX_PQ_ZERO_P       = (1 << 1),
>> +       ASYNC_TX_PQ_ZERO_Q       = (1 << 2),
>> +       ASYNC_TX_XOR_DROP_DST    = (1 << 3),
>> +       ASYNC_TX_ACK             = (1 << 4),
>> +       ASYNC_TX_DEP_ACK         = (1 << 5),
>> +       ASYNC_TX_ASYNC_ONLY      = (1 << 6),
>>  };
>>
>>  #ifdef CONFIG_DMA_ENGINE
>> @@ -146,5 +157,33 @@ async_trigger_callback(enum async_tx_flags flags,
>>        struct dma_async_tx_descriptor *depend_tx,
>>        dma_async_tx_callback cb_fn, void *cb_fn_param);
>>
>> +struct dma_async_tx_descriptor *
>> +async_pqxor(struct page *pdest, struct page *qdest,
>> +       struct page **src_list, unsigned char *scoef_list,
>> +       unsigned int offset, int src_cnt, size_t len, enum async_tx_flags flags,
>> +       struct dma_async_tx_descriptor *depend_tx,
>> +       dma_async_tx_callback callback, void *callback_param);
>> +
>> +

> ...forgot to update the declaration.

 Argh... I missed this when I re-generated my final internal patch version.

> In this case async_pq() can be declared static since nothing outside
> of async_pq.c calls it.

 That's not true: the async_r6_dd_recov() and async_r6_dp_recov()
functions actively use async_pq(). See crypto/async_tx/async_r6recov.c.

 [..]

>>  void async_tx_quiesce(struct dma_async_tx_descriptor **tx);
>>  #endif /* _ASYNC_TX_H_ */
>> diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
>> index adb0b08..84525c3 100644
>> --- a/include/linux/dmaengine.h
>> +++ b/include/linux/dmaengine.h
>> @@ -81,7 +81,7 @@ enum dma_status {
>>  enum dma_transaction_type {
>>        DMA_MEMCPY,
>>        DMA_XOR,
>> -       DMA_PQ_XOR,
>> +       DMA_PQ,
>>        DMA_DUAL_XOR,
>>        DMA_PQ_UPDATE,
>>        DMA_ZERO_SUM,
>> @@ -123,6 +123,8 @@ enum dma_ctrl_flags {
>>        DMA_CTRL_ACK = (1 << 1),
>>        DMA_COMPL_SKIP_SRC_UNMAP = (1 << 2),
>>        DMA_COMPL_SKIP_DEST_UNMAP = (1 << 3),
>> +       DMA_PREP_ZERO_P = (1 << 4),
>> +       DMA_PREP_ZERO_Q = (1 << 5),
>>  };

> I would rather not add operation-type-specific flags to
> dma_ctrl_flags.

 But we somehow need to:

1) tell the ADMA driver whether it should clear the destination or not;
2) if (1), then which destination(s) to clear.

 Above I even propose adding two more flags here :) Are there any
reasons why we should spare dma_ctrl_flags, and instead increase the
stack usage and, in general, the cost of function calls by adding new
parameters to the ADMA methods, when a couple of new flag bits would
not even grow sizeof(enum)?
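
[ For reference, with the flag bits from the patch the translation in
the async_tx layer reduces to a pair of bit tests rather than extra
parameters, roughly: ]

	enum dma_ctrl_flags dma_flags = 0;

	if (flags & ASYNC_TX_PQ_ZERO_P)
		dma_flags |= DMA_PREP_ZERO_P;
	if (flags & ASYNC_TX_PQ_ZERO_Q)
		dma_flags |= DMA_PREP_ZERO_Q;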

>   In this case can we set up a dependency chain with
> async_memset()?

 Well, we can. But wouldn't this be overhead? For example, ppc440spe
DMA can do the so-called RXOR, which overwrites the destinations and
doesn't have to take care of their previous contents. So we can do
ZERO_DST(s)+PQ in one shot on one DMA engine. Again, I'm not sure that
keeping dma_ctrl_flags unchanged is worth creating such a dependency;
it would clearly lead both to degraded performance and to increased
CPU utilization.

>>
>>  /**
>> @@ -299,6 +301,7 @@ struct dma_async_tx_descriptor {
>>  * @global_node: list_head for global dma_device_list
>>  * @cap_mask: one or more dma_capability flags
>>  * @max_xor: maximum number of xor sources, 0 if no capability
>> + * @max_pq: maximum number of PQ sources, 0 if no capability
>>  * @refcount: reference count
>>  * @done: IO completion struct
>>  * @dev_id: unique device ID
>> @@ -308,7 +311,9 @@ struct dma_async_tx_descriptor {
>>  * @device_free_chan_resources: release DMA channel's resources
>>  * @device_prep_dma_memcpy: prepares a memcpy operation
>>  * @device_prep_dma_xor: prepares a xor operation
>> + * @device_prep_dma_pq: prepares a pq operation
>>  * @device_prep_dma_zero_sum: prepares a zero_sum operation
>> + * @device_prep_dma_pqzero_sum: prepares a pqzero_sum operation
>>  * @device_prep_dma_memset: prepares a memset operation
>>  * @device_prep_dma_interrupt: prepares an end of chain interrupt operation
>>  * @device_prep_slave_sg: prepares a slave dma operation
>> @@ -322,6 +327,7 @@ struct dma_device {
>>        struct list_head global_node;
>>        dma_cap_mask_t  cap_mask;
>>        int max_xor;
>> +       int max_pq;
>>

> max_xor and max_pq can be changed to unsigned shorts to keep the size
> of the struct the same.

 Right.

>>        struct kref refcount;
>>        struct completion done;
>> @@ -339,9 +345,17 @@ struct dma_device {
>>        struct dma_async_tx_descriptor *(*device_prep_dma_xor)(
>>                struct dma_chan *chan, dma_addr_t dest, dma_addr_t *src,
>>                unsigned int src_cnt, size_t len, unsigned long flags);
>> +       struct dma_async_tx_descriptor *(*device_prep_dma_pq)(
>> +               struct dma_chan *chan, dma_addr_t *dst, dma_addr_t *src,
>> +               unsigned int src_cnt, unsigned char *scf,
>> +               size_t len, unsigned long flags);
>>        struct dma_async_tx_descriptor *(*device_prep_dma_zero_sum)(
>>                struct dma_chan *chan, dma_addr_t *src, unsigned int src_cnt,
>>                size_t len, u32 *result, unsigned long flags);
>> +       struct dma_async_tx_descriptor *(*device_prep_dma_pqzero_sum)(
>> +               struct dma_chan *chan, dma_addr_t *src, unsigned int src_cnt,
>> +               unsigned char *scf,
>> +               size_t len, u32 *presult, u32 *qresult, unsigned long flags);

> I would rather we turn the 'result' parameter into a pointer to flags
> where bit 0 is the xor/p result and bit 1 is the q result.

 Yes, this'll be better.


 Thanks for reviewing. I'll re-generate the ASYNC_TX patch (in the
parts where I fully agreed with you), and then re-post. Any comments
regarding the RAID-6 part?

 Regards, Yuri

 --
 Yuri Tikhonov, Senior Software Engineer
 Emcraft Systems, www.emcraft.com



Thread overview: 3+ messages
2008-12-08 21:55 [PATCH 02/11][v2] async_tx: add support for asynchronous GF multiplication Yuri Tikhonov
2008-12-17 18:34 ` Dan Williams
2008-12-19  7:43   ` Re[2]: " Yuri Tikhonov
