[PATCH][RFC] RAID5/DMA/memcpy: zero copy the bio page when possible

linux-raid.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH][RFC]      RAID5/DMA/memcpy: zero copy the bio page when possible
@ 2012-01-11  7:47 b29237
  2012-01-11  7:48 ` Shi Xuelin-B29237
  2012-01-11  8:08 ` NeilBrown
  0 siblings, 2 replies; 5+ messages in thread
From: b29237 @ 2012-01-11  7:47 UTC (permalink / raw)
  To: neilb, linux-raid, linux-kernel, r58472; +Cc: Forrest shi

From: Forrest shi <b29237@freescale.com>

     each disk holds one page in the stripe header, the page works as a
     buffer cache for raid computing and it is copied to and from bio page
     which requested by upper level read/write call.

     if the write access would overwrite the whole page, the copy from
     bio page to stripe header page could be avoid. We can do xor directly
     on the bio page.

     this patch implements this strategy and improves raid5 write performance.

     Signed-off-by: Forrest Shi<b29237@freescale.com>
---
 drivers/dma/Kconfig        |    8 +++
 drivers/md/raid5.c         |  125 ++++++++++++++++++++++++++++++++++++++-----
 drivers/md/raid5.h         |    6 ++
 include/linux/page-flags.h |   11 ++++
 mm/filemap.c               |   21 +++++++
 5 files changed, 156 insertions(+), 15 deletions(-)

diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index dd8e959..28f960b 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -280,6 +280,14 @@ config ASYNC_TX_DMA
 
 	  If unsure, say N.
 
+config RAID_ZERO_COPY 
+        bool "Optimized DMA/XOR offload: reduce raid5 memcpy which offloaded for dma"
+        depends on ASYNC_TX_DMA
+        help
+          This allows the async_tx api try to reduce raid5 memcpy operations for
+          dma. If you have dma device to support memcpy offloading, you can set
+          it as Y, else N.
+
 config DMATEST
 	tristate "DMA Test client"
 	depends on DMA_ENGINE
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index cbb50d3..33afeeb 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3,7 +3,8 @@
  *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
  *	   Copyright (C) 1999, 2000 Ingo Molnar
  *	   Copyright (C) 2002, 2003 H. Peter Anvin
- *
+ *	   Copyright (C) 2010, Freescale Semiconductor, Inc. All rights
+ *		reserved.
  * RAID-4/5/6 management functions.
  * Thanks to Penguin Computing for making the RAID-6 development possible
  * by donating a test server!
@@ -558,6 +559,14 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 				set_bit(STRIPE_DEGRADED, &sh->state);
 			pr_debug("skip op %ld on disc %d for sector %llu\n",
 				bi->bi_rw, i, (unsigned long long)sh->sector);
+#ifdef CONFIG_RAID_ZERO_COPY
+			if (test_bit(R5_DirectAccess, &sh->dev[i].flags)) {
+				struct page *pg = sh->dev[i].page;
+				BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page ==
+					pg);
+				sh->dev[i].req.bi_io_vec[0].bv_page = pg;
+			}
+#endif
 			clear_bit(R5_LOCKED, &sh->dev[i].flags);
 			set_bit(STRIPE_HANDLE, &sh->state);
 		}
@@ -685,6 +694,7 @@ static void ops_run_biofill(struct stripe_head *sh)
 			dev->read = rbi = dev->toread;
 			dev->toread = NULL;
 			spin_unlock_irq(&conf->device_lock);
+
 			while (rbi && rbi->bi_sector <
 				dev->sector + STRIPE_SECTORS) {
 				tx = async_copy_data(0, rbi, dev->page,
@@ -754,10 +764,17 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
 		__func__, (unsigned long long)sh->sector, target);
 	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
 
-	for (i = disks; i--; )
-		if (i != target)
-			xor_srcs[count++] = sh->dev[i].page;
-
+	for (i = disks; i--; ) {
+		struct r5dev *dev = &sh->dev[i];
+		struct page *pg = dev->page;
+		if (i != target) {
+#ifdef CONFIG_RAID_ZERO_COPY
+			if (test_bit(R5_DirectAccess, &dev->flags))
+				pg = dev->req.bi_io_vec[0].bv_page;
+#endif
+			xor_srcs[count++] = pg;
+		}
+	}
 	atomic_inc(&sh->count);
 
 	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
@@ -993,8 +1010,14 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
 	for (i = disks; i--; ) {
 		struct r5dev *dev = &sh->dev[i];
 		/* Only process blocks that are known to be uptodate */
-		if (test_bit(R5_Wantdrain, &dev->flags))
-			xor_srcs[count++] = dev->page;
+		if (test_bit(R5_Wantdrain, &dev->flags)) {
+			struct page *pg = dev->page;
+#ifdef CONFIG_RAID_ZERO_COPY
+			if (test_bit(R5_DirectAccess, &dev->flags))
+				pg = dev->req.bi_io_vec[0].bv_page;
+#endif
+			xor_srcs[count++] = pg;
+		}
 	}
 
 	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
@@ -1004,6 +1027,32 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
 	return tx;
 }
 
+#ifdef CONFIG_RAID_ZERO_COPY
+static struct page *raid5_zero_copy(struct bio *bio, sector_t sector)
+{
+	sector_t bi_sector = bio->bi_sector;
+	struct page *page = NULL;
+	struct bio_vec *bv;
+	int i;
+
+	bio_for_each_segment(bv, bio, i) {
+		if (sector == bi_sector)
+			page = bio_iovec_idx(bio, i)->bv_page;
+
+		bi_sector += bio_iovec_idx(bio, i)->bv_len >> 9;
+		if (bi_sector >= sector + STRIPE_SECTORS) {
+			/* check if the stripe is covered by one page */
+			if (page == bio_iovec_idx(bio, i)->bv_page) {
+				SetPageConstant(page);
+				return page;
+			}
+			return NULL;
+		}
+	}
+	return NULL;
+}
+#endif
+
 static struct dma_async_tx_descriptor *
 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 {
@@ -1025,8 +1074,28 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 			dev->towrite = NULL;
 			BUG_ON(dev->written);
 			wbi = dev->written = chosen;
+#ifdef CONFIG_RAID_ZERO_COPY
+			set_bit(R5_LOCKED, &dev->flags);
+			BUG_ON(test_bit(R5_DirectAccess, &dev->flags));
 			spin_unlock(&sh->lock);
 
+			if (!wbi->bi_next && test_bit(R5_OVERWRITE, &dev->flags)
+					&& test_bit(R5_Insync, &dev->flags)) {
+				struct page *pg = raid5_zero_copy(wbi,
+								dev->sector);
+				if (pg) {
+					dev->req.bi_io_vec[0].bv_page = pg;
+					set_bit(R5_DirectAccess, &dev->flags);
+					clear_bit(R5_UPTODATE, &dev->flags);
+					clear_bit(R5_OVERWRITE, &dev->flags);
+					continue;
+				}
+			}
+			clear_bit(R5_OVERWRITE, &dev->flags);
+			set_bit(R5_UPTODATE, &dev->flags);
+#else
+			spin_unlock(&sh->lock);
+#endif
 			while (wbi && wbi->bi_sector <
 				dev->sector + STRIPE_SECTORS) {
 				if (wbi->bi_rw & REQ_FUA)
@@ -1102,15 +1171,29 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
 		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
-			if (dev->written)
-				xor_srcs[count++] = dev->page;
+			struct page *pg = dev->page;
+
+			if (dev->written) {
+#ifdef CONFIG_RAID_ZERO_COPY
+				if (test_bit(R5_DirectAccess, &dev->flags))
+					pg = dev->req.bi_io_vec[0].bv_page;
+#endif
+				xor_srcs[count++] = pg;
+			}
 		}
 	} else {
 		xor_dest = sh->dev[pd_idx].page;
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
-			if (i != pd_idx)
-				xor_srcs[count++] = dev->page;
+			struct page *pg = dev->page;
+
+			if (i != pd_idx) {
+#ifdef CONFIG_RAID_ZERO_COPY
+				if (test_bit(R5_DirectAccess, &dev->flags))
+					pg = dev->req.bi_io_vec[0].bv_page;
+#endif
+				xor_srcs[count++] = pg;
+			}
 		}
 	}
 
@@ -1637,6 +1720,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
 			md_error(conf->mddev, rdev);
 		}
 	}
+
 	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
 	clear_bit(R5_LOCKED, &sh->dev[i].flags);
 	set_bit(STRIPE_HANDLE, &sh->state);
@@ -1666,15 +1750,19 @@ static void raid5_end_write_request(struct bio *bi, int error)
 		md_error(conf->mddev, conf->disks[i].rdev);
 
 	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
-	
+#ifdef CONFIG_RAID_ZERO_COPY
+	if (test_bit(R5_DirectAccess, &sh->dev[i].flags)) {
+		BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
+		sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
+	}
+#endif
 	clear_bit(R5_LOCKED, &sh->dev[i].flags);
 	set_bit(STRIPE_HANDLE, &sh->state);
 	release_stripe(sh);
 }
 
-
 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
-	
+
 static void raid5_build_block(struct stripe_head *sh, int i, int previous)
 {
 	struct r5dev *dev = &sh->dev[i];
@@ -2505,7 +2593,11 @@ static void handle_stripe_clean_event(raid5_conf_t *conf,
 		if (sh->dev[i].written) {
 			dev = &sh->dev[i];
 			if (!test_bit(R5_LOCKED, &dev->flags) &&
-				test_bit(R5_UPTODATE, &dev->flags)) {
+				(test_bit(R5_UPTODATE, &dev->flags)
+#ifdef CONFIG_RAID_ZERO_COPY
+				|| test_bit(R5_DirectAccess, &dev->flags)
+#endif
+			)) {
 				/* We can return any write requests */
 				struct bio *wbi, *wbi2;
 				int bitmap_end = 0;
@@ -2513,6 +2605,9 @@ static void handle_stripe_clean_event(raid5_conf_t *conf,
 				spin_lock_irq(&conf->device_lock);
 				wbi = dev->written;
 				dev->written = NULL;
+#ifdef CONFIG_RAID_ZERO_COPY
+				clear_bit(R5_DirectAccess, &dev->flags);
+#endif
 				while (wbi && wbi->bi_sector <
 					dev->sector + STRIPE_SECTORS) {
 					wbi2 = r5_next_bio(wbi, dev->sector);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 3ca77a2..b1db89f 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -276,6 +276,12 @@ struct r6_state {
 				    */
 #define R5_Wantdrain	13 /* dev->towrite needs to be drained */
 #define R5_WantFUA	14	/* Write should be FUA */
+
+#ifdef CONFIG_RAID_ZERO_COPY
+#define R5_DirectAccess 15 /* access cached pages directly instead of
+                                       * sh pages */
+#endif
+
 /*
  * Write method
  */
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 6081493..fc74541 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -104,6 +104,9 @@ enum pageflags {
 #ifdef CONFIG_MEMORY_FAILURE
 	PG_hwpoison,		/* hardware poisoned page. Don't touch */
 #endif
+#ifdef CONFIG_RAID_ZERO_COPY
+        PG_constant,            /* const page not modified during raid5 io */
+#endif
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	PG_compound_lock,
 #endif
@@ -196,6 +199,14 @@ static inline int __TestClearPage##uname(struct page *page) { return 0; }
 
 struct page;	/* forward declaration */
 
+#ifdef CONFIG_RAID_ZERO_COPY
+#define PageConstant(page) test_bit(PG_constant, &(page)->flags)
+#define SetPageConstant(page) set_bit(PG_constant, &(page)->flags)
+#define ClearPageConstant(page) clear_bit(PG_constant, &(page->flags))
+#define TestSetPageConstant(page) test_and_set_bit(PG_constant, &(page)->flags)
+extern void clear_page_constant(struct page *page);
+#endif
+
 TESTPAGEFLAG(Locked, locked)
 PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error)
 PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced)
diff --git a/mm/filemap.c b/mm/filemap.c
index a8251a8..5052b72 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -28,6 +28,11 @@
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
 #include <linux/blkdev.h>
+
+#ifdef CONFIG_RAID_ZERO_COPY
+#include <linux/rmap.h>
+#endif
+
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/cpuset.h>
@@ -636,10 +641,26 @@ void end_page_writeback(struct page *page)
 		BUG();
 
 	smp_mb__after_clear_bit();
+
+#ifdef CONFIG_RAID_ZERO_COPY
+	clear_page_constant(page);
+#endif
+
 	wake_up_page(page, PG_writeback);
 }
 EXPORT_SYMBOL(end_page_writeback);
 
+#ifdef CONFIG_RAID_ZERO_COPY
+void clear_page_constant(struct page *page)
+{
+	if (PageConstant(page)) {
+		ClearPageConstant(page);
+		SetPageUptodate(page);
+	}
+}
+EXPORT_SYMBOL(clear_page_constant);
+#endif
+
 /**
  * __lock_page - get a lock on the page, assuming we need to sleep to get it
  * @page: the page to lock
-- 
1.7.0.4



^ permalink raw reply related	[flat|nested] 5+ messages in thread

* RE: [PATCH][RFC]      RAID5/DMA/memcpy: zero copy the bio page when possible
  2012-01-11  7:47 [PATCH][RFC] RAID5/DMA/memcpy: zero copy the bio page when possible b29237
@ 2012-01-11  7:48 ` Shi Xuelin-B29237
  2012-01-11  8:08 ` NeilBrown
  1 sibling, 0 replies; 5+ messages in thread
From: Shi Xuelin-B29237 @ 2012-01-11  7:48 UTC (permalink / raw)
  To: neilb@suse.de, linux-raid@vger.kernel.org,
	linux-kernel@vger.kernel.org, Li Yang-R58472
  Cc: Shi Xuelin-B29237

Revised version according to NeilBrown's comments.
Add more description about the patch and fix one typo error of {}.

Thanks,
Forrest

-----Original Message-----
From: Shi Xuelin-B29237 
Sent: 2012年1月11日 15:48
To: neilb@suse.de; linux-raid@vger.kernel.org; linux-kernel@vger.kernel.org; Li Yang-R58472
Cc: Shi Xuelin-B29237
Subject: [PATCH][RFC] RAID5/DMA/memcpy: zero copy the bio page when possible

From: Forrest shi <b29237@freescale.com>

     each disk holds one page in the stripe header, the page works as a
     buffer cache for raid computing and it is copied to and from bio page
     which requested by upper level read/write call.

     if the write access would overwrite the whole page, the copy from
     bio page to stripe header page could be avoid. We can do xor directly
     on the bio page.

     this patch implements this strategy and improves raid5 write performance.

     Signed-off-by: Forrest Shi<b29237@freescale.com>
---
 drivers/dma/Kconfig        |    8 +++
 drivers/md/raid5.c         |  125 ++++++++++++++++++++++++++++++++++++++-----
 drivers/md/raid5.h         |    6 ++
 include/linux/page-flags.h |   11 ++++
 mm/filemap.c               |   21 +++++++
 5 files changed, 156 insertions(+), 15 deletions(-)

diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index dd8e959..28f960b 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -280,6 +280,14 @@ config ASYNC_TX_DMA
 
 	  If unsure, say N.
 
+config RAID_ZERO_COPY 
+        bool "Optimized DMA/XOR offload: reduce raid5 memcpy which offloaded for dma"
+        depends on ASYNC_TX_DMA
+        help
+          This allows the async_tx api try to reduce raid5 memcpy operations for
+          dma. If you have dma device to support memcpy offloading, you can set
+          it as Y, else N.
+
 config DMATEST
 	tristate "DMA Test client"
 	depends on DMA_ENGINE
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index cbb50d3..33afeeb 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3,7 +3,8 @@
  *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
  *	   Copyright (C) 1999, 2000 Ingo Molnar
  *	   Copyright (C) 2002, 2003 H. Peter Anvin
- *
+ *	   Copyright (C) 2010, Freescale Semiconductor, Inc. All rights
+ *		reserved.
  * RAID-4/5/6 management functions.
  * Thanks to Penguin Computing for making the RAID-6 development possible
  * by donating a test server!
@@ -558,6 +559,14 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 				set_bit(STRIPE_DEGRADED, &sh->state);
 			pr_debug("skip op %ld on disc %d for sector %llu\n",
 				bi->bi_rw, i, (unsigned long long)sh->sector);
+#ifdef CONFIG_RAID_ZERO_COPY
+			if (test_bit(R5_DirectAccess, &sh->dev[i].flags)) {
+				struct page *pg = sh->dev[i].page;
+				BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page ==
+					pg);
+				sh->dev[i].req.bi_io_vec[0].bv_page = pg;
+			}
+#endif
 			clear_bit(R5_LOCKED, &sh->dev[i].flags);
 			set_bit(STRIPE_HANDLE, &sh->state);
 		}
@@ -685,6 +694,7 @@ static void ops_run_biofill(struct stripe_head *sh)
 			dev->read = rbi = dev->toread;
 			dev->toread = NULL;
 			spin_unlock_irq(&conf->device_lock);
+
 			while (rbi && rbi->bi_sector <
 				dev->sector + STRIPE_SECTORS) {
 				tx = async_copy_data(0, rbi, dev->page, @@ -754,10 +764,17 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
 		__func__, (unsigned long long)sh->sector, target);
 	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
 
-	for (i = disks; i--; )
-		if (i != target)
-			xor_srcs[count++] = sh->dev[i].page;
-
+	for (i = disks; i--; ) {
+		struct r5dev *dev = &sh->dev[i];
+		struct page *pg = dev->page;
+		if (i != target) {
+#ifdef CONFIG_RAID_ZERO_COPY
+			if (test_bit(R5_DirectAccess, &dev->flags))
+				pg = dev->req.bi_io_vec[0].bv_page; #endif
+			xor_srcs[count++] = pg;
+		}
+	}
 	atomic_inc(&sh->count);
 
 	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, @@ -993,8 +1010,14 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
 	for (i = disks; i--; ) {
 		struct r5dev *dev = &sh->dev[i];
 		/* Only process blocks that are known to be uptodate */
-		if (test_bit(R5_Wantdrain, &dev->flags))
-			xor_srcs[count++] = dev->page;
+		if (test_bit(R5_Wantdrain, &dev->flags)) {
+			struct page *pg = dev->page;
+#ifdef CONFIG_RAID_ZERO_COPY
+			if (test_bit(R5_DirectAccess, &dev->flags))
+				pg = dev->req.bi_io_vec[0].bv_page; #endif
+			xor_srcs[count++] = pg;
+		}
 	}
 
 	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, @@ -1004,6 +1027,32 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
 	return tx;
 }
 
+#ifdef CONFIG_RAID_ZERO_COPY
+static struct page *raid5_zero_copy(struct bio *bio, sector_t sector) {
+	sector_t bi_sector = bio->bi_sector;
+	struct page *page = NULL;
+	struct bio_vec *bv;
+	int i;
+
+	bio_for_each_segment(bv, bio, i) {
+		if (sector == bi_sector)
+			page = bio_iovec_idx(bio, i)->bv_page;
+
+		bi_sector += bio_iovec_idx(bio, i)->bv_len >> 9;
+		if (bi_sector >= sector + STRIPE_SECTORS) {
+			/* check if the stripe is covered by one page */
+			if (page == bio_iovec_idx(bio, i)->bv_page) {
+				SetPageConstant(page);
+				return page;
+			}
+			return NULL;
+		}
+	}
+	return NULL;
+}
+#endif
+
 static struct dma_async_tx_descriptor *  ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)  { @@ -1025,8 +1074,28 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 			dev->towrite = NULL;
 			BUG_ON(dev->written);
 			wbi = dev->written = chosen;
+#ifdef CONFIG_RAID_ZERO_COPY
+			set_bit(R5_LOCKED, &dev->flags);
+			BUG_ON(test_bit(R5_DirectAccess, &dev->flags));
 			spin_unlock(&sh->lock);
 
+			if (!wbi->bi_next && test_bit(R5_OVERWRITE, &dev->flags)
+					&& test_bit(R5_Insync, &dev->flags)) {
+				struct page *pg = raid5_zero_copy(wbi,
+								dev->sector);
+				if (pg) {
+					dev->req.bi_io_vec[0].bv_page = pg;
+					set_bit(R5_DirectAccess, &dev->flags);
+					clear_bit(R5_UPTODATE, &dev->flags);
+					clear_bit(R5_OVERWRITE, &dev->flags);
+					continue;
+				}
+			}
+			clear_bit(R5_OVERWRITE, &dev->flags);
+			set_bit(R5_UPTODATE, &dev->flags);
+#else
+			spin_unlock(&sh->lock);
+#endif
 			while (wbi && wbi->bi_sector <
 				dev->sector + STRIPE_SECTORS) {
 				if (wbi->bi_rw & REQ_FUA)
@@ -1102,15 +1171,29 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
 		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
-			if (dev->written)
-				xor_srcs[count++] = dev->page;
+			struct page *pg = dev->page;
+
+			if (dev->written) {
+#ifdef CONFIG_RAID_ZERO_COPY
+				if (test_bit(R5_DirectAccess, &dev->flags))
+					pg = dev->req.bi_io_vec[0].bv_page; #endif
+				xor_srcs[count++] = pg;
+			}
 		}
 	} else {
 		xor_dest = sh->dev[pd_idx].page;
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
-			if (i != pd_idx)
-				xor_srcs[count++] = dev->page;
+			struct page *pg = dev->page;
+
+			if (i != pd_idx) {
+#ifdef CONFIG_RAID_ZERO_COPY
+				if (test_bit(R5_DirectAccess, &dev->flags))
+					pg = dev->req.bi_io_vec[0].bv_page; #endif
+				xor_srcs[count++] = pg;
+			}
 		}
 	}
 
@@ -1637,6 +1720,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
 			md_error(conf->mddev, rdev);
 		}
 	}
+
 	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
 	clear_bit(R5_LOCKED, &sh->dev[i].flags);
 	set_bit(STRIPE_HANDLE, &sh->state);
@@ -1666,15 +1750,19 @@ static void raid5_end_write_request(struct bio *bi, int error)
 		md_error(conf->mddev, conf->disks[i].rdev);
 
 	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
-	
+#ifdef CONFIG_RAID_ZERO_COPY
+	if (test_bit(R5_DirectAccess, &sh->dev[i].flags)) {
+		BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
+		sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
+	}
+#endif
 	clear_bit(R5_LOCKED, &sh->dev[i].flags);
 	set_bit(STRIPE_HANDLE, &sh->state);
 	release_stripe(sh);
 }
 
-
 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
-	
+
 static void raid5_build_block(struct stripe_head *sh, int i, int previous)  {
 	struct r5dev *dev = &sh->dev[i];
@@ -2505,7 +2593,11 @@ static void handle_stripe_clean_event(raid5_conf_t *conf,
 		if (sh->dev[i].written) {
 			dev = &sh->dev[i];
 			if (!test_bit(R5_LOCKED, &dev->flags) &&
-				test_bit(R5_UPTODATE, &dev->flags)) {
+				(test_bit(R5_UPTODATE, &dev->flags) #ifdef CONFIG_RAID_ZERO_COPY
+				|| test_bit(R5_DirectAccess, &dev->flags) #endif
+			)) {
 				/* We can return any write requests */
 				struct bio *wbi, *wbi2;
 				int bitmap_end = 0;
@@ -2513,6 +2605,9 @@ static void handle_stripe_clean_event(raid5_conf_t *conf,
 				spin_lock_irq(&conf->device_lock);
 				wbi = dev->written;
 				dev->written = NULL;
+#ifdef CONFIG_RAID_ZERO_COPY
+				clear_bit(R5_DirectAccess, &dev->flags); #endif
 				while (wbi && wbi->bi_sector <
 					dev->sector + STRIPE_SECTORS) {
 					wbi2 = r5_next_bio(wbi, dev->sector); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 3ca77a2..b1db89f 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -276,6 +276,12 @@ struct r6_state {
 				    */
 #define R5_Wantdrain	13 /* dev->towrite needs to be drained */
 #define R5_WantFUA	14	/* Write should be FUA */
+
+#ifdef CONFIG_RAID_ZERO_COPY
+#define R5_DirectAccess 15 /* access cached pages directly instead of
+                                       * sh pages */ #endif
+
 /*
  * Write method
  */
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 6081493..fc74541 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -104,6 +104,9 @@ enum pageflags {
 #ifdef CONFIG_MEMORY_FAILURE
 	PG_hwpoison,		/* hardware poisoned page. Don't touch */
 #endif
+#ifdef CONFIG_RAID_ZERO_COPY
+        PG_constant,            /* const page not modified during raid5 io */
+#endif
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	PG_compound_lock,
 #endif
@@ -196,6 +199,14 @@ static inline int __TestClearPage##uname(struct page *page) { return 0; }
 
 struct page;	/* forward declaration */
 
+#ifdef CONFIG_RAID_ZERO_COPY
+#define PageConstant(page) test_bit(PG_constant, &(page)->flags) 
+#define SetPageConstant(page) set_bit(PG_constant, &(page)->flags) 
+#define ClearPageConstant(page) clear_bit(PG_constant, &(page->flags)) 
+#define TestSetPageConstant(page) test_and_set_bit(PG_constant, 
+&(page)->flags) extern void clear_page_constant(struct page *page); 
+#endif
+
 TESTPAGEFLAG(Locked, locked)
 PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error)  PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced) diff --git a/mm/filemap.c b/mm/filemap.c index a8251a8..5052b72 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -28,6 +28,11 @@
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
 #include <linux/blkdev.h>
+
+#ifdef CONFIG_RAID_ZERO_COPY
+#include <linux/rmap.h>
+#endif
+
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/cpuset.h>
@@ -636,10 +641,26 @@ void end_page_writeback(struct page *page)
 		BUG();
 
 	smp_mb__after_clear_bit();
+
+#ifdef CONFIG_RAID_ZERO_COPY
+	clear_page_constant(page);
+#endif
+
 	wake_up_page(page, PG_writeback);
 }
 EXPORT_SYMBOL(end_page_writeback);
 
+#ifdef CONFIG_RAID_ZERO_COPY
+void clear_page_constant(struct page *page) {
+	if (PageConstant(page)) {
+		ClearPageConstant(page);
+		SetPageUptodate(page);
+	}
+}
+EXPORT_SYMBOL(clear_page_constant);
+#endif
+
 /**
  * __lock_page - get a lock on the page, assuming we need to sleep to get it
  * @page: the page to lock
--
1.7.0.4

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH][RFC]      RAID5/DMA/memcpy: zero copy the bio page when possible
  2012-01-11  7:47 [PATCH][RFC] RAID5/DMA/memcpy: zero copy the bio page when possible b29237
  2012-01-11  7:48 ` Shi Xuelin-B29237
@ 2012-01-11  8:08 ` NeilBrown
  1 sibling, 0 replies; 5+ messages in thread
From: NeilBrown @ 2012-01-11  8:08 UTC (permalink / raw)
  To: b29237; +Cc: linux-raid, linux-kernel, r58472

[-- Attachment #1: Type: text/plain, Size: 13149 bytes --]

On Wed, 11 Jan 2012 15:47:34 +0800 <b29237@freescale.com> wrote:

> From: Forrest shi <b29237@freescale.com>
> 
>      each disk holds one page in the stripe header, the page works as a
>      buffer cache for raid computing and it is copied to and from bio page
>      which requested by upper level read/write call.
> 
>      if the write access would overwrite the whole page, the copy from
>      bio page to stripe header page could be avoid. We can do xor directly
>      on the bio page.
> 
>      this patch implements this strategy and improves raid5 write performance.

I was hoping for a free more words...

The "PageConstant" thing seems to be an important part of the strategy -
explaining that wouldn't hurt.

I don't think the code is right (but I might have misunderstood due to lack
of documentation....)

The page that you xor into the parity block must remain constant until the
write completes else you have an inconsistent stripe.

Your 'PageConstant' doesn't seem to keep the page constant, but just checks
afterwards if it has changed.

And then it does SetPageUptodate which looks like a serious layering
violation.

i.e. I think this patch is wrong at a fairly deep level.  If you still think
it is right, you need to put more effort into explaining why.

NeilBrown


> 
>      Signed-off-by: Forrest Shi<b29237@freescale.com>
> ---
>  drivers/dma/Kconfig        |    8 +++
>  drivers/md/raid5.c         |  125 ++++++++++++++++++++++++++++++++++++++-----
>  drivers/md/raid5.h         |    6 ++
>  include/linux/page-flags.h |   11 ++++
>  mm/filemap.c               |   21 +++++++
>  5 files changed, 156 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
> index dd8e959..28f960b 100644
> --- a/drivers/dma/Kconfig
> +++ b/drivers/dma/Kconfig
> @@ -280,6 +280,14 @@ config ASYNC_TX_DMA
>  
>  	  If unsure, say N.
>  
> +config RAID_ZERO_COPY 
> +        bool "Optimized DMA/XOR offload: reduce raid5 memcpy which offloaded for dma"
> +        depends on ASYNC_TX_DMA
> +        help
> +          This allows the async_tx api try to reduce raid5 memcpy operations for
> +          dma. If you have dma device to support memcpy offloading, you can set
> +          it as Y, else N.
> +
>  config DMATEST
>  	tristate "DMA Test client"
>  	depends on DMA_ENGINE
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index cbb50d3..33afeeb 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -3,7 +3,8 @@
>   *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
>   *	   Copyright (C) 1999, 2000 Ingo Molnar
>   *	   Copyright (C) 2002, 2003 H. Peter Anvin
> - *
> + *	   Copyright (C) 2010, Freescale Semiconductor, Inc. All rights
> + *		reserved.
>   * RAID-4/5/6 management functions.
>   * Thanks to Penguin Computing for making the RAID-6 development possible
>   * by donating a test server!
> @@ -558,6 +559,14 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
>  				set_bit(STRIPE_DEGRADED, &sh->state);
>  			pr_debug("skip op %ld on disc %d for sector %llu\n",
>  				bi->bi_rw, i, (unsigned long long)sh->sector);
> +#ifdef CONFIG_RAID_ZERO_COPY
> +			if (test_bit(R5_DirectAccess, &sh->dev[i].flags)) {
> +				struct page *pg = sh->dev[i].page;
> +				BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page ==
> +					pg);
> +				sh->dev[i].req.bi_io_vec[0].bv_page = pg;
> +			}
> +#endif
>  			clear_bit(R5_LOCKED, &sh->dev[i].flags);
>  			set_bit(STRIPE_HANDLE, &sh->state);
>  		}
> @@ -685,6 +694,7 @@ static void ops_run_biofill(struct stripe_head *sh)
>  			dev->read = rbi = dev->toread;
>  			dev->toread = NULL;
>  			spin_unlock_irq(&conf->device_lock);
> +
>  			while (rbi && rbi->bi_sector <
>  				dev->sector + STRIPE_SECTORS) {
>  				tx = async_copy_data(0, rbi, dev->page,
> @@ -754,10 +764,17 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
>  		__func__, (unsigned long long)sh->sector, target);
>  	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
>  
> -	for (i = disks; i--; )
> -		if (i != target)
> -			xor_srcs[count++] = sh->dev[i].page;
> -
> +	for (i = disks; i--; ) {
> +		struct r5dev *dev = &sh->dev[i];
> +		struct page *pg = dev->page;
> +		if (i != target) {
> +#ifdef CONFIG_RAID_ZERO_COPY
> +			if (test_bit(R5_DirectAccess, &dev->flags))
> +				pg = dev->req.bi_io_vec[0].bv_page;
> +#endif
> +			xor_srcs[count++] = pg;
> +		}
> +	}
>  	atomic_inc(&sh->count);
>  
>  	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
> @@ -993,8 +1010,14 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
>  	for (i = disks; i--; ) {
>  		struct r5dev *dev = &sh->dev[i];
>  		/* Only process blocks that are known to be uptodate */
> -		if (test_bit(R5_Wantdrain, &dev->flags))
> -			xor_srcs[count++] = dev->page;
> +		if (test_bit(R5_Wantdrain, &dev->flags)) {
> +			struct page *pg = dev->page;
> +#ifdef CONFIG_RAID_ZERO_COPY
> +			if (test_bit(R5_DirectAccess, &dev->flags))
> +				pg = dev->req.bi_io_vec[0].bv_page;
> +#endif
> +			xor_srcs[count++] = pg;
> +		}
>  	}
>  
>  	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
> @@ -1004,6 +1027,32 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
>  	return tx;
>  }
>  
> +#ifdef CONFIG_RAID_ZERO_COPY
> +static struct page *raid5_zero_copy(struct bio *bio, sector_t sector)
> +{
> +	sector_t bi_sector = bio->bi_sector;
> +	struct page *page = NULL;
> +	struct bio_vec *bv;
> +	int i;
> +
> +	bio_for_each_segment(bv, bio, i) {
> +		if (sector == bi_sector)
> +			page = bio_iovec_idx(bio, i)->bv_page;
> +
> +		bi_sector += bio_iovec_idx(bio, i)->bv_len >> 9;
> +		if (bi_sector >= sector + STRIPE_SECTORS) {
> +			/* check if the stripe is covered by one page */
> +			if (page == bio_iovec_idx(bio, i)->bv_page) {
> +				SetPageConstant(page);
> +				return page;
> +			}
> +			return NULL;
> +		}
> +	}
> +	return NULL;
> +}
> +#endif
> +
>  static struct dma_async_tx_descriptor *
>  ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
>  {
> @@ -1025,8 +1074,28 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
>  			dev->towrite = NULL;
>  			BUG_ON(dev->written);
>  			wbi = dev->written = chosen;
> +#ifdef CONFIG_RAID_ZERO_COPY
> +			set_bit(R5_LOCKED, &dev->flags);
> +			BUG_ON(test_bit(R5_DirectAccess, &dev->flags));
>  			spin_unlock(&sh->lock);
>  
> +			if (!wbi->bi_next && test_bit(R5_OVERWRITE, &dev->flags)
> +					&& test_bit(R5_Insync, &dev->flags)) {
> +				struct page *pg = raid5_zero_copy(wbi,
> +								dev->sector);
> +				if (pg) {
> +					dev->req.bi_io_vec[0].bv_page = pg;
> +					set_bit(R5_DirectAccess, &dev->flags);
> +					clear_bit(R5_UPTODATE, &dev->flags);
> +					clear_bit(R5_OVERWRITE, &dev->flags);
> +					continue;
> +				}
> +			}
> +			clear_bit(R5_OVERWRITE, &dev->flags);
> +			set_bit(R5_UPTODATE, &dev->flags);
> +#else
> +			spin_unlock(&sh->lock);
> +#endif
>  			while (wbi && wbi->bi_sector <
>  				dev->sector + STRIPE_SECTORS) {
>  				if (wbi->bi_rw & REQ_FUA)
> @@ -1102,15 +1171,29 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
>  		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
>  		for (i = disks; i--; ) {
>  			struct r5dev *dev = &sh->dev[i];
> -			if (dev->written)
> -				xor_srcs[count++] = dev->page;
> +			struct page *pg = dev->page;
> +
> +			if (dev->written) {
> +#ifdef CONFIG_RAID_ZERO_COPY
> +				if (test_bit(R5_DirectAccess, &dev->flags))
> +					pg = dev->req.bi_io_vec[0].bv_page;
> +#endif
> +				xor_srcs[count++] = pg;
> +			}
>  		}
>  	} else {
>  		xor_dest = sh->dev[pd_idx].page;
>  		for (i = disks; i--; ) {
>  			struct r5dev *dev = &sh->dev[i];
> -			if (i != pd_idx)
> -				xor_srcs[count++] = dev->page;
> +			struct page *pg = dev->page;
> +
> +			if (i != pd_idx) {
> +#ifdef CONFIG_RAID_ZERO_COPY
> +				if (test_bit(R5_DirectAccess, &dev->flags))
> +					pg = dev->req.bi_io_vec[0].bv_page;
> +#endif
> +				xor_srcs[count++] = pg;
> +			}
>  		}
>  	}
>  
> @@ -1637,6 +1720,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
>  			md_error(conf->mddev, rdev);
>  		}
>  	}
> +
>  	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
>  	clear_bit(R5_LOCKED, &sh->dev[i].flags);
>  	set_bit(STRIPE_HANDLE, &sh->state);
> @@ -1666,15 +1750,19 @@ static void raid5_end_write_request(struct bio *bi, int error)
>  		md_error(conf->mddev, conf->disks[i].rdev);
>  
>  	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
> -	
> +#ifdef CONFIG_RAID_ZERO_COPY
> +	if (test_bit(R5_DirectAccess, &sh->dev[i].flags)) {
> +		BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
> +		sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
> +	}
> +#endif
>  	clear_bit(R5_LOCKED, &sh->dev[i].flags);
>  	set_bit(STRIPE_HANDLE, &sh->state);
>  	release_stripe(sh);
>  }
>  
> -
>  static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
> -	
> +
>  static void raid5_build_block(struct stripe_head *sh, int i, int previous)
>  {
>  	struct r5dev *dev = &sh->dev[i];
> @@ -2505,7 +2593,11 @@ static void handle_stripe_clean_event(raid5_conf_t *conf,
>  		if (sh->dev[i].written) {
>  			dev = &sh->dev[i];
>  			if (!test_bit(R5_LOCKED, &dev->flags) &&
> -				test_bit(R5_UPTODATE, &dev->flags)) {
> +				(test_bit(R5_UPTODATE, &dev->flags)
> +#ifdef CONFIG_RAID_ZERO_COPY
> +				|| test_bit(R5_DirectAccess, &dev->flags)
> +#endif
> +			)) {
>  				/* We can return any write requests */
>  				struct bio *wbi, *wbi2;
>  				int bitmap_end = 0;
> @@ -2513,6 +2605,9 @@ static void handle_stripe_clean_event(raid5_conf_t *conf,
>  				spin_lock_irq(&conf->device_lock);
>  				wbi = dev->written;
>  				dev->written = NULL;
> +#ifdef CONFIG_RAID_ZERO_COPY
> +				clear_bit(R5_DirectAccess, &dev->flags);
> +#endif
>  				while (wbi && wbi->bi_sector <
>  					dev->sector + STRIPE_SECTORS) {
>  					wbi2 = r5_next_bio(wbi, dev->sector);
> diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
> index 3ca77a2..b1db89f 100644
> --- a/drivers/md/raid5.h
> +++ b/drivers/md/raid5.h
> @@ -276,6 +276,12 @@ struct r6_state {
>  				    */
>  #define R5_Wantdrain	13 /* dev->towrite needs to be drained */
>  #define R5_WantFUA	14	/* Write should be FUA */
> +
> +#ifdef CONFIG_RAID_ZERO_COPY
> +#define R5_DirectAccess 15 /* access cached pages directly instead of
> +                                       * sh pages */
> +#endif
> +
>  /*
>   * Write method
>   */
> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> index 6081493..fc74541 100644
> --- a/include/linux/page-flags.h
> +++ b/include/linux/page-flags.h
> @@ -104,6 +104,9 @@ enum pageflags {
>  #ifdef CONFIG_MEMORY_FAILURE
>  	PG_hwpoison,		/* hardware poisoned page. Don't touch */
>  #endif
> +#ifdef CONFIG_RAID_ZERO_COPY
> +        PG_constant,            /* const page not modified during raid5 io */
> +#endif
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>  	PG_compound_lock,
>  #endif
> @@ -196,6 +199,14 @@ static inline int __TestClearPage##uname(struct page *page) { return 0; }
>  
>  struct page;	/* forward declaration */
>  
> +#ifdef CONFIG_RAID_ZERO_COPY
> +#define PageConstant(page) test_bit(PG_constant, &(page)->flags)
> +#define SetPageConstant(page) set_bit(PG_constant, &(page)->flags)
> +#define ClearPageConstant(page) clear_bit(PG_constant, &(page->flags))
> +#define TestSetPageConstant(page) test_and_set_bit(PG_constant, &(page)->flags)
> +extern void clear_page_constant(struct page *page);
> +#endif
> +
>  TESTPAGEFLAG(Locked, locked)
>  PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error)
>  PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced)
> diff --git a/mm/filemap.c b/mm/filemap.c
> index a8251a8..5052b72 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -28,6 +28,11 @@
>  #include <linux/backing-dev.h>
>  #include <linux/pagevec.h>
>  #include <linux/blkdev.h>
> +
> +#ifdef CONFIG_RAID_ZERO_COPY
> +#include <linux/rmap.h>
> +#endif
> +
>  #include <linux/security.h>
>  #include <linux/syscalls.h>
>  #include <linux/cpuset.h>
> @@ -636,10 +641,26 @@ void end_page_writeback(struct page *page)
>  		BUG();
>  
>  	smp_mb__after_clear_bit();
> +
> +#ifdef CONFIG_RAID_ZERO_COPY
> +	clear_page_constant(page);
> +#endif
> +
>  	wake_up_page(page, PG_writeback);
>  }
>  EXPORT_SYMBOL(end_page_writeback);
>  
> +#ifdef CONFIG_RAID_ZERO_COPY
> +void clear_page_constant(struct page *page)
> +{
> +	if (PageConstant(page)) {
> +		ClearPageConstant(page);
> +		SetPageUptodate(page);
> +	}
> +}
> +EXPORT_SYMBOL(clear_page_constant);
> +#endif
> +
>  /**
>   * __lock_page - get a lock on the page, assuming we need to sleep to get it
>   * @page: the page to lock


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 828 bytes --]

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH][RFC]  RAID5/DMA/memcpy: zero copy the bio page when possible
@ 2011-12-28  5:20 b29237
  2012-01-02 23:13 ` NeilBrown
  0 siblings, 1 reply; 5+ messages in thread
From: b29237 @ 2011-12-28  5:20 UTC (permalink / raw)
  To: neilb, linux-raid, linux-kernel, r58472; +Cc: Forrest shi

From: Forrest shi <b29237@freescale.com>

	use bio page directly instead of copying the bio page to stripe
	header page when possible.

	Signed-off-by: Forrest Shi<b29237@freescale.com>
---
 drivers/dma/Kconfig        |    8 +++
 drivers/md/raid5.c         |  122 +++++++++++++++++++++++++++++++++++++++-----
 drivers/md/raid5.h         |    6 ++
 include/linux/page-flags.h |   11 ++++
 mm/filemap.c               |   21 ++++++++
 5 files changed, 154 insertions(+), 14 deletions(-)

diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index dd8e959..8e90272 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -280,6 +280,14 @@ config ASYNC_TX_DMA
 
 	  If unsure, say N.
 
+config OPTIMIZE_FSL_DMA_MEMCPY
+        bool "Optimized DMA/XOR offload: reduce raid5 memcpy which offloaded for fsl dma"
+        depends on ASYNC_TX_DMA
+        help
+          This allows the async_tx api try to reduce raid5 memcpy operations for
+          fsl dma. If you have fsl dma and talitos enabled, you can set it as Y,
+          else N.
+
 config DMATEST
 	tristate "DMA Test client"
 	depends on DMA_ENGINE
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index cbb50d3..9b80e52 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3,7 +3,8 @@
  *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
  *	   Copyright (C) 1999, 2000 Ingo Molnar
  *	   Copyright (C) 2002, 2003 H. Peter Anvin
- *
+ *	   Copyright (C) 2010, Freescale Semiconductor, Inc. All rights
+ *		reserved.
  * RAID-4/5/6 management functions.
  * Thanks to Penguin Computing for making the RAID-6 development possible
  * by donating a test server!
@@ -558,6 +559,14 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 				set_bit(STRIPE_DEGRADED, &sh->state);
 			pr_debug("skip op %ld on disc %d for sector %llu\n",
 				bi->bi_rw, i, (unsigned long long)sh->sector);
+#ifdef CONFIG_OPTIMIZE_FSL_DMA_MEMCPY
+			if (test_bit(R5_DirectAccess, &sh->dev[i].flags)) {
+				struct page *pg = sh->dev[i].page;
+				BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page ==
+					pg);
+				sh->dev[i].req.bi_io_vec[0].bv_page = pg;
+			}
+#endif
 			clear_bit(R5_LOCKED, &sh->dev[i].flags);
 			set_bit(STRIPE_HANDLE, &sh->state);
 		}
@@ -685,6 +694,7 @@ static void ops_run_biofill(struct stripe_head *sh)
 			dev->read = rbi = dev->toread;
 			dev->toread = NULL;
 			spin_unlock_irq(&conf->device_lock);
+
 			while (rbi && rbi->bi_sector <
 				dev->sector + STRIPE_SECTORS) {
 				tx = async_copy_data(0, rbi, dev->page,
@@ -754,10 +764,16 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
 		__func__, (unsigned long long)sh->sector, target);
 	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
 
-	for (i = disks; i--; )
+	for (i = disks; i--; ) {
+		struct r5dev *dev = &sh->dev[i];
+		struct page *pg = dev->page;
 		if (i != target)
-			xor_srcs[count++] = sh->dev[i].page;
-
+#ifdef CONFIG_OPTIMIZE_FSL_DMA_MEMCPY
+			if (test_bit(R5_DirectAccess, &dev->flags))
+				pg = dev->req.bi_io_vec[0].bv_page;
+#endif
+			xor_srcs[count++] = pg;
+	}
 	atomic_inc(&sh->count);
 
 	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
@@ -993,8 +1009,14 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
 	for (i = disks; i--; ) {
 		struct r5dev *dev = &sh->dev[i];
 		/* Only process blocks that are known to be uptodate */
-		if (test_bit(R5_Wantdrain, &dev->flags))
-			xor_srcs[count++] = dev->page;
+		if (test_bit(R5_Wantdrain, &dev->flags)) {
+			struct page *pg = dev->page;
+#ifdef CONFIG_OPTIMIZE_FSL_DMA_MEMCPY
+			if (test_bit(R5_DirectAccess, &dev->flags))
+				pg = dev->req.bi_io_vec[0].bv_page;
+#endif
+			xor_srcs[count++] = pg;
+		}
 	}
 
 	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
@@ -1004,6 +1026,32 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
 	return tx;
 }
 
+#ifdef CONFIG_OPTIMIZE_FSL_DMA_MEMCPY
+static struct page *raid5_zero_copy(struct bio *bio, sector_t sector)
+{
+        sector_t bi_sector = bio->bi_sector;
+        struct page *page = NULL;
+        struct bio_vec *bv;
+        int i;
+
+        bio_for_each_segment(bv, bio, i) {
+                if (sector == bi_sector)
+                        page = bio_iovec_idx(bio, i)->bv_page;
+
+                bi_sector += bio_iovec_idx(bio, i)->bv_len >> 9;
+                if (bi_sector >= sector + STRIPE_SECTORS) {
+                        /* check if the stripe is covered by one page */
+                        if (page == bio_iovec_idx(bio, i)->bv_page) {
+                                SetPageConstant(page);
+                                return page;
+                        }
+                        return NULL;
+                }
+        }
+        return NULL;
+}
+#endif
+
 static struct dma_async_tx_descriptor *
 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 {
@@ -1025,8 +1073,28 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 			dev->towrite = NULL;
 			BUG_ON(dev->written);
 			wbi = dev->written = chosen;
+#ifdef CONFIG_OPTIMIZE_FSL_DMA_MEMCPY
+			set_bit(R5_LOCKED, &dev->flags);
+			BUG_ON(test_bit(R5_DirectAccess, &dev->flags));
 			spin_unlock(&sh->lock);
 
+			if (!wbi->bi_next && test_bit(R5_OVERWRITE, &dev->flags)
+					&& test_bit(R5_Insync, &dev->flags)) {
+				struct page *pg = raid5_zero_copy(wbi,
+								dev->sector);
+				if (pg) {
+					dev->req.bi_io_vec[0].bv_page = pg;
+					set_bit(R5_DirectAccess, &dev->flags);
+					clear_bit(R5_UPTODATE, &dev->flags);
+					clear_bit(R5_OVERWRITE, &dev->flags);
+					continue;
+				}
+			}
+			clear_bit(R5_OVERWRITE, &dev->flags);
+			set_bit(R5_UPTODATE, &dev->flags);
+#else
+			spin_unlock(&sh->lock);
+#endif
 			while (wbi && wbi->bi_sector <
 				dev->sector + STRIPE_SECTORS) {
 				if (wbi->bi_rw & REQ_FUA)
@@ -1102,15 +1170,29 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
 		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
-			if (dev->written)
-				xor_srcs[count++] = dev->page;
+			struct page *pg = dev->page;
+
+			if (dev->written) {
+#ifdef CONFIG_OPTIMIZE_FSL_DMA_MEMCPY
+				if (test_bit(R5_DirectAccess, &dev->flags))
+					pg = dev->req.bi_io_vec[0].bv_page;
+#endif
+				xor_srcs[count++] = pg;
+			}
 		}
 	} else {
 		xor_dest = sh->dev[pd_idx].page;
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
-			if (i != pd_idx)
-				xor_srcs[count++] = dev->page;
+			struct page *pg = dev->page;
+
+			if (i != pd_idx) {
+#ifdef CONFIG_OPTIMIZE_FSL_DMA_MEMCPY
+				if (test_bit(R5_DirectAccess, &dev->flags))
+					pg = dev->req.bi_io_vec[0].bv_page;
+#endif
+				xor_srcs[count++] = pg;
+			}
 		}
 	}
 
@@ -1637,6 +1719,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
 			md_error(conf->mddev, rdev);
 		}
 	}
+
 	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
 	clear_bit(R5_LOCKED, &sh->dev[i].flags);
 	set_bit(STRIPE_HANDLE, &sh->state);
@@ -1666,15 +1749,19 @@ static void raid5_end_write_request(struct bio *bi, int error)
 		md_error(conf->mddev, conf->disks[i].rdev);
 
 	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
-	
+#ifdef CONFIG_OPTIMIZE_FSL_DMA_MEMCPY
+	if (test_bit(R5_DirectAccess, &sh->dev[i].flags)) {
+		BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
+		sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
+	}
+#endif
 	clear_bit(R5_LOCKED, &sh->dev[i].flags);
 	set_bit(STRIPE_HANDLE, &sh->state);
 	release_stripe(sh);
 }
 
-
 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
-	
+
 static void raid5_build_block(struct stripe_head *sh, int i, int previous)
 {
 	struct r5dev *dev = &sh->dev[i];
@@ -2505,7 +2592,11 @@ static void handle_stripe_clean_event(raid5_conf_t *conf,
 		if (sh->dev[i].written) {
 			dev = &sh->dev[i];
 			if (!test_bit(R5_LOCKED, &dev->flags) &&
-				test_bit(R5_UPTODATE, &dev->flags)) {
+				(test_bit(R5_UPTODATE, &dev->flags)
+#ifdef CONFIG_OPTIMIZE_FSL_DMA_MEMCPY
+				|| test_bit(R5_DirectAccess, &dev->flags)
+#endif
+			)) {
 				/* We can return any write requests */
 				struct bio *wbi, *wbi2;
 				int bitmap_end = 0;
@@ -2513,6 +2604,9 @@ static void handle_stripe_clean_event(raid5_conf_t *conf,
 				spin_lock_irq(&conf->device_lock);
 				wbi = dev->written;
 				dev->written = NULL;
+#ifdef CONFIG_OPTIMIZE_FSL_DMA_MEMCPY
+				clear_bit(R5_DirectAccess, &dev->flags);
+#endif
 				while (wbi && wbi->bi_sector <
 					dev->sector + STRIPE_SECTORS) {
 					wbi2 = r5_next_bio(wbi, dev->sector);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 3ca77a2..dccf34f 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -276,6 +276,12 @@ struct r6_state {
 				    */
 #define R5_Wantdrain	13 /* dev->towrite needs to be drained */
 #define R5_WantFUA	14	/* Write should be FUA */
+
+#ifdef CONFIG_OPTIMIZE_FSL_DMA_MEMCPY
+#define R5_DirectAccess 15 /* access cached pages directly instead of
+                                       * sh pages */
+#endif
+
 /*
  * Write method
  */
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 6081493..d2bbc94 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -104,6 +104,9 @@ enum pageflags {
 #ifdef CONFIG_MEMORY_FAILURE
 	PG_hwpoison,		/* hardware poisoned page. Don't touch */
 #endif
+#ifdef CONFIG_OPTIMIZE_FSL_DMA_MEMCPY
+        PG_constant,            /* const page not modified during raid5 io */
+#endif
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	PG_compound_lock,
 #endif
@@ -196,6 +199,14 @@ static inline int __TestClearPage##uname(struct page *page) { return 0; }
 
 struct page;	/* forward declaration */
 
+#ifdef CONFIG_OPTIMIZE_FSL_DMA_MEMCPY
+#define PageConstant(page) test_bit(PG_constant, &(page)->flags)
+#define SetPageConstant(page) set_bit(PG_constant, &(page)->flags)
+#define ClearPageConstant(page) clear_bit(PG_constant, &(page->flags))
+#define TestSetPageConstant(page) test_and_set_bit(PG_constant, &(page)->flags)
+extern void clear_page_constant(struct page *page);
+#endif
+
 TESTPAGEFLAG(Locked, locked)
 PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error)
 PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced)
diff --git a/mm/filemap.c b/mm/filemap.c
index a8251a8..f7d98ad 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -28,6 +28,11 @@
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
 #include <linux/blkdev.h>
+
+#ifdef CONFIG_OPTIMIZE_FSL_DMA_MEMCPY
+#include <linux/rmap.h>
+#endif
+
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/cpuset.h>
@@ -636,10 +641,26 @@ void end_page_writeback(struct page *page)
 		BUG();
 
 	smp_mb__after_clear_bit();
+
+#ifdef CONFIG_OPTIMIZE_FSL_DMA_MEMCPY
+	clear_page_constant(page);
+#endif
+
 	wake_up_page(page, PG_writeback);
 }
 EXPORT_SYMBOL(end_page_writeback);
 
+#ifdef CONFIG_OPTIMIZE_FSL_DMA_MEMCPY
+void clear_page_constant(struct page *page)
+{
+	if (PageConstant(page)) {
+		ClearPageConstant(page);
+		SetPageUptodate(page);
+	}
+}
+EXPORT_SYMBOL(clear_page_constant);
+#endif
+
 /**
  * __lock_page - get a lock on the page, assuming we need to sleep to get it
  * @page: the page to lock
-- 
1.7.0.4

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH][RFC]  RAID5/DMA/memcpy: zero copy the bio page when possible
  2011-12-28  5:20 b29237
@ 2012-01-02 23:13 ` NeilBrown
  0 siblings, 0 replies; 5+ messages in thread
From: NeilBrown @ 2012-01-02 23:13 UTC (permalink / raw)
  To: b29237; +Cc: linux-raid, linux-kernel, r58472

[-- Attachment #1: Type: text/plain, Size: 1349 bytes --]

On Wed, 28 Dec 2011 13:20:08 +0800 <b29237@freescale.com> wrote:

> From: Forrest shi <b29237@freescale.com>
> 
> 	use bio page directly instead of copying the bio page to stripe
> 	header page when possible.
> 
> 	Signed-off-by: Forrest Shi<b29237@freescale.com>

I'm sorry but if you want anyone to review this and comment on it you need to
give us a bit more help - like telling us what the purpose of the patch is.

There are a couple of hints but what is needed is a 'big picture'
description.  What is the problem you are fixing?  How are you fixing it?
What are the costs/benefits?

> @@ -754,10 +764,16 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
>  		__func__, (unsigned long long)sh->sector, target);
>  	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
>  
> -	for (i = disks; i--; )
> +	for (i = disks; i--; ) {
> +		struct r5dev *dev = &sh->dev[i];
> +		struct page *pg = dev->page;
>  		if (i != target)
> -			xor_srcs[count++] = sh->dev[i].page;
> -
> +#ifdef CONFIG_OPTIMIZE_FSL_DMA_MEMCPY
> +			if (test_bit(R5_DirectAccess, &dev->flags))
> +				pg = dev->req.bi_io_vec[0].bv_page;
> +#endif
> +			xor_srcs[count++] = pg;
> +	}
>  	atomic_inc(&sh->count);

And something is very wrong here.  I think you need { } around the body of
the 'if' statement.

NeilBrown


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 828 bytes --]

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2012-01-11  8:08 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2012-01-11  7:47 [PATCH][RFC] RAID5/DMA/memcpy: zero copy the bio page when possible b29237
2012-01-11  7:48 ` Shi Xuelin-B29237
2012-01-11  8:08 ` NeilBrown
  -- strict thread matches above, loose matches on Subject: below --
2011-12-28  5:20 b29237
2012-01-02 23:13 ` NeilBrown

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).