* [patch v2 0/6] raid5: automatically batch adjacent full stripe write
@ 2014-09-10 12:40 shli
2014-09-10 12:40 ` [patch v2 1/6] raid5: use flex_array for scribble data shli
` (6 more replies)
0 siblings, 7 replies; 9+ messages in thread
From: shli @ 2014-09-10 12:40 UTC (permalink / raw)
To: neilb, linux-raid
This is the second attempt to batch adjacent full stripe writes together. The
main change against v1 is how we detect whether stripes can be batched. Some
bugs are also fixed.
Thanks,
Shaohua
* [patch v2 1/6] raid5: use flex_array for scribble data
2014-09-10 12:40 [patch v2 0/6] raid5: automatically batch adjacent full stripe write shli
@ 2014-09-10 12:40 ` shli
2014-09-10 12:40 ` [patch v2 2/6] raid5: add a new flag to track if a stripe can be batched shli
` (5 subsequent siblings)
6 siblings, 0 replies; 9+ messages in thread
From: shli @ 2014-09-10 12:40 UTC (permalink / raw)
To: neilb, linux-raid
[-- Attachment #1: raid5-scribble-use-flex-array.patch --]
[-- Type: text/plain, Size: 12108 bytes --]
Use a flex_array for the scribble data. A later patch will batch several
stripes together, so the scribble data must be able to cover several stripes;
this patch therefore allocates scribble data for all stripes across a chunk.
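For orientation, a minimal sketch of the resulting layout (the _sketch helpers
are illustrative, not the patch's; addr_conv_t comes from <linux/async_tx.h>):
the scribble area becomes a flex_array with one preallocated element per stripe
of a chunk, each element holding a page-pointer list followed by an
address-conversion region.

#include <linux/async_tx.h>     /* addr_conv_t */
#include <linux/flex_array.h>

/* sketch: allocate one preallocated scribble element per stripe of a chunk */
static struct flex_array *scribble_alloc_sketch(int disks, int stripes_per_chunk,
                                                gfp_t flags)
{
        size_t len = sizeof(struct page *) * (disks + 2) +
                     sizeof(addr_conv_t) * (disks + 2);
        struct flex_array *fa = flex_array_alloc(len, stripes_per_chunk, flags);

        if (!fa)
                return NULL;
        /* prealloc every element so flex_array_get() never allocates or locks */
        if (flex_array_prealloc(fa, 0, stripes_per_chunk, flags)) {
                flex_array_free(fa);
                return NULL;
        }
        return fa;
}

/* the i-th element serves the i-th stripe: page list first, addr_conv after */
static addr_conv_t *to_addr_conv_sketch(struct flex_array *scribble, int disks, int i)
{
        return flex_array_get(scribble, i) + sizeof(struct page *) * (disks + 2);
}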
Signed-off-by: Shaohua Li <shli@fusionio.com>
---
drivers/md/raid5.c | 89 +++++++++++++++++++++++++++++++++--------------------
drivers/md/raid5.h | 6 ---
2 files changed, 57 insertions(+), 38 deletions(-)
Index: linux/drivers/md/raid5.c
===================================================================
--- linux.orig/drivers/md/raid5.c 2014-09-10 19:21:00.503475395 +0800
+++ linux/drivers/md/raid5.c 2014-09-10 19:21:00.499475454 +0800
@@ -54,6 +54,7 @@
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/nodemask.h>
+#include <linux/flex_array.h>
#include <trace/events/block.h>
#include "md.h"
@@ -1112,16 +1113,29 @@ static void ops_complete_compute(void *s
/* return a pointer to the address conversion region of the scribble buffer */
static addr_conv_t *to_addr_conv(struct stripe_head *sh,
- struct raid5_percpu *percpu)
+ struct raid5_percpu *percpu, int i)
{
- return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);
+ void *addr;
+
+ addr = flex_array_get(percpu->scribble, i);
+ return addr + sizeof(struct page *) * (sh->disks + 2);
+}
+
+/* return a pointer to the address conversion region of the scribble buffer */
+static struct page **to_addr_page(struct stripe_head *sh,
+ struct raid5_percpu *percpu, int i)
+{
+ void *addr;
+
+ addr = flex_array_get(percpu->scribble, i);
+ return addr;
}
static struct dma_async_tx_descriptor *
ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
{
int disks = sh->disks;
- struct page **xor_srcs = percpu->scribble;
+ struct page **xor_srcs = to_addr_page(sh, percpu, 0);
int target = sh->ops.target;
struct r5dev *tgt = &sh->dev[target];
struct page *xor_dest = tgt->page;
@@ -1141,7 +1155,7 @@ ops_run_compute5(struct stripe_head *sh,
atomic_inc(&sh->count);
init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
- ops_complete_compute, sh, to_addr_conv(sh, percpu));
+ ops_complete_compute, sh, to_addr_conv(sh, percpu, 0));
if (unlikely(count == 1))
tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
else
@@ -1186,7 +1200,7 @@ static struct dma_async_tx_descriptor *
ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
{
int disks = sh->disks;
- struct page **blocks = percpu->scribble;
+ struct page **blocks = to_addr_page(sh, percpu, 0);
int target;
int qd_idx = sh->qd_idx;
struct dma_async_tx_descriptor *tx;
@@ -1219,7 +1233,7 @@ ops_run_compute6_1(struct stripe_head *s
BUG_ON(blocks[count+1] != dest); /* q should already be set */
init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
ops_complete_compute, sh,
- to_addr_conv(sh, percpu));
+ to_addr_conv(sh, percpu, 0));
tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
} else {
/* Compute any data- or p-drive using XOR */
@@ -1232,7 +1246,7 @@ ops_run_compute6_1(struct stripe_head *s
init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
NULL, ops_complete_compute, sh,
- to_addr_conv(sh, percpu));
+ to_addr_conv(sh, percpu, 0));
tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
}
@@ -1251,7 +1265,7 @@ ops_run_compute6_2(struct stripe_head *s
struct r5dev *tgt = &sh->dev[target];
struct r5dev *tgt2 = &sh->dev[target2];
struct dma_async_tx_descriptor *tx;
- struct page **blocks = percpu->scribble;
+ struct page **blocks = to_addr_page(sh, percpu, 0);
struct async_submit_ctl submit;
pr_debug("%s: stripe %llu block1: %d block2: %d\n",
@@ -1293,7 +1307,7 @@ ops_run_compute6_2(struct stripe_head *s
/* Missing P+Q, just recompute */
init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
ops_complete_compute, sh,
- to_addr_conv(sh, percpu));
+ to_addr_conv(sh, percpu, 0));
return async_gen_syndrome(blocks, 0, syndrome_disks+2,
STRIPE_SIZE, &submit);
} else {
@@ -1317,21 +1331,21 @@ ops_run_compute6_2(struct stripe_head *s
init_async_submit(&submit,
ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
NULL, NULL, NULL,
- to_addr_conv(sh, percpu));
+ to_addr_conv(sh, percpu, 0));
tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
&submit);
count = set_syndrome_sources(blocks, sh);
init_async_submit(&submit, ASYNC_TX_FENCE, tx,
ops_complete_compute, sh,
- to_addr_conv(sh, percpu));
+ to_addr_conv(sh, percpu, 0));
return async_gen_syndrome(blocks, 0, count+2,
STRIPE_SIZE, &submit);
}
} else {
init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
ops_complete_compute, sh,
- to_addr_conv(sh, percpu));
+ to_addr_conv(sh, percpu, 0));
if (failb == syndrome_disks) {
/* We're missing D+P. */
return async_raid6_datap_recov(syndrome_disks+2,
@@ -1360,7 +1374,7 @@ ops_run_prexor(struct stripe_head *sh, s
struct dma_async_tx_descriptor *tx)
{
int disks = sh->disks;
- struct page **xor_srcs = percpu->scribble;
+ struct page **xor_srcs = to_addr_page(sh, percpu, 0);
int count = 0, pd_idx = sh->pd_idx, i;
struct async_submit_ctl submit;
@@ -1378,7 +1392,7 @@ ops_run_prexor(struct stripe_head *sh, s
}
init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
- ops_complete_prexor, sh, to_addr_conv(sh, percpu));
+ ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
return tx;
@@ -1482,7 +1496,7 @@ ops_run_reconstruct5(struct stripe_head
struct dma_async_tx_descriptor *tx)
{
int disks = sh->disks;
- struct page **xor_srcs = percpu->scribble;
+ struct page **xor_srcs = to_addr_page(sh, percpu, 0);
struct async_submit_ctl submit;
int count = 0, pd_idx = sh->pd_idx, i;
struct page *xor_dest;
@@ -1535,7 +1549,7 @@ ops_run_reconstruct5(struct stripe_head
atomic_inc(&sh->count);
init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
- to_addr_conv(sh, percpu));
+ to_addr_conv(sh, percpu, 0));
if (unlikely(count == 1))
tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
else
@@ -1547,7 +1561,7 @@ ops_run_reconstruct6(struct stripe_head
struct dma_async_tx_descriptor *tx)
{
struct async_submit_ctl submit;
- struct page **blocks = percpu->scribble;
+ struct page **blocks = to_addr_page(sh, percpu, 0);
int count, i;
pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
@@ -1571,7 +1585,7 @@ ops_run_reconstruct6(struct stripe_head
atomic_inc(&sh->count);
init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
- sh, to_addr_conv(sh, percpu));
+ sh, to_addr_conv(sh, percpu, 0));
async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
}
@@ -1593,7 +1607,7 @@ static void ops_run_check_p(struct strip
int pd_idx = sh->pd_idx;
int qd_idx = sh->qd_idx;
struct page *xor_dest;
- struct page **xor_srcs = percpu->scribble;
+ struct page **xor_srcs = to_addr_page(sh, percpu, 0);
struct dma_async_tx_descriptor *tx;
struct async_submit_ctl submit;
int count;
@@ -1612,7 +1626,7 @@ static void ops_run_check_p(struct strip
}
init_async_submit(&submit, 0, NULL, NULL, NULL,
- to_addr_conv(sh, percpu));
+ to_addr_conv(sh, percpu, 0));
tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
&sh->ops.zero_sum_result, &submit);
@@ -1623,7 +1637,7 @@ static void ops_run_check_p(struct strip
static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
{
- struct page **srcs = percpu->scribble;
+ struct page **srcs = to_addr_page(sh, percpu, 0);
struct async_submit_ctl submit;
int count;
@@ -1636,7 +1650,7 @@ static void ops_run_check_pq(struct stri
atomic_inc(&sh->count);
init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
- sh, to_addr_conv(sh, percpu));
+ sh, to_addr_conv(sh, percpu, 0));
async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
&sh->ops.zero_sum_result, percpu->spare_page, &submit);
}
@@ -1776,13 +1790,21 @@ static int grow_stripes(struct r5conf *c
* calculate over all devices (not just the data blocks), using zeros in place
* of the P and Q blocks.
*/
-static size_t scribble_len(int num)
+static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags)
{
+ struct flex_array *ret;
size_t len;
len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
-
- return len;
+ ret = flex_array_alloc(len, cnt, flags);
+ if (!ret)
+ return NULL;
+ /* always prealloc all elements, so no locking is required */
+ if (flex_array_prealloc(ret, 0, cnt, flags)) {
+ flex_array_free(ret);
+ return NULL;
+ }
+ return ret;
}
static int resize_stripes(struct r5conf *conf, int newsize)
@@ -1900,16 +1922,16 @@ static int resize_stripes(struct r5conf
err = -ENOMEM;
get_online_cpus();
- conf->scribble_len = scribble_len(newsize);
for_each_present_cpu(cpu) {
struct raid5_percpu *percpu;
- void *scribble;
+ struct flex_array *scribble;
percpu = per_cpu_ptr(conf->percpu, cpu);
- scribble = kmalloc(conf->scribble_len, GFP_NOIO);
+ scribble = scribble_alloc(newsize, conf->chunk_sectors /
+ STRIPE_SECTORS, GFP_NOIO);
if (scribble) {
- kfree(percpu->scribble);
+ flex_array_free(percpu->scribble);
percpu->scribble = scribble;
} else {
err = -ENOMEM;
@@ -5610,7 +5632,7 @@ raid5_size(struct mddev *mddev, sector_t
static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
{
safe_put_page(percpu->spare_page);
- kfree(percpu->scribble);
+ flex_array_free(percpu->scribble);
percpu->spare_page = NULL;
percpu->scribble = NULL;
}
@@ -5620,7 +5642,9 @@ static int alloc_scratch_buffer(struct r
if (conf->level == 6 && !percpu->spare_page)
percpu->spare_page = alloc_page(GFP_KERNEL);
if (!percpu->scribble)
- percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
+ percpu->scribble = scribble_alloc(max(conf->raid_disks,
+ conf->previous_raid_disks), conf->chunk_sectors /
+ STRIPE_SECTORS, GFP_KERNEL);
if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) {
free_scratch_buffer(conf, percpu);
@@ -5790,7 +5814,6 @@ static struct r5conf *setup_conf(struct
else
conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
max_disks = max(conf->raid_disks, conf->previous_raid_disks);
- conf->scribble_len = scribble_len(max_disks);
conf->disks = kzalloc(max_disks * sizeof(struct disk_info),
GFP_KERNEL);
@@ -5818,6 +5841,7 @@ static struct r5conf *setup_conf(struct
INIT_LIST_HEAD(conf->temp_inactive_list + i);
conf->level = mddev->new_level;
+ conf->chunk_sectors = mddev->new_chunk_sectors;
if (raid5_alloc_percpu(conf) != 0)
goto abort;
@@ -5850,7 +5874,6 @@ static struct r5conf *setup_conf(struct
conf->fullsync = 1;
}
- conf->chunk_sectors = mddev->new_chunk_sectors;
conf->level = mddev->new_level;
if (conf->level == 6)
conf->max_degraded = 2;
Index: linux/drivers/md/raid5.h
===================================================================
--- linux.orig/drivers/md/raid5.h 2014-09-10 19:21:00.503475395 +0800
+++ linux/drivers/md/raid5.h 2014-09-10 19:21:00.499475454 +0800
@@ -459,15 +459,11 @@ struct r5conf {
/* per cpu variables */
struct raid5_percpu {
struct page *spare_page; /* Used when checking P/Q in raid6 */
- void *scribble; /* space for constructing buffer
+ struct flex_array *scribble; /* space for constructing buffer
* lists and performing address
* conversions
*/
} __percpu *percpu;
- size_t scribble_len; /* size of scribble region must be
- * associated with conf to handle
- * cpu hotplug while reshaping
- */
#ifdef CONFIG_HOTPLUG_CPU
struct notifier_block cpu_notify;
#endif
* [patch v2 2/6] raid5: add a new flag to track if a stripe can be batched
2014-09-10 12:40 [patch v2 0/6] raid5: automatically batch adjacent full stripe write shli
2014-09-10 12:40 ` [patch v2 1/6] raid5: use flex_array for scribble data shli
@ 2014-09-10 12:40 ` shli
2014-09-10 12:40 ` [patch v2 3/6] raid5: track overwrite disk count shli
` (4 subsequent siblings)
6 siblings, 0 replies; 9+ messages in thread
From: shli @ 2014-09-10 12:40 UTC (permalink / raw)
To: neilb, linux-raid
[-- Attachment #1: raid5-add-new-flag-for-batching.patch --]
[-- Type: text/plain, Size: 2788 bytes --]
A freshly initialized stripe with a write request can be batched. Any time the
stripe is handled or a new read is queued, the flag is cleared.
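As a sketch of the flag's lifecycle (illustrative helpers, assuming the
stripe_head and state-bit definitions from this series): the bit is set when a
stripe is freshly initialized, and cleared once a read (or a write against the
old geometry) is queued or the stripe is handled.

/* sketch: STRIPE_BATCH_READY lifecycle (not the literal patch hunks) */
static void on_init_stripe(struct stripe_head *sh)
{
        set_bit(STRIPE_BATCH_READY, &sh->state);        /* fresh stripe: may batch */
}

static void on_add_stripe_bio(struct stripe_head *sh, int forwrite, int previous)
{
        if (!forwrite || previous)                      /* a read, or old geometry */
                clear_bit(STRIPE_BATCH_READY, &sh->state);
}

static void on_handle_stripe(struct stripe_head *sh)
{
        clear_bit(STRIPE_BATCH_READY, &sh->state);      /* state machine has run */
}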
Signed-off-by: Shaohua Li <shli@fusionio.com>
---
drivers/md/raid5.c | 12 +++++++++---
drivers/md/raid5.h | 1 +
2 files changed, 10 insertions(+), 3 deletions(-)
Index: linux/drivers/md/raid5.c
===================================================================
--- linux.orig/drivers/md/raid5.c 2014-09-10 19:21:08.183378868 +0800
+++ linux/drivers/md/raid5.c 2014-09-10 19:21:08.179378919 +0800
@@ -558,6 +558,7 @@ retry:
goto retry;
insert_hash(conf, sh);
sh->cpu = smp_processor_id();
+ set_bit(STRIPE_BATCH_READY, &sh->state);
}
static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
@@ -2653,7 +2654,8 @@ schedule_reconstruction(struct stripe_he
* toread/towrite point to the first in a chain.
* The bi_next chain must be in order.
*/
-static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
+static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
+ int forwrite, int previous)
{
struct bio **bip;
struct r5conf *conf = sh->raid_conf;
@@ -2686,6 +2688,9 @@ static int add_stripe_bio(struct stripe_
if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
goto overlap;
+ if (!forwrite || previous)
+ clear_bit(STRIPE_BATCH_READY, &sh->state);
+
BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
if (*bip)
bi->bi_next = *bip;
@@ -3754,6 +3759,7 @@ static void handle_stripe(struct stripe_
return;
}
+ clear_bit(STRIPE_BATCH_READY, &sh->state);
if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
spin_lock(&sh->stripe_lock);
/* Cannot process 'sync' concurrently with 'discard' */
@@ -4739,7 +4745,7 @@ static void make_request(struct mddev *m
}
if (test_bit(STRIPE_EXPANDING, &sh->state) ||
- !add_stripe_bio(sh, bi, dd_idx, rw)) {
+ !add_stripe_bio(sh, bi, dd_idx, rw, previous)) {
/* Stripe is busy expanding or
* add failed due to overlap. Flush everything
* and wait a while
@@ -5148,7 +5154,7 @@ static int retry_aligned_read(struct r5
return handled;
}
- if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
+ if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
release_stripe(sh);
raid5_set_bi_processed_stripes(raid_bio, scnt);
conf->retry_read_aligned = raid_bio;
Index: linux/drivers/md/raid5.h
===================================================================
--- linux.orig/drivers/md/raid5.h 2014-09-10 19:21:08.183378868 +0800
+++ linux/drivers/md/raid5.h 2014-09-10 19:21:08.179378919 +0800
@@ -327,6 +327,7 @@ enum {
STRIPE_ON_UNPLUG_LIST,
STRIPE_DISCARD,
STRIPE_ON_RELEASE_LIST,
+ STRIPE_BATCH_READY,
};
/*
* [patch v2 3/6] raid5: track overwrite disk count
2014-09-10 12:40 [patch v2 0/6] raid5: automatically batch adjacent full stripe write shli
2014-09-10 12:40 ` [patch v2 1/6] raid5: use flex_array for scribble data shli
2014-09-10 12:40 ` [patch v2 2/6] raid5: add a new flag to track if a stripe can be batched shli
@ 2014-09-10 12:40 ` shli
2014-09-10 12:40 ` [patch v2 4/6] RAID5: batch adjacent full stripe write shli
` (3 subsequent siblings)
6 siblings, 0 replies; 9+ messages in thread
From: shli @ 2014-09-10 12:40 UTC (permalink / raw)
To: neilb, linux-raid
[-- Attachment #1: raid5-track-full-stripe-write.patch --]
[-- Type: text/plain, Size: 3188 bytes --]
Track the overwrite disk count, so we can tell whether a stripe is a full
stripe write.
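The counter exists for the check it enables (a sketch mirroring the helper
added below): a stripe is a full stripe write exactly when every data disk,
i.e. all disks minus the max_degraded parity disks, is fully overwritten.

/* sketch: a full stripe write means every data disk is completely overwritten */
static bool is_full_stripe_write_sketch(struct stripe_head *sh)
{
        int data_disks = sh->disks - sh->raid_conf->max_degraded;      /* minus P/Q */

        return sh->overwrite_disks == data_disks;
}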
Signed-off-by: Shaohua Li <shli@fusionio.com>
---
drivers/md/raid5.c | 14 +++++++++++++-
drivers/md/raid5.h | 4 ++++
2 files changed, 17 insertions(+), 1 deletion(-)
Index: linux/drivers/md/raid5.c
===================================================================
--- linux.orig/drivers/md/raid5.c 2014-09-10 19:21:10.495349803 +0800
+++ linux/drivers/md/raid5.c 2014-09-10 19:21:10.491349853 +0800
@@ -556,6 +556,7 @@ retry:
}
if (read_seqcount_retry(&conf->gen_lock, seq))
goto retry;
+ sh->overwrite_disks = 0;
insert_hash(conf, sh);
sh->cpu = smp_processor_id();
set_bit(STRIPE_BATCH_READY, &sh->state);
@@ -713,6 +714,12 @@ get_active_stripe(struct r5conf *conf, s
return sh;
}
+static bool is_full_stripe_write(struct stripe_head *sh)
+{
+ BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded));
+ return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded);
+}
+
/* Determine if 'data_offset' or 'new_data_offset' should be used
* in this stripe_head.
*/
@@ -1418,6 +1425,7 @@ ops_run_biodrain(struct stripe_head *sh,
spin_lock_irq(&sh->stripe_lock);
chosen = dev->towrite;
dev->towrite = NULL;
+ sh->overwrite_disks = 0;
BUG_ON(dev->written);
wbi = dev->written = chosen;
spin_unlock_irq(&sh->stripe_lock);
@@ -2708,7 +2716,8 @@ static int add_stripe_bio(struct stripe_
sector = bio_end_sector(bi);
}
if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
- set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
+ if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags))
+ sh->overwrite_disks++;
}
pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
@@ -2780,6 +2789,7 @@ handle_failed_stripe(struct r5conf *conf
/* fail all writes first */
bi = sh->dev[i].towrite;
sh->dev[i].towrite = NULL;
+ sh->overwrite_disks = 0;
spin_unlock_irq(&sh->stripe_lock);
if (bi)
bitmap_end = 1;
@@ -4576,12 +4586,14 @@ static void make_discard_request(struct
}
set_bit(STRIPE_DISCARD, &sh->state);
finish_wait(&conf->wait_for_overlap, &w);
+ sh->overwrite_disks = 0;
for (d = 0; d < conf->raid_disks; d++) {
if (d == sh->pd_idx || d == sh->qd_idx)
continue;
sh->dev[d].towrite = bi;
set_bit(R5_OVERWRITE, &sh->dev[d].flags);
raid5_inc_bi_active_stripes(bi);
+ sh->overwrite_disks++;
}
spin_unlock_irq(&sh->stripe_lock);
if (conf->mddev->bitmap) {
Index: linux/drivers/md/raid5.h
===================================================================
--- linux.orig/drivers/md/raid5.h 2014-09-10 19:21:10.495349803 +0800
+++ linux/drivers/md/raid5.h 2014-09-10 19:21:10.491349853 +0800
@@ -210,6 +210,10 @@ struct stripe_head {
atomic_t count; /* nr of active thread/requests */
int bm_seq; /* sequence number for bitmap flushes */
int disks; /* disks in stripe */
+ int overwrite_disks; /* total overwrite disks in stripe,
+ * this is only checked when stripe
+ * has STRIPE_BATCH_READY
+ */
enum check_states check_state;
enum reconstruct_states reconstruct_state;
spinlock_t stripe_lock;
* [patch v2 4/6] RAID5: batch adjacent full stripe write
2014-09-10 12:40 [patch v2 0/6] raid5: automatically batch adjacent full stripe write shli
` (2 preceding siblings ...)
2014-09-10 12:40 ` [patch v2 3/6] raid5: track overwrite disk count shli
@ 2014-09-10 12:40 ` shli
2014-09-10 12:40 ` [patch v2 5/6] raid5: handle io error of batch list shli
` (2 subsequent siblings)
6 siblings, 0 replies; 9+ messages in thread
From: shli @ 2014-09-10 12:40 UTC (permalink / raw)
To: neilb, linux-raid
[-- Attachment #1: raid5-batch-stripe.patch --]
[-- Type: text/plain, Size: 22844 bytes --]
The stripe cache works in 4k units, so even adjacent full stripe writes are
handled 4k at a time. Ideally we should use a bigger size for adjacent full
stripe writes: a bigger stripe cache size means fewer stripes running in the
state machine, which reduces CPU overhead, and it also lets bigger IOs be
dispatched to the underlying disks.
With this patch, adjacent full stripe writes are automatically batched
together. Such stripes are added to a batch list. Only the first stripe of the
list is put on handle_list and so runs handle_stripe(). Some steps of
handle_stripe() are extended to cover all stripes of the list, including
ops_run_io, ops_run_biodrain and so on. With this patch we have fewer stripes
running in handle_stripe(), and we send the IO of the whole stripe list
together to increase IO size.
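The extended steps share one iteration pattern (a sketch of its shape only;
process_one_dev() is a hypothetical stand-in for the real per-device work):
start from the batch head, do the work for device i, then follow batch_list
around the circular list until it comes back to the head.

/* hypothetical per-device step, stands in for e.g. the ops_run_io body */
static void process_one_dev(struct stripe_head *sh, int i);

/* sketch: run the per-device work for the head stripe and every batched member */
static void run_for_whole_batch(struct stripe_head *head_sh, int i)
{
        struct stripe_head *sh = head_sh;

        do {
                process_one_dev(sh, i);
                if (!head_sh->batch_head)       /* not batched: head only */
                        break;
                sh = list_first_entry(&sh->batch_list,
                                      struct stripe_head, batch_list);
        } while (sh != head_sh);                /* list is circular through the head */
}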
Stripes added to a batch list have some limitations. A batch list can only
include full stripe writes and can't cross a chunk boundary, which makes sure
all stripes have the same parity disks. Stripes in a batch list must be in the
same state (no written, no toread and so on). If a stripe is in a batch list,
any new read/write passed to add_stripe_bio is blocked as an overlap conflict
until the batch list is handled. These limitations make sure the stripes in a
batch list stay in exactly the same state throughout their life cycle.
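Concretely, eligibility is a small gate, and attaching a stripe takes both
stripe locks in address order so two CPUs batching neighbouring stripes can't
deadlock (a sketch mirroring the helpers added below):

/* sketch: only an untouched full-stripe-write stripe may join a batch */
static bool stripe_can_batch_sketch(struct stripe_head *sh)
{
        return test_bit(STRIPE_BATCH_READY, &sh->state) &&
               is_full_stripe_write(sh);
}

/* sketch: always lock the lower-addressed stripe first to avoid ABBA deadlock */
static void lock_two_stripes_sketch(struct stripe_head *sh1, struct stripe_head *sh2)
{
        local_irq_disable();
        if (sh1 > sh2) {
                spin_lock(&sh2->stripe_lock);
                spin_lock_nested(&sh1->stripe_lock, 1);
        } else {
                spin_lock(&sh1->stripe_lock);
                spin_lock_nested(&sh2->stripe_lock, 1);
        }
}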
I tested 160k random writes (the full-stripe data size for this layout) on a
RAID5 array with a 32k chunk size and 6 PCIe SSDs. This patch improves
performance by around 30%, and the IO size sent to the underlying disks is
exactly 32k. I also ran a 4k random-write test on the same array to make sure
performance isn't changed by the patch.
Signed-off-by: Shaohua Li <shli@fusionio.com>
---
drivers/md/raid5.c | 351 +++++++++++++++++++++++++++++++++++++++++++++++++----
drivers/md/raid5.h | 4
2 files changed, 331 insertions(+), 24 deletions(-)
Index: linux/drivers/md/raid5.c
===================================================================
--- linux.orig/drivers/md/raid5.c 2014-09-10 19:21:12.595323404 +0800
+++ linux/drivers/md/raid5.c 2014-09-10 19:21:12.591323456 +0800
@@ -526,6 +526,7 @@ static void init_stripe(struct stripe_he
BUG_ON(atomic_read(&sh->count) != 0);
BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
BUG_ON(stripe_operations_active(sh));
+ BUG_ON(sh->batch_head);
pr_debug("init_stripe called, stripe %llu\n",
(unsigned long long)sh->sector);
@@ -720,6 +721,124 @@ static bool is_full_stripe_write(struct
return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded);
}
+static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
+{
+ local_irq_disable();
+ if (sh1 > sh2) {
+ spin_lock(&sh2->stripe_lock);
+ spin_lock_nested(&sh1->stripe_lock, 1);
+ } else {
+ spin_lock(&sh1->stripe_lock);
+ spin_lock_nested(&sh2->stripe_lock, 1);
+ }
+}
+
+static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
+{
+ spin_unlock(&sh1->stripe_lock);
+ spin_unlock(&sh2->stripe_lock);
+ local_irq_enable();
+}
+
+/* Only freshly new full stripe normal write stripe can be added to a batch list */
+static bool stripe_can_batch(struct stripe_head *sh)
+{
+ return test_bit(STRIPE_BATCH_READY, &sh->state) &&
+ is_full_stripe_write(sh);
+}
+
+/* we only do back search */
+static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh)
+{
+ struct stripe_head *head;
+ sector_t head_sector;
+ int hash;
+ int dd_idx;
+
+ if (!stripe_can_batch(sh))
+ return;
+ /* Don't cross chunks, so stripe pd_idx/qd_idx is the same */
+ if (!(sh->sector % conf->chunk_sectors))
+ return;
+ head_sector = sh->sector - STRIPE_SECTORS;
+
+ hash = stripe_hash_locks_hash(head_sector);
+ spin_lock_irq(conf->hash_locks + hash);
+ head = __find_stripe(conf, head_sector, conf->generation);
+ if (head && !atomic_inc_not_zero(&head->count)) {
+ spin_lock(&conf->device_lock);
+ if (!atomic_read(&head->count)) {
+ if (!test_bit(STRIPE_HANDLE, &head->state))
+ atomic_inc(&conf->active_stripes);
+ BUG_ON(list_empty(&head->lru) &&
+ !test_bit(STRIPE_EXPANDING, &head->state));
+ list_del_init(&head->lru);
+ if (head->group) {
+ head->group->stripes_cnt--;
+ head->group = NULL;
+ }
+ }
+ atomic_inc(&head->count);
+ spin_unlock(&conf->device_lock);
+ }
+ spin_unlock_irq(conf->hash_locks + hash);
+
+ if (!head)
+ return;
+ if (!stripe_can_batch(head))
+ goto out;
+
+ lock_two_stripes(head, sh);
+ /* clear_batch_ready clear the flag */
+ if (!stripe_can_batch(head) || !stripe_can_batch(sh))
+ goto unlock_out;
+
+ if (sh->batch_head)
+ goto unlock_out;
+
+ dd_idx = 0;
+ while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
+ dd_idx++;
+ if (head->dev[dd_idx].towrite->bi_rw != sh->dev[dd_idx].towrite->bi_rw)
+ goto unlock_out;
+
+ if (head->batch_head) {
+ spin_lock(&head->batch_head->batch_lock);
+ /* This batch list is already running */
+ if (!stripe_can_batch(head)) {
+ spin_unlock(&head->batch_head->batch_lock);
+ goto unlock_out;
+ }
+
+ /*
+ * at this point, head's BATCH_READY could be cleared, but we
+ * can still add the stripe to batch list
+ */
+ list_add(&sh->batch_list, &head->batch_list);
+ spin_unlock(&head->batch_head->batch_lock);
+
+ sh->batch_head = head->batch_head;
+ } else {
+ head->batch_head = head;
+ sh->batch_head = head->batch_head;
+ spin_lock(&head->batch_lock);
+ list_add_tail(&sh->batch_list, &head->batch_list);
+ spin_unlock(&head->batch_lock);
+ }
+
+ if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ if (atomic_dec_return(&conf->preread_active_stripes)
+ < IO_THRESHOLD)
+ md_wakeup_thread(conf->mddev->thread);
+
+ atomic_inc(&sh->count);
+unlock_out:
+ unlock_two_stripes(head, sh);
+out:
+ release_stripe(head);
+}
+
+
/* Determine if 'data_offset' or 'new_data_offset' should be used
* in this stripe_head.
*/
@@ -750,6 +869,7 @@ static void ops_run_io(struct stripe_hea
{
struct r5conf *conf = sh->raid_conf;
int i, disks = sh->disks;
+ struct stripe_head *head_sh = sh;
might_sleep();
@@ -758,6 +878,8 @@ static void ops_run_io(struct stripe_hea
int replace_only = 0;
struct bio *bi, *rbi;
struct md_rdev *rdev, *rrdev = NULL;
+
+ sh = head_sh;
if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
rw = WRITE_FUA;
@@ -776,6 +898,7 @@ static void ops_run_io(struct stripe_hea
if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
rw |= REQ_SYNC;
+again:
bi = &sh->dev[i].req;
rbi = &sh->dev[i].rreq; /* For writing to replacement */
@@ -794,7 +917,7 @@ static void ops_run_io(struct stripe_hea
/* We raced and saw duplicates */
rrdev = NULL;
} else {
- if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev)
+ if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev)
rdev = rrdev;
rrdev = NULL;
}
@@ -865,13 +988,15 @@ static void ops_run_io(struct stripe_hea
__func__, (unsigned long long)sh->sector,
bi->bi_rw, i);
atomic_inc(&sh->count);
+ if (sh != head_sh)
+ atomic_inc(&head_sh->count);
if (use_new_offset(conf, sh))
bi->bi_iter.bi_sector = (sh->sector
+ rdev->new_data_offset);
else
bi->bi_iter.bi_sector = (sh->sector
+ rdev->data_offset);
- if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
+ if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags))
bi->bi_rw |= REQ_NOMERGE;
if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
@@ -915,6 +1040,8 @@ static void ops_run_io(struct stripe_hea
__func__, (unsigned long long)sh->sector,
rbi->bi_rw, i);
atomic_inc(&sh->count);
+ if (sh != head_sh)
+ atomic_inc(&head_sh->count);
if (use_new_offset(conf, sh))
rbi->bi_iter.bi_sector = (sh->sector
+ rrdev->new_data_offset);
@@ -948,6 +1075,13 @@ static void ops_run_io(struct stripe_hea
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
}
+
+ if (!head_sh->batch_head)
+ continue;
+ sh = list_first_entry(&sh->batch_list, struct stripe_head,
+ batch_list);
+ if (sh != head_sh)
+ goto again;
}
}
@@ -1063,6 +1197,7 @@ static void ops_run_biofill(struct strip
struct async_submit_ctl submit;
int i;
+ BUG_ON(sh->batch_head);
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
@@ -1152,6 +1287,8 @@ ops_run_compute5(struct stripe_head *sh,
struct async_submit_ctl submit;
int i;
+ BUG_ON(sh->batch_head);
+
pr_debug("%s: stripe %llu block: %d\n",
__func__, (unsigned long long)sh->sector, target);
BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
@@ -1218,6 +1355,7 @@ ops_run_compute6_1(struct stripe_head *s
int i;
int count;
+ BUG_ON(sh->batch_head);
if (sh->ops.target < 0)
target = sh->ops.target2;
else if (sh->ops.target2 < 0)
@@ -1276,6 +1414,7 @@ ops_run_compute6_2(struct stripe_head *s
struct page **blocks = to_addr_page(sh, percpu, 0);
struct async_submit_ctl submit;
+ BUG_ON(sh->batch_head);
pr_debug("%s: stripe %llu block1: %d block2: %d\n",
__func__, (unsigned long long)sh->sector, target, target2);
BUG_ON(target < 0 || target2 < 0);
@@ -1389,6 +1528,7 @@ ops_run_prexor(struct stripe_head *sh, s
/* existing parity data subtracted */
struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
+ BUG_ON(sh->batch_head);
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
@@ -1411,17 +1551,21 @@ ops_run_biodrain(struct stripe_head *sh,
{
int disks = sh->disks;
int i;
+ struct stripe_head *head_sh = sh;
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
for (i = disks; i--; ) {
- struct r5dev *dev = &sh->dev[i];
+ struct r5dev *dev;
struct bio *chosen;
- if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
+ sh = head_sh;
+ if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) {
struct bio *wbi;
+again:
+ dev = &sh->dev[i];
spin_lock_irq(&sh->stripe_lock);
chosen = dev->towrite;
dev->towrite = NULL;
@@ -1450,6 +1594,14 @@ ops_run_biodrain(struct stripe_head *sh,
}
wbi = r5_next_bio(wbi, dev->sector);
}
+
+ if (head_sh->batch_head) {
+ sh = list_first_entry(&sh->batch_list,
+ struct stripe_head, batch_list);
+ if (sh == head_sh)
+ continue;
+ goto again;
+ }
}
}
@@ -1505,12 +1657,15 @@ ops_run_reconstruct5(struct stripe_head
struct dma_async_tx_descriptor *tx)
{
int disks = sh->disks;
- struct page **xor_srcs = to_addr_page(sh, percpu, 0);
+ struct page **xor_srcs;
struct async_submit_ctl submit;
- int count = 0, pd_idx = sh->pd_idx, i;
+ int count, pd_idx = sh->pd_idx, i;
struct page *xor_dest;
int prexor = 0;
unsigned long flags;
+ int j = 0;
+ struct stripe_head *head_sh = sh;
+ int last_stripe;
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
@@ -1527,15 +1682,18 @@ ops_run_reconstruct5(struct stripe_head
ops_complete_reconstruct(sh);
return;
}
+again:
+ count = 0;
+ xor_srcs = to_addr_page(sh, percpu, j);
/* check if prexor is active which means only process blocks
* that are part of a read-modify-write (written)
*/
- if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
+ if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
prexor = 1;
xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
- if (dev->written)
+ if (head_sh->dev[i].written)
xor_srcs[count++] = dev->page;
}
} else {
@@ -1552,17 +1710,31 @@ ops_run_reconstruct5(struct stripe_head
* set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
* for the synchronous xor case
*/
- flags = ASYNC_TX_ACK |
- (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
-
- atomic_inc(&sh->count);
+ last_stripe = !head_sh->batch_head || list_first_entry(&sh->batch_list,
+ struct stripe_head, batch_list) == head_sh;
+ if (last_stripe) {
+ flags = ASYNC_TX_ACK |
+ (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
+
+ atomic_inc(&head_sh->count);
+ init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh,
+ to_addr_conv(sh, percpu, j));
+ } else {
+ flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST;
+ init_async_submit(&submit, flags, tx, NULL, NULL,
+ to_addr_conv(sh, percpu, j));
+ }
- init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
- to_addr_conv(sh, percpu, 0));
if (unlikely(count == 1))
tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
else
tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
+ if (!last_stripe) {
+ j++;
+ sh = list_first_entry(&sh->batch_list, struct stripe_head,
+ batch_list);
+ goto again;
+ }
}
static void
@@ -1570,8 +1742,10 @@ ops_run_reconstruct6(struct stripe_head
struct dma_async_tx_descriptor *tx)
{
struct async_submit_ctl submit;
- struct page **blocks = to_addr_page(sh, percpu, 0);
- int count, i;
+ struct page **blocks;
+ int count, i, j = 0;
+ struct stripe_head *head_sh = sh;
+ int last_stripe;
pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
@@ -1589,13 +1763,26 @@ ops_run_reconstruct6(struct stripe_head
return;
}
+again:
+ blocks = to_addr_page(sh, percpu, j);
count = set_syndrome_sources(blocks, sh);
+ last_stripe = !head_sh->batch_head || list_first_entry(&sh->batch_list,
+ struct stripe_head, batch_list) == head_sh;
- atomic_inc(&sh->count);
-
- init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
- sh, to_addr_conv(sh, percpu, 0));
+ if (last_stripe) {
+ atomic_inc(&head_sh->count);
+ init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
+ head_sh, to_addr_conv(sh, percpu, j));
+ } else
+ init_async_submit(&submit, 0, tx, NULL, NULL,
+ to_addr_conv(sh, percpu, j));
async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
+ if (!last_stripe) {
+ j++;
+ sh = list_first_entry(&sh->batch_list, struct stripe_head,
+ batch_list);
+ goto again;
+ }
}
static void ops_complete_check(void *stripe_head_ref)
@@ -1625,6 +1812,7 @@ static void ops_run_check_p(struct strip
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
+ BUG_ON(sh->batch_head);
count = 0;
xor_dest = sh->dev[pd_idx].page;
xor_srcs[count++] = xor_dest;
@@ -1653,6 +1841,7 @@ static void ops_run_check_pq(struct stri
pr_debug("%s: stripe %llu checkp: %d\n", __func__,
(unsigned long long)sh->sector, checkp);
+ BUG_ON(sh->batch_head);
count = set_syndrome_sources(srcs, sh);
if (!checkp)
srcs[count] = NULL;
@@ -1720,7 +1909,7 @@ static void raid_run_ops(struct stripe_h
BUG();
}
- if (overlap_clear)
+ if (overlap_clear && !sh->batch_head)
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if (test_and_clear_bit(R5_Overlap, &dev->flags))
@@ -1750,6 +1939,10 @@ static int grow_one_stripe(struct r5conf
atomic_set(&sh->count, 1);
atomic_inc(&conf->active_stripes);
INIT_LIST_HEAD(&sh->lru);
+
+ spin_lock_init(&sh->batch_lock);
+ INIT_LIST_HEAD(&sh->batch_list);
+ sh->batch_head = NULL;
release_stripe(sh);
return 1;
}
@@ -2193,6 +2386,9 @@ static void raid5_end_write_request(stru
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
+
+ if (sh->batch_head && sh != sh->batch_head)
+ release_stripe(sh->batch_head);
}
static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
@@ -2682,6 +2878,9 @@ static int add_stripe_bio(struct stripe_
* protect it.
*/
spin_lock_irq(&sh->stripe_lock);
+ /* Don't allow new IO added to stripes in batch list */
+ if (sh->batch_head)
+ goto overlap;
if (forwrite) {
bip = &sh->dev[dd_idx].towrite;
if (*bip == NULL)
@@ -2731,6 +2930,9 @@ static int add_stripe_bio(struct stripe_
sh->bm_seq = conf->seq_flush+1;
set_bit(STRIPE_BIT_DELAY, &sh->state);
}
+
+ if (stripe_can_batch(sh))
+ stripe_add_to_batch_list(conf, sh);
return 1;
overlap:
@@ -2763,6 +2965,7 @@ handle_failed_stripe(struct r5conf *conf
struct bio **return_bi)
{
int i;
+ BUG_ON(sh->batch_head);
for (i = disks; i--; ) {
struct bio *bi;
int bitmap_end = 0;
@@ -2878,6 +3081,7 @@ handle_failed_sync(struct r5conf *conf,
int abort = 0;
int i;
+ BUG_ON(sh->batch_head);
clear_bit(STRIPE_SYNCING, &sh->state);
if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
wake_up(&conf->wait_for_overlap);
@@ -3034,6 +3238,7 @@ static void handle_stripe_fill(struct st
{
int i;
+ BUG_ON(sh->batch_head);
/* look for blocks to read/compute, skip this if a compute
* is already in flight, or if the stripe contents are in the
* midst of changing due to a write
@@ -3058,6 +3263,9 @@ static void handle_stripe_clean_event(st
int i;
struct r5dev *dev;
int discard_pending = 0;
+ struct stripe_head *head_sh = sh;
+ bool do_endio = false;
+ int wakeup_nr = 0;
for (i = disks; i--; )
if (sh->dev[i].written) {
@@ -3073,8 +3281,11 @@ static void handle_stripe_clean_event(st
clear_bit(R5_UPTODATE, &dev->flags);
if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) {
WARN_ON(test_bit(R5_UPTODATE, &dev->flags));
- dev->page = dev->orig_page;
}
+ do_endio = true;
+
+returnbi:
+ dev->page = dev->orig_page;
wbi = dev->written;
dev->written = NULL;
while (wbi && wbi->bi_iter.bi_sector <
@@ -3091,6 +3302,16 @@ static void handle_stripe_clean_event(st
STRIPE_SECTORS,
!test_bit(STRIPE_DEGRADED, &sh->state),
0);
+ if (head_sh->batch_head) {
+ sh = list_first_entry(&sh->batch_list,
+ struct stripe_head, batch_list);
+ if (sh != head_sh) {
+ dev = &sh->dev[i];
+ goto returnbi;
+ }
+ }
+ sh = head_sh;
+ dev = &sh->dev[i];
} else if (test_bit(R5_Discard, &dev->flags))
discard_pending = 1;
WARN_ON(test_bit(R5_SkipCopy, &dev->flags));
@@ -3112,8 +3333,17 @@ static void handle_stripe_clean_event(st
* will be reinitialized
*/
spin_lock_irq(&conf->device_lock);
+unhash:
remove_hash(sh);
+ if (head_sh->batch_head) {
+ sh = list_first_entry(&sh->batch_list,
+ struct stripe_head, batch_list);
+ if (sh != head_sh)
+ goto unhash;
+ }
spin_unlock_irq(&conf->device_lock);
+ sh = head_sh;
+
if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
set_bit(STRIPE_HANDLE, &sh->state);
@@ -3122,6 +3352,39 @@ static void handle_stripe_clean_event(st
if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
if (atomic_dec_and_test(&conf->pending_full_writes))
md_wakeup_thread(conf->mddev->thread);
+
+ if (!head_sh->batch_head || !do_endio)
+ return;
+ for (i = 0; i < head_sh->disks; i++) {
+ if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
+ wakeup_nr++;
+ }
+ while (!list_empty(&head_sh->batch_list)) {
+ int i;
+ sh = list_first_entry(&head_sh->batch_list,
+ struct stripe_head, batch_list);
+ list_del_init(&sh->batch_list);
+
+ sh->state = head_sh->state & (~((1 << STRIPE_ACTIVE) |
+ (1 << STRIPE_PREREAD_ACTIVE)));
+ sh->check_state = head_sh->check_state;
+ sh->reconstruct_state = head_sh->reconstruct_state;
+ for (i = 0; i < sh->disks; i++) {
+ if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+ wakeup_nr++;
+ sh->dev[i].flags = head_sh->dev[i].flags;
+ }
+
+ spin_lock_irq(&sh->stripe_lock);
+ sh->batch_head = NULL;
+ spin_unlock_irq(&sh->stripe_lock);
+ release_stripe(sh);
+ }
+
+ spin_lock_irq(&head_sh->stripe_lock);
+ head_sh->batch_head = NULL;
+ spin_unlock_irq(&head_sh->stripe_lock);
+ wake_up_nr(&conf->wait_for_overlap, wakeup_nr);
}
static void handle_stripe_dirtying(struct r5conf *conf,
@@ -3255,6 +3518,7 @@ static void handle_parity_checks5(struct
{
struct r5dev *dev = NULL;
+ BUG_ON(sh->batch_head);
set_bit(STRIPE_HANDLE, &sh->state);
switch (sh->check_state) {
@@ -3346,6 +3610,7 @@ static void handle_parity_checks6(struct
int qd_idx = sh->qd_idx;
struct r5dev *dev;
+ BUG_ON(sh->batch_head);
set_bit(STRIPE_HANDLE, &sh->state);
BUG_ON(s->failed > 2);
@@ -3509,6 +3774,7 @@ static void handle_stripe_expansion(stru
* copy some of them into a target stripe for expand.
*/
struct dma_async_tx_descriptor *tx = NULL;
+ BUG_ON(sh->batch_head);
clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
for (i = 0; i < sh->disks; i++)
if (i != sh->pd_idx && i != sh->qd_idx) {
@@ -3752,6 +4018,38 @@ static void analyse_stripe(struct stripe
rcu_read_unlock();
}
+static int clear_batch_ready(struct stripe_head *sh)
+{
+ struct stripe_head *tmp;
+ if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
+ return 0;
+ spin_lock(&sh->stripe_lock);
+ if (!sh->batch_head) {
+ spin_unlock(&sh->stripe_lock);
+ return 0;
+ }
+
+ /*
+ * this stripe could be added to a batch list before we check
+ * BATCH_READY, skips it
+ */
+ if (sh->batch_head != sh) {
+ spin_unlock(&sh->stripe_lock);
+ return 1;
+ }
+ spin_lock(&sh->batch_lock);
+ list_for_each_entry(tmp, &sh->batch_list, batch_list)
+ clear_bit(STRIPE_BATCH_READY, &tmp->state);
+ spin_unlock(&sh->batch_lock);
+ spin_unlock(&sh->stripe_lock);
+
+ /*
+ * BATCH_READY is cleared, no new stripes can be added.
+ * batch_list can be accessed without lock
+ */
+ return 0;
+}
+
static void handle_stripe(struct stripe_head *sh)
{
struct stripe_head_state s;
@@ -3769,7 +4067,11 @@ static void handle_stripe(struct stripe_
return;
}
- clear_bit(STRIPE_BATCH_READY, &sh->state);
+ if (clear_batch_ready(sh) ) {
+ clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
+ return;
+ }
+
if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
spin_lock(&sh->stripe_lock);
/* Cannot process 'sync' concurrently with 'discard' */
@@ -4770,7 +5072,8 @@ static void make_request(struct mddev *m
}
set_bit(STRIPE_HANDLE, &sh->state);
clear_bit(STRIPE_DELAYED, &sh->state);
- if ((bi->bi_rw & REQ_SYNC) &&
+ if ((!sh->batch_head || sh == sh->batch_head) &&
+ (bi->bi_rw & REQ_SYNC) &&
!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
atomic_inc(&conf->preread_active_stripes);
release_stripe_plug(mddev, sh);
Index: linux/drivers/md/raid5.h
===================================================================
--- linux.orig/drivers/md/raid5.h 2014-09-10 19:21:12.595323404 +0800
+++ linux/drivers/md/raid5.h 2014-09-10 19:21:12.591323456 +0800
@@ -219,6 +219,10 @@ struct stripe_head {
spinlock_t stripe_lock;
int cpu;
struct r5worker_group *group;
+
+ struct stripe_head *batch_head; /* protected by stripe lock */
+ spinlock_t batch_lock; /* only header's lock is useful */
+ struct list_head batch_list; /* protected by head's batch lock*/
/**
* struct stripe_operations
* @target - STRIPE_OP_COMPUTE_BLK target
* [patch v2 5/6] raid5: handle io error of batch list
2014-09-10 12:40 [patch v2 0/6] raid5: automatically batch adjacent full stripe write shli
` (3 preceding siblings ...)
2014-09-10 12:40 ` [patch v2 4/6] RAID5: batch adjacent full stripe write shli
@ 2014-09-10 12:40 ` shli
2014-09-10 12:40 ` [patch v2 6/6] raid5: handle expansion/resync case with stripe batching shli
2014-09-11 7:15 ` [patch v2 0/6] raid5: automatically batch adjacent full stripe write NeilBrown
6 siblings, 0 replies; 9+ messages in thread
From: shli @ 2014-09-10 12:40 UTC (permalink / raw)
To: neilb, linux-raid
[-- Attachment #1: raid5-handle-batch-io-err.patch --]
[-- Type: text/plain, Size: 3070 bytes --]
If an IO error happens in any stripe of a batch list, the batch list is
split, and normal processing then runs for each stripe in the list.
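A sketch of the flow (simplified: the real check_break_stripe_batch_list()
below only detaches stripes from the first STRIPE_DEGRADED one onward, copies
state and flags from the head, and takes the stripe lock while clearing
batch_head):

/* sketch: any member's write error marks the whole batch for splitting */
static void note_batch_io_error(struct stripe_head *sh)
{
        if (sh->batch_head)
                set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
}

/* sketch: handle_stripe() on the head then detaches members for normal handling */
static void split_batch_list(struct stripe_head *head_sh)
{
        struct stripe_head *sh, *next;

        if (!test_and_clear_bit(STRIPE_BATCH_ERR, &head_sh->state))
                return;
        list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) {
                list_del_init(&sh->batch_list);
                sh->batch_head = NULL;                  /* locking omitted here */
                set_bit(STRIPE_HANDLE, &sh->state);
                release_stripe(sh);
        }
}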
Signed-off-by: Shaohua Li <shli@fusionio.com>
---
drivers/md/raid5.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
drivers/md/raid5.h | 1 +
2 files changed, 48 insertions(+)
Index: linux/drivers/md/raid5.c
===================================================================
--- linux.orig/drivers/md/raid5.c 2014-09-10 19:21:14.723296649 +0800
+++ linux/drivers/md/raid5.c 2014-09-10 19:21:14.715296750 +0800
@@ -1073,6 +1073,9 @@ again:
pr_debug("skip op %ld on disc %d for sector %llu\n",
bi->bi_rw, i, (unsigned long long)sh->sector);
clear_bit(R5_LOCKED, &sh->dev[i].flags);
+ if (sh->batch_head)
+ set_bit(STRIPE_BATCH_ERR,
+ &sh->batch_head->state);
set_bit(STRIPE_HANDLE, &sh->state);
}
@@ -2382,6 +2385,9 @@ static void raid5_end_write_request(stru
}
rdev_dec_pending(rdev, conf->mddev);
+ if (sh->batch_head && !uptodate)
+ set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
+
if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
@@ -4050,6 +4056,45 @@ static int clear_batch_ready(struct stri
return 0;
}
+static void check_break_stripe_batch_list(struct stripe_head *sh)
+{
+ struct stripe_head *head_sh, *next;
+ int i;
+
+ if (!test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
+ return;
+
+ head_sh = sh;
+ do {
+ sh = list_first_entry(&sh->batch_list,
+ struct stripe_head, batch_list);
+ BUG_ON(sh == head_sh);
+ } while (!test_bit(STRIPE_DEGRADED, &sh->state));
+
+ while (sh != head_sh) {
+ next = list_first_entry(&sh->batch_list,
+ struct stripe_head, batch_list);
+ list_del_init(&sh->batch_list);
+
+ sh->state = head_sh->state & (~((1 << STRIPE_ACTIVE) |
+ (1 << STRIPE_PREREAD_ACTIVE) | (1 << STRIPE_DEGRADED)));
+ sh->check_state = head_sh->check_state;
+ sh->reconstruct_state = head_sh->reconstruct_state;
+ for (i = 0; i < sh->disks; i++)
+ sh->dev[i].flags = head_sh->dev[i].flags &
+ (~((1 << R5_WriteError) | (1 << R5_Overlap)));
+
+ spin_lock_irq(&sh->stripe_lock);
+ sh->batch_head = NULL;
+ spin_unlock_irq(&sh->stripe_lock);
+
+ set_bit(STRIPE_HANDLE, &sh->state);
+ release_stripe(sh);
+
+ sh = next;
+ }
+}
+
static void handle_stripe(struct stripe_head *sh)
{
struct stripe_head_state s;
@@ -4072,6 +4117,8 @@ static void handle_stripe(struct stripe_
return;
}
+ check_break_stripe_batch_list(sh);
+
if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
spin_lock(&sh->stripe_lock);
/* Cannot process 'sync' concurrently with 'discard' */
Index: linux/drivers/md/raid5.h
===================================================================
--- linux.orig/drivers/md/raid5.h 2014-09-10 19:21:14.723296649 +0800
+++ linux/drivers/md/raid5.h 2014-09-10 19:21:14.719296699 +0800
@@ -336,6 +336,7 @@ enum {
STRIPE_DISCARD,
STRIPE_ON_RELEASE_LIST,
STRIPE_BATCH_READY,
+ STRIPE_BATCH_ERR,
};
/*
* [patch v2 6/6] raid5: handle expansion/resync case with stripe batching
2014-09-10 12:40 [patch v2 0/6] raid5: automatically batch adjacent full stripe write shli
` (4 preceding siblings ...)
2014-09-10 12:40 ` [patch v2 5/6] raid5: handle io error of batch list shli
@ 2014-09-10 12:40 ` shli
2014-09-11 7:15 ` [patch v2 0/6] raid5: automatically batch adjacent full stripe write NeilBrown
6 siblings, 0 replies; 9+ messages in thread
From: shli @ 2014-09-10 12:40 UTC (permalink / raw)
To: neilb, linux-raid
[-- Attachment #1: raid5-handle-resync-stripe-for-batching.patch --]
[-- Type: text/plain, Size: 3577 bytes --]
Expansion/resync can grab a stripe while the stripe is in a batch list. Since
all stripes in a batch list must be in the same state, we can't allow some of
them to run into expansion/resync. So we delay expansion/resync for stripes in
a batch list.
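A sketch of the gating (illustrative; the real change adjusts analyse_stripe()
and the STRIPE_SYNC_REQUESTED check in handle_stripe() as shown below):

/* sketch: stripes sitting in a batch list never enter expand/resync handling */
static void analyse_expand_state(struct stripe_head *sh, struct stripe_head_state *s)
{
        s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head;
        s->expanded  = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head;
}

static bool can_process_sync(struct stripe_head *sh)
{
        /* a pending sync request waits until the stripe leaves its batch list */
        return test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head;
}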
Signed-off-by: Shaohua Li <shli@fusionio.com>
---
drivers/md/raid5.c | 22 +++++++++++++++-------
drivers/md/raid5.h | 5 +++++
2 files changed, 20 insertions(+), 7 deletions(-)
Index: linux/drivers/md/raid5.c
===================================================================
--- linux.orig/drivers/md/raid5.c 2014-09-10 19:21:16.759271053 +0800
+++ linux/drivers/md/raid5.c 2014-09-10 19:21:16.755271103 +0800
@@ -3371,8 +3371,10 @@ unhash:
struct stripe_head, batch_list);
list_del_init(&sh->batch_list);
- sh->state = head_sh->state & (~((1 << STRIPE_ACTIVE) |
- (1 << STRIPE_PREREAD_ACTIVE)));
+ set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG,
+ head_sh->state & (~((1 << STRIPE_ACTIVE) |
+ (1 << STRIPE_PREREAD_ACTIVE) |
+ STRIPE_EXPAND_SYNC_FLAG)));
sh->check_state = head_sh->check_state;
sh->reconstruct_state = head_sh->reconstruct_state;
for (i = 0; i < sh->disks; i++) {
@@ -3384,6 +3386,8 @@ unhash:
spin_lock_irq(&sh->stripe_lock);
sh->batch_head = NULL;
spin_unlock_irq(&sh->stripe_lock);
+ if (sh->state & STRIPE_EXPAND_SYNC_FLAG)
+ set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
}
@@ -3391,6 +3395,8 @@ unhash:
head_sh->batch_head = NULL;
spin_unlock_irq(&head_sh->stripe_lock);
wake_up_nr(&conf->wait_for_overlap, wakeup_nr);
+ if (head_sh->state & STRIPE_EXPAND_SYNC_FLAG)
+ set_bit(STRIPE_HANDLE, &head_sh->state);
}
static void handle_stripe_dirtying(struct r5conf *conf,
@@ -3853,8 +3859,8 @@ static void analyse_stripe(struct stripe
memset(s, 0, sizeof(*s));
- s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
- s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
+ s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head;
+ s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head;
s->failed_num[0] = -1;
s->failed_num[1] = -1;
@@ -4076,8 +4082,10 @@ static void check_break_stripe_batch_lis
struct stripe_head, batch_list);
list_del_init(&sh->batch_list);
- sh->state = head_sh->state & (~((1 << STRIPE_ACTIVE) |
- (1 << STRIPE_PREREAD_ACTIVE) | (1 << STRIPE_DEGRADED)));
+ set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG,
+ head_sh->state & (~((1 << STRIPE_ACTIVE) |
+ (1 << STRIPE_PREREAD_ACTIVE) | (1 << STRIPE_DEGRADED) |
+ STRIPE_EXPAND_SYNC_FLAG)));
sh->check_state = head_sh->check_state;
sh->reconstruct_state = head_sh->reconstruct_state;
for (i = 0; i < sh->disks; i++)
@@ -4119,7 +4127,7 @@ static void handle_stripe(struct stripe_
check_break_stripe_batch_list(sh);
- if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
+ if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
spin_lock(&sh->stripe_lock);
/* Cannot process 'sync' concurrently with 'discard' */
if (!test_bit(STRIPE_DISCARD, &sh->state) &&
Index: linux/drivers/md/raid5.h
===================================================================
--- linux.orig/drivers/md/raid5.h 2014-09-10 19:21:16.759271053 +0800
+++ linux/drivers/md/raid5.h 2014-09-10 19:21:16.755271103 +0800
@@ -339,6 +339,11 @@ enum {
STRIPE_BATCH_ERR,
};
+#define STRIPE_EXPAND_SYNC_FLAG \
+ ((1 << STRIPE_EXPAND_SOURCE) |\
+ (1 << STRIPE_EXPAND_READY) |\
+ (1 << STRIPE_EXPANDING) |\
+ (1 << STRIPE_SYNC_REQUESTED))
/*
* Operation request flags
*/
* Re: [patch v2 0/6] raid5: automatically batch adjacent full stripe write
2014-09-10 12:40 [patch v2 0/6] raid5: automatically batch adjacent full stripe write shli
` (5 preceding siblings ...)
2014-09-10 12:40 ` [patch v2 6/6] raid5: handle expansion/resync case with stripe batching shli
@ 2014-09-11 7:15 ` NeilBrown
2014-09-11 14:38 ` Shaohua Li
6 siblings, 1 reply; 9+ messages in thread
From: NeilBrown @ 2014-09-11 7:15 UTC (permalink / raw)
To: shli; +Cc: linux-raid
[-- Attachment #1: Type: text/plain, Size: 601 bytes --]
On Wed, 10 Sep 2014 20:40:09 +0800 shli@kernel.org wrote:
> This is the second attempt to batch adjacent full stripe writes together. The
> main change against v1 is how we detect whether stripes can be batched. Some
> bugs are also fixed.
>
Thanks a lot for this - it looks quite good. I've read through most of it
and don't see anything worth mentioning.... except maybe to wonder why
to_addr_page() has an unused 'sh' argument. Maybe I'll just remove that(?).
I've pushed it to the 'devel' branch of my md git tree. I'll try to do some
testing of my own next week.
Thanks,
NeilBrown
* Re: [patch v2 0/6] raid5: automatically batch adjacent full stripe write
2014-09-11 7:15 ` [patch v2 0/6] raid5: automatically batch adjacent full stripe write NeilBrown
@ 2014-09-11 14:38 ` Shaohua Li
0 siblings, 0 replies; 9+ messages in thread
From: Shaohua Li @ 2014-09-11 14:38 UTC (permalink / raw)
To: NeilBrown; +Cc: linux-raid
On Thu, Sep 11, 2014 at 05:15:25PM +1000, NeilBrown wrote:
> On Wed, 10 Sep 2014 20:40:09 +0800 shli@kernel.org wrote:
>
> > This is the second attempt to batch adjacent full stripe writes together. The
> > main change against v1 is how we detect whether stripes can be batched. Some
> > bugs are also fixed.
> >
>
> Thanks a lot for this - it looks quite good. I've read through most of it
> and don't see anything worth mentioning.... except maybe to wonder why
> to_addr_page() has an unused 'sh' argument. Maybe I'll just remove that(?).
Can be removed. I just wanted the parameters of to_addr_conv/page to be the same.
> I've pushed it to the 'devel' branch of my md git tree. I'll try to do some
> testing of my own next week.
Thanks!
Thanks,
Shaohua