* [PATCH] raid5: add support for rmw writes in raid6
@ 2013-04-18  0:56 Dan Williams
  2013-04-18 18:40 ` Dan Williams
  2013-04-22  4:22 ` NeilBrown
  0 siblings, 2 replies; 17+ messages in thread
From: Dan Williams @ 2013-04-18  0:56 UTC (permalink / raw)
  To: neilb; +Cc: linux-raid, Kumar Sundararajan
From: Kumar Sundararajan <kumar@fb.com>
Add support for rmw writes in raid6. This improves small write performance for most workloads.
Since there may be some configurations and workloads where rcw always outperforms rmw,
this feature is disabled by default. It can be enabled via sysfs. For example,
	#echo 1 > /sys/block/md0/md/enable_rmw
Signed-off-by: Kumar Sundararajan <kumar@fb.com>
Signed-off-by: Dan Williams <djbw@fb.com>
---
Hi Neil,
We decided to leave the enable in for the few cases where forced-rcw
outperformed rmw and there may be other cases out there.
Thoughts?
Here are some numbers from Kumar's testing with a 12 disk array:
                                     with rmw   rcw only
4K random write                    887.0 KB/s  94.4 KB/s
64k seq write                      291.5 MB/s 283.2 MB/s
64k random write                    16.4 MB/s   7.7 MB/s
64K mixed read/write                94.4 MB/s  94.0 MB/s
(70% random reads/30 % seq writes)   1.6 MB/s   1.8 MB/s
--
Dan
 drivers/md/raid5.c |  169 ++++++++++++++++++++++++++++++++++++++++++++++++----
 drivers/md/raid5.h |    4 +
 2 files changed, 161 insertions(+), 12 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e25520e..c9b6112 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1015,6 +1015,40 @@ static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
 	return syndrome_disks;
 }
 
+static int set_rmw_syndrome_sources(struct page **srcs, struct stripe_head *sh,
+	 struct raid5_percpu *percpu, int subtract)
+{
+	int disks = sh->disks;
+	int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
+	int d0_idx = raid6_d0(sh);
+	int count;
+	int i;
+
+	for (i = 0; i < disks; i++)
+		srcs[i] = NULL;
+
+	count = 0;
+	i = d0_idx;
+	do {
+		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
+
+		if (subtract) {
+			if (test_bit(R5_Wantdrain, &sh->dev[i].flags))
+				srcs[slot] = sh->dev[i].page;
+			else if (i == sh->pd_idx)
+				srcs[slot] = percpu->spare_page;
+			else if (i == sh->qd_idx)
+				srcs[slot] = percpu->spare_page2;
+		} else if (sh->dev[i].written || i == sh->pd_idx ||
+			    i == sh->qd_idx)
+			srcs[slot] = sh->dev[i].page;
+
+		i = raid6_next_disk(i, disks);
+	} while (i != d0_idx);
+
+	return syndrome_disks;
+}
+
 static struct dma_async_tx_descriptor *
 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
 {
@@ -1401,6 +1435,50 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
 	async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE,  &submit);
 }
 
+static struct dma_async_tx_descriptor *
+ops_run_rmw(struct stripe_head *sh, struct raid5_percpu *percpu,
+	    struct dma_async_tx_descriptor *tx, int subtract)
+{
+	int pd_idx = sh->pd_idx;
+	int qd_idx = sh->qd_idx;
+	struct page **blocks = percpu->scribble;
+	struct page *xor_dest;
+	struct r5dev *dev;
+	struct async_submit_ctl submit;
+	int count;
+
+	pr_debug("%s: stripe %llu\n", __func__,
+		(unsigned long long)sh->sector);
+
+	count = set_rmw_syndrome_sources(blocks, sh, percpu, subtract);
+
+	init_async_submit(&submit, ASYNC_TX_ACK, tx, NULL, NULL,
+			  to_addr_conv(sh, percpu));
+	tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
+
+	dev = &sh->dev[pd_idx];
+	xor_dest = blocks[0] = subtract ? percpu->spare_page : dev->page;
+	blocks[1] = subtract ? dev->page : percpu->spare_page;
+
+	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
+			  NULL, sh, to_addr_conv(sh, percpu));
+	tx = async_xor(xor_dest, blocks, 0, 2, STRIPE_SIZE, &submit);
+
+	dev = &sh->dev[qd_idx];
+	xor_dest = blocks[0] = subtract ? percpu->spare_page2 : dev->page;
+	blocks[1] = subtract ? dev->page : percpu->spare_page2;
+
+	if (!subtract)
+		atomic_inc(&sh->count);
+
+	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
+			  subtract ? NULL : ops_complete_reconstruct, sh,
+			  to_addr_conv(sh, percpu));
+	tx = async_xor(xor_dest, blocks, 0, 2, STRIPE_SIZE, &submit);
+
+	return tx;
+}
+
 static void ops_complete_check(void *stripe_head_ref)
 {
 	struct stripe_head *sh = stripe_head_ref;
@@ -1500,6 +1578,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 	if (test_bit(STRIPE_OP_PREXOR, &ops_request))
 		tx = ops_run_prexor(sh, percpu, tx);
 
+	if (test_bit(STRIPE_OP_RMW_SUBTRACT, &ops_request))
+		tx = ops_run_rmw(sh, percpu, tx, 1);
+
 	if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
 		tx = ops_run_biodrain(sh, tx);
 		overlap_clear++;
@@ -1512,6 +1593,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 			ops_run_reconstruct6(sh, percpu, tx);
 	}
 
+	if (test_bit(STRIPE_OP_RMW, &ops_request))
+		tx = ops_run_rmw(sh, percpu, tx, 0);
+
 	if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
 		if (sh->check_state == check_state_run)
 			ops_run_check_p(sh, percpu);
@@ -2346,7 +2430,7 @@ static void
 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 			 int rcw, int expand)
 {
-	int i, pd_idx = sh->pd_idx, disks = sh->disks;
+	int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
 	struct r5conf *conf = sh->raid_conf;
 	int level = conf->level;
 
@@ -2382,13 +2466,15 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 			if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
 				atomic_inc(&conf->pending_full_writes);
 	} else {
-		BUG_ON(level == 6);
 		BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
 			test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
+		BUG_ON(level == 6 &&
+			(!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) ||
+			test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags))));
 
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
-			if (i == pd_idx)
+			if (i == pd_idx || i == qd_idx)
 				continue;
 
 			if (dev->towrite &&
@@ -2404,9 +2490,16 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 			/* False alarm - nothing to do */
 			return;
 		sh->reconstruct_state = reconstruct_state_prexor_drain_run;
-		set_bit(STRIPE_OP_PREXOR, &s->ops_request);
-		set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
-		set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
+
+		if (level == 6) {
+			set_bit(STRIPE_OP_RMW_SUBTRACT, &s->ops_request);
+			set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
+			set_bit(STRIPE_OP_RMW, &s->ops_request);
+		} else {
+			set_bit(STRIPE_OP_PREXOR, &s->ops_request);
+			set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
+			set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
+		}
 	}
 
 	/* keep the parity disk(s) locked while asynchronous operations
@@ -2876,15 +2969,14 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 	int rmw = 0, rcw = 0, i;
 	sector_t recovery_cp = conf->mddev->recovery_cp;
 
-	/* RAID6 requires 'rcw' in current implementation.
-	 * Otherwise, check whether resync is now happening or should start.
+	/* Check whether resync is now happening or should start.
 	 * If yes, then the array is dirty (after unclean shutdown or
 	 * initial creation), so parity in some stripes might be inconsistent.
 	 * In this case, we need to always do reconstruct-write, to ensure
 	 * that in case of drive failure or read-error correction, we
 	 * generate correct data from the parity.
 	 */
-	if (conf->max_degraded == 2 ||
+	if ((conf->max_degraded == 2 && !conf->enable_rmw) ||
 	    (recovery_cp < MaxSector && sh->sector >= recovery_cp)) {
 		/* Calculate the real rcw later - for now make it
 		 * look like rcw is cheaper
@@ -2896,7 +2988,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 	} else for (i = disks; i--; ) {
 		/* would I have to read this buffer for read_modify_write */
 		struct r5dev *dev = &sh->dev[i];
-		if ((dev->towrite || i == sh->pd_idx) &&
+		if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
 		    !test_bit(R5_LOCKED, &dev->flags) &&
 		    !(test_bit(R5_UPTODATE, &dev->flags) ||
 		      test_bit(R5_Wantcompute, &dev->flags))) {
@@ -2907,6 +2999,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 		}
 		/* Would I have to read this buffer for reconstruct_write */
 		if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
+		    i != sh->qd_idx &&
 		    !test_bit(R5_LOCKED, &dev->flags) &&
 		    !(test_bit(R5_UPTODATE, &dev->flags) ||
 		    test_bit(R5_Wantcompute, &dev->flags))) {
@@ -2926,7 +3019,8 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 					  (unsigned long long)sh->sector, rmw);
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
-			if ((dev->towrite || i == sh->pd_idx) &&
+			if ((dev->towrite || i == sh->pd_idx ||
+			    i == sh->qd_idx) &&
 			    !test_bit(R5_LOCKED, &dev->flags) &&
 			    !(test_bit(R5_UPTODATE, &dev->flags) ||
 			    test_bit(R5_Wantcompute, &dev->flags)) &&
@@ -5360,11 +5454,48 @@ raid5_auxthread_number = __ATTR(auxthread_number, S_IRUGO|S_IWUSR,
 				raid5_show_auxthread_number,
 				raid5_store_auxthread_number);
 
+static ssize_t
+raid5_show_enable_rmw(struct mddev  *mddev, char *page)
+{
+	struct r5conf *conf = mddev->private;
+
+	if (conf->level == 6)
+		return sprintf(page, "%d\n", conf->enable_rmw);
+	else
+		return sprintf(page, "%d\n", 1);
+}
+
+static ssize_t
+raid5_store_enable_rmw(struct mddev  *mddev, const char *page, size_t len)
+{
+	struct r5conf *conf = mddev->private;
+	unsigned long new;
+
+	if (!conf)
+		return -ENODEV;
+	if (conf->level != 6)
+		return -EINVAL;
+
+	if (len >= PAGE_SIZE)
+		return -EINVAL;
+
+	if (kstrtoul(page, 10, &new))
+		return -EINVAL;
+	conf->enable_rmw = !!new;
+	return len;
+}
+
+static struct md_sysfs_entry
+raid5_enable_rmw = __ATTR(enable_rmw, S_IRUGO | S_IWUSR,
+				raid5_show_enable_rmw,
+				raid5_store_enable_rmw);
+
 static struct attribute *raid5_attrs[] =  {
 	&raid5_stripecache_size.attr,
 	&raid5_stripecache_active.attr,
 	&raid5_preread_bypass_threshold.attr,
 	&raid5_auxthread_number.attr,
+	&raid5_enable_rmw.attr,
 	NULL,
 };
 static struct attribute_group raid5_attrs_group = {
@@ -5400,6 +5531,7 @@ static void raid5_free_percpu(struct r5conf *conf)
 	for_each_possible_cpu(cpu) {
 		percpu = per_cpu_ptr(conf->percpu, cpu);
 		safe_put_page(percpu->spare_page);
+		safe_put_page(percpu->spare_page2);
 		kfree(percpu->scribble);
 	}
 #ifdef CONFIG_HOTPLUG_CPU
@@ -5433,12 +5565,16 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
 	case CPU_UP_PREPARE_FROZEN:
 		if (conf->level == 6 && !percpu->spare_page)
 			percpu->spare_page = alloc_page(GFP_KERNEL);
+		if (conf->level == 6 && !percpu->spare_page2)
+			percpu->spare_page2 = alloc_page(GFP_KERNEL);
 		if (!percpu->scribble)
 			percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
 
 		if (!percpu->scribble ||
-		    (conf->level == 6 && !percpu->spare_page)) {
+		    (conf->level == 6 && !percpu->spare_page) ||
+		    (conf->level == 6 && !percpu->spare_page2)) {
 			safe_put_page(percpu->spare_page);
+			safe_put_page(percpu->spare_page2);
 			kfree(percpu->scribble);
 			pr_err("%s: failed memory allocation for cpu%ld\n",
 			       __func__, cpu);
@@ -5456,8 +5592,10 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
 		spin_unlock_irq(&conf->device_lock);
 
 		safe_put_page(percpu->spare_page);
+		safe_put_page(percpu->spare_page2);
 		kfree(percpu->scribble);
 		percpu->spare_page = NULL;
+		percpu->spare_page2 = NULL;
 		percpu->scribble = NULL;
 		break;
 	default:
@@ -5496,6 +5634,13 @@ static int raid5_alloc_percpu(struct r5conf *conf)
 				break;
 			}
 			percpu->spare_page = spare_page;
+			spare_page = alloc_page(GFP_KERNEL);
+			if (!spare_page) {
+				err = -ENOMEM;
+				break;
+			}
+			per_cpu_ptr(conf->percpu, cpu)->spare_page2 =
+			    spare_page;
 		}
 		scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
 		if (!scribble) {
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 217cb19..fd7ed19 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -334,6 +334,8 @@ enum {
 	STRIPE_OP_PREXOR,
 	STRIPE_OP_BIODRAIN,
 	STRIPE_OP_RECONSTRUCT,
+	STRIPE_OP_RMW_SUBTRACT,
+	STRIPE_OP_RMW,
 	STRIPE_OP_CHECK,
 };
 /*
@@ -437,6 +439,7 @@ struct r5conf {
 	/* per cpu variables */
 	struct raid5_percpu {
 		struct page	*spare_page; /* Used when checking P/Q in raid6 */
+		struct page	*spare_page2; /* Used for rmw writes in raid6 */
 		void		*scribble;   /* space for constructing buffer
 					      * lists and performing address
 					      * conversions
@@ -479,6 +482,7 @@ struct r5conf {
 	struct raid5_auxth	**aux_threads;
 	/* which CPUs should raid5d thread handle stripes from */
 	cpumask_t		work_mask;
+	int			enable_rmw; /* 1 if rmw is enabled for raid6 */
 };
 
 /*
^ permalink raw reply related	[flat|nested] 17+ messages in thread- * Re: [PATCH] raid5: add support for rmw writes in raid6
  2013-04-18  0:56 [PATCH] raid5: add support for rmw writes in raid6 Dan Williams
@ 2013-04-18 18:40 ` Dan Williams
  2013-04-22  4:22 ` NeilBrown
  1 sibling, 0 replies; 17+ messages in thread
From: Dan Williams @ 2013-04-18 18:40 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid, Kumar Sundararajan
On Wed, Apr 17, 2013 at 5:56 PM, Dan Williams <djbw@fb.com> wrote:
> Here are some numbers from Kumar's testing with a 12 disk array:
>
>                                      with rmw   rcw only
> 4K random write                    887.0 KB/s  94.4 KB/s
copy pasta error, should be 493KB/s for rcw only...
^ permalink raw reply	[flat|nested] 17+ messages in thread 
- * Re: [PATCH] raid5: add support for rmw writes in raid6
  2013-04-18  0:56 [PATCH] raid5: add support for rmw writes in raid6 Dan Williams
  2013-04-18 18:40 ` Dan Williams
@ 2013-04-22  4:22 ` NeilBrown
  2013-04-26 21:35   ` Dan Williams
  1 sibling, 1 reply; 17+ messages in thread
From: NeilBrown @ 2013-04-22  4:22 UTC (permalink / raw)
  To: Dan Williams; +Cc: linux-raid, Kumar Sundararajan
[-- Attachment #1: Type: text/plain, Size: 15497 bytes --]
On Wed, 17 Apr 2013 17:56:56 -0700 Dan Williams <djbw@fb.com> wrote:
> From: Kumar Sundararajan <kumar@fb.com>
> 
> Add support for rmw writes in raid6. This improves small write performance for most workloads.
> Since there may be some configurations and workloads where rcw always outperforms rmw,
> this feature is disabled by default. It can be enabled via sysfs. For example,
> 
> 	#echo 1 > /sys/block/md0/md/enable_rmw
> 
> Signed-off-by: Kumar Sundararajan <kumar@fb.com>
> Signed-off-by: Dan Williams <djbw@fb.com>
> ---
> Hi Neil,
> 
> We decided to leave the enable in for the few cases where forced-rcw
> outperformed rmw and there may be other cases out there.
> 
> Thoughts?
- More commentary would help.  The text at the top should explain enough so
  when I read the code I am just verifying the text at the top, not trying to
  figure out how it is supposed to work.
- If 'enable_rmw' really is a good idea, then it is possibly a good idea for
  RAID5 to and so should be a separate patch and should work for RAID4/5/6.
  The default for each array type may well be different, but the
  functionality would be the same.
- Can you  explain *why* rcw is sometimes better than rmw even on large
  arrays? Even a fairly hand-wavy arguement would help.  And it would go in
  the comment at the top of the patch that adds enable_rmw.
- patch looks fairly sane assuming that it works - but I don't really know if
  it does.  You've reported speeds but haven't told me that you ran 'check'
  after doing each test and it reported no mismatches.  I suspect you did but
  I'd like to be told.  I'd also like to be told what role 'spare2' plays.
There: my complaints are longer than your explanatory text, so I think I
can stop now :-)
Oh, and the stuff below, that should be above so that it gets captured with
the patch and remains in the git logs of posterity. 
NeilBrown
P.S. a couple of comments further down.
> 
> Here are some numbers from Kumar's testing with a 12 disk array:
> 
>                                      with rmw   rcw only
> 4K random write                    887.0 KB/s  94.4 KB/s
> 64k seq write                      291.5 MB/s 283.2 MB/s
> 64k random write                    16.4 MB/s   7.7 MB/s
> 64K mixed read/write                94.4 MB/s  94.0 MB/s
> (70% random reads/30 % seq writes)   1.6 MB/s   1.8 MB/s
> 
> --
> Dan
> 
>  drivers/md/raid5.c |  169 ++++++++++++++++++++++++++++++++++++++++++++++++----
>  drivers/md/raid5.h |    4 +
>  2 files changed, 161 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index e25520e..c9b6112 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -1015,6 +1015,40 @@ static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
>  	return syndrome_disks;
>  }
>  
> +static int set_rmw_syndrome_sources(struct page **srcs, struct stripe_head *sh,
> +	 struct raid5_percpu *percpu, int subtract)
Arg.  Bad intending.
I suspect it would look better if this were two separate functions instead of
having a 'subtract' arg, but I'm not sure.
> +{
> +	int disks = sh->disks;
> +	int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
> +	int d0_idx = raid6_d0(sh);
> +	int count;
> +	int i;
> +
> +	for (i = 0; i < disks; i++)
> +		srcs[i] = NULL;
> +
> +	count = 0;
> +	i = d0_idx;
> +	do {
> +		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
> +
> +		if (subtract) {
> +			if (test_bit(R5_Wantdrain, &sh->dev[i].flags))
> +				srcs[slot] = sh->dev[i].page;
> +			else if (i == sh->pd_idx)
> +				srcs[slot] = percpu->spare_page;
> +			else if (i == sh->qd_idx)
> +				srcs[slot] = percpu->spare_page2;
> +		} else if (sh->dev[i].written || i == sh->pd_idx ||
> +			    i == sh->qd_idx)
> +			srcs[slot] = sh->dev[i].page;
> +
> +		i = raid6_next_disk(i, disks);
> +	} while (i != d0_idx);
> +
> +	return syndrome_disks;
> +}
> +
>  static struct dma_async_tx_descriptor *
>  ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
>  {
> @@ -1401,6 +1435,50 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
>  	async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE,  &submit);
>  }
>  
> +static struct dma_async_tx_descriptor *
> +ops_run_rmw(struct stripe_head *sh, struct raid5_percpu *percpu,
> +	    struct dma_async_tx_descriptor *tx, int subtract)
> +{
> +	int pd_idx = sh->pd_idx;
> +	int qd_idx = sh->qd_idx;
> +	struct page **blocks = percpu->scribble;
> +	struct page *xor_dest;
> +	struct r5dev *dev;
> +	struct async_submit_ctl submit;
> +	int count;
> +
> +	pr_debug("%s: stripe %llu\n", __func__,
> +		(unsigned long long)sh->sector);
> +
> +	count = set_rmw_syndrome_sources(blocks, sh, percpu, subtract);
> +
> +	init_async_submit(&submit, ASYNC_TX_ACK, tx, NULL, NULL,
> +			  to_addr_conv(sh, percpu));
> +	tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
> +
> +	dev = &sh->dev[pd_idx];
> +	xor_dest = blocks[0] = subtract ? percpu->spare_page : dev->page;
> +	blocks[1] = subtract ? dev->page : percpu->spare_page;
> +
> +	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
> +			  NULL, sh, to_addr_conv(sh, percpu));
> +	tx = async_xor(xor_dest, blocks, 0, 2, STRIPE_SIZE, &submit);
> +
> +	dev = &sh->dev[qd_idx];
> +	xor_dest = blocks[0] = subtract ? percpu->spare_page2 : dev->page;
> +	blocks[1] = subtract ? dev->page : percpu->spare_page2;
> +
> +	if (!subtract)
> +		atomic_inc(&sh->count);
> +
> +	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
> +			  subtract ? NULL : ops_complete_reconstruct, sh,
> +			  to_addr_conv(sh, percpu));
> +	tx = async_xor(xor_dest, blocks, 0, 2, STRIPE_SIZE, &submit);
> +
> +	return tx;
The continual switching on 'subtract' make this hard to read too.  It is
probably a bit big to duplication ... Is there anything you can do to make it
easier to read?
> +}
> +
>  static void ops_complete_check(void *stripe_head_ref)
>  {
>  	struct stripe_head *sh = stripe_head_ref;
> @@ -1500,6 +1578,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
>  	if (test_bit(STRIPE_OP_PREXOR, &ops_request))
>  		tx = ops_run_prexor(sh, percpu, tx);
>  
> +	if (test_bit(STRIPE_OP_RMW_SUBTRACT, &ops_request))
> +		tx = ops_run_rmw(sh, percpu, tx, 1);
> +
>  	if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
>  		tx = ops_run_biodrain(sh, tx);
>  		overlap_clear++;
> @@ -1512,6 +1593,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
>  			ops_run_reconstruct6(sh, percpu, tx);
>  	}
>  
> +	if (test_bit(STRIPE_OP_RMW, &ops_request))
> +		tx = ops_run_rmw(sh, percpu, tx, 0);
> +
>  	if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
>  		if (sh->check_state == check_state_run)
>  			ops_run_check_p(sh, percpu);
> @@ -2346,7 +2430,7 @@ static void
>  schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
>  			 int rcw, int expand)
>  {
> -	int i, pd_idx = sh->pd_idx, disks = sh->disks;
> +	int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
>  	struct r5conf *conf = sh->raid_conf;
>  	int level = conf->level;
>  
> @@ -2382,13 +2466,15 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
>  			if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
>  				atomic_inc(&conf->pending_full_writes);
>  	} else {
> -		BUG_ON(level == 6);
>  		BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
>  			test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
> +		BUG_ON(level == 6 &&
> +			(!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) ||
> +			test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags))));
>  
>  		for (i = disks; i--; ) {
>  			struct r5dev *dev = &sh->dev[i];
> -			if (i == pd_idx)
> +			if (i == pd_idx || i == qd_idx)
>  				continue;
>  
>  			if (dev->towrite &&
> @@ -2404,9 +2490,16 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
>  			/* False alarm - nothing to do */
>  			return;
>  		sh->reconstruct_state = reconstruct_state_prexor_drain_run;
> -		set_bit(STRIPE_OP_PREXOR, &s->ops_request);
> -		set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
> -		set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
> +
> +		if (level == 6) {
> +			set_bit(STRIPE_OP_RMW_SUBTRACT, &s->ops_request);
> +			set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
> +			set_bit(STRIPE_OP_RMW, &s->ops_request);
> +		} else {
> +			set_bit(STRIPE_OP_PREXOR, &s->ops_request);
> +			set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
> +			set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
> +		}
>  	}
>  
>  	/* keep the parity disk(s) locked while asynchronous operations
> @@ -2876,15 +2969,14 @@ static void handle_stripe_dirtying(struct r5conf *conf,
>  	int rmw = 0, rcw = 0, i;
>  	sector_t recovery_cp = conf->mddev->recovery_cp;
>  
> -	/* RAID6 requires 'rcw' in current implementation.
> -	 * Otherwise, check whether resync is now happening or should start.
> +	/* Check whether resync is now happening or should start.
>  	 * If yes, then the array is dirty (after unclean shutdown or
>  	 * initial creation), so parity in some stripes might be inconsistent.
>  	 * In this case, we need to always do reconstruct-write, to ensure
>  	 * that in case of drive failure or read-error correction, we
>  	 * generate correct data from the parity.
>  	 */
> -	if (conf->max_degraded == 2 ||
> +	if ((conf->max_degraded == 2 && !conf->enable_rmw) ||
>  	    (recovery_cp < MaxSector && sh->sector >= recovery_cp)) {
>  		/* Calculate the real rcw later - for now make it
>  		 * look like rcw is cheaper
> @@ -2896,7 +2988,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
>  	} else for (i = disks; i--; ) {
>  		/* would I have to read this buffer for read_modify_write */
>  		struct r5dev *dev = &sh->dev[i];
> -		if ((dev->towrite || i == sh->pd_idx) &&
> +		if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
>  		    !test_bit(R5_LOCKED, &dev->flags) &&
>  		    !(test_bit(R5_UPTODATE, &dev->flags) ||
>  		      test_bit(R5_Wantcompute, &dev->flags))) {
> @@ -2907,6 +2999,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
>  		}
>  		/* Would I have to read this buffer for reconstruct_write */
>  		if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
> +		    i != sh->qd_idx &&
>  		    !test_bit(R5_LOCKED, &dev->flags) &&
>  		    !(test_bit(R5_UPTODATE, &dev->flags) ||
>  		    test_bit(R5_Wantcompute, &dev->flags))) {
> @@ -2926,7 +3019,8 @@ static void handle_stripe_dirtying(struct r5conf *conf,
>  					  (unsigned long long)sh->sector, rmw);
>  		for (i = disks; i--; ) {
>  			struct r5dev *dev = &sh->dev[i];
> -			if ((dev->towrite || i == sh->pd_idx) &&
> +			if ((dev->towrite || i == sh->pd_idx ||
> +			    i == sh->qd_idx) &&
>  			    !test_bit(R5_LOCKED, &dev->flags) &&
>  			    !(test_bit(R5_UPTODATE, &dev->flags) ||
>  			    test_bit(R5_Wantcompute, &dev->flags)) &&
> @@ -5360,11 +5454,48 @@ raid5_auxthread_number = __ATTR(auxthread_number, S_IRUGO|S_IWUSR,
>  				raid5_show_auxthread_number,
>  				raid5_store_auxthread_number);
>  
> +static ssize_t
> +raid5_show_enable_rmw(struct mddev  *mddev, char *page)
> +{
> +	struct r5conf *conf = mddev->private;
> +
> +	if (conf->level == 6)
> +		return sprintf(page, "%d\n", conf->enable_rmw);
> +	else
> +		return sprintf(page, "%d\n", 1);
> +}
> +
> +static ssize_t
> +raid5_store_enable_rmw(struct mddev  *mddev, const char *page, size_t len)
> +{
> +	struct r5conf *conf = mddev->private;
> +	unsigned long new;
> +
> +	if (!conf)
> +		return -ENODEV;
> +	if (conf->level != 6)
> +		return -EINVAL;
> +
> +	if (len >= PAGE_SIZE)
> +		return -EINVAL;
> +
> +	if (kstrtoul(page, 10, &new))
> +		return -EINVAL;
> +	conf->enable_rmw = !!new;
> +	return len;
> +}
> +
> +static struct md_sysfs_entry
> +raid5_enable_rmw = __ATTR(enable_rmw, S_IRUGO | S_IWUSR,
> +				raid5_show_enable_rmw,
> +				raid5_store_enable_rmw);
> +
>  static struct attribute *raid5_attrs[] =  {
>  	&raid5_stripecache_size.attr,
>  	&raid5_stripecache_active.attr,
>  	&raid5_preread_bypass_threshold.attr,
>  	&raid5_auxthread_number.attr,
> +	&raid5_enable_rmw.attr,
>  	NULL,
>  };
>  static struct attribute_group raid5_attrs_group = {
> @@ -5400,6 +5531,7 @@ static void raid5_free_percpu(struct r5conf *conf)
>  	for_each_possible_cpu(cpu) {
>  		percpu = per_cpu_ptr(conf->percpu, cpu);
>  		safe_put_page(percpu->spare_page);
> +		safe_put_page(percpu->spare_page2);
>  		kfree(percpu->scribble);
>  	}
>  #ifdef CONFIG_HOTPLUG_CPU
> @@ -5433,12 +5565,16 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
>  	case CPU_UP_PREPARE_FROZEN:
>  		if (conf->level == 6 && !percpu->spare_page)
>  			percpu->spare_page = alloc_page(GFP_KERNEL);
> +		if (conf->level == 6 && !percpu->spare_page2)
> +			percpu->spare_page2 = alloc_page(GFP_KERNEL);
>  		if (!percpu->scribble)
>  			percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
>  
>  		if (!percpu->scribble ||
> -		    (conf->level == 6 && !percpu->spare_page)) {
> +		    (conf->level == 6 && !percpu->spare_page) ||
> +		    (conf->level == 6 && !percpu->spare_page2)) {
>  			safe_put_page(percpu->spare_page);
> +			safe_put_page(percpu->spare_page2);
>  			kfree(percpu->scribble);
>  			pr_err("%s: failed memory allocation for cpu%ld\n",
>  			       __func__, cpu);
> @@ -5456,8 +5592,10 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
>  		spin_unlock_irq(&conf->device_lock);
>  
>  		safe_put_page(percpu->spare_page);
> +		safe_put_page(percpu->spare_page2);
>  		kfree(percpu->scribble);
>  		percpu->spare_page = NULL;
> +		percpu->spare_page2 = NULL;
>  		percpu->scribble = NULL;
>  		break;
>  	default:
> @@ -5496,6 +5634,13 @@ static int raid5_alloc_percpu(struct r5conf *conf)
>  				break;
>  			}
>  			percpu->spare_page = spare_page;
> +			spare_page = alloc_page(GFP_KERNEL);
> +			if (!spare_page) {
> +				err = -ENOMEM;
> +				break;
> +			}
> +			per_cpu_ptr(conf->percpu, cpu)->spare_page2 =
> +			    spare_page;
>  		}
>  		scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
>  		if (!scribble) {
> diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
> index 217cb19..fd7ed19 100644
> --- a/drivers/md/raid5.h
> +++ b/drivers/md/raid5.h
> @@ -334,6 +334,8 @@ enum {
>  	STRIPE_OP_PREXOR,
>  	STRIPE_OP_BIODRAIN,
>  	STRIPE_OP_RECONSTRUCT,
> +	STRIPE_OP_RMW_SUBTRACT,
> +	STRIPE_OP_RMW,
>  	STRIPE_OP_CHECK,
>  };
>  /*
> @@ -437,6 +439,7 @@ struct r5conf {
>  	/* per cpu variables */
>  	struct raid5_percpu {
>  		struct page	*spare_page; /* Used when checking P/Q in raid6 */
> +		struct page	*spare_page2; /* Used for rmw writes in raid6 */
>  		void		*scribble;   /* space for constructing buffer
>  					      * lists and performing address
>  					      * conversions
> @@ -479,6 +482,7 @@ struct r5conf {
>  	struct raid5_auxth	**aux_threads;
>  	/* which CPUs should raid5d thread handle stripes from */
>  	cpumask_t		work_mask;
> +	int			enable_rmw; /* 1 if rmw is enabled for raid6 */
>  };
>  
>  /*
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 828 bytes --]
^ permalink raw reply	[flat|nested] 17+ messages in thread
- * Re: [PATCH] raid5: add support for rmw writes in raid6
  2013-04-22  4:22 ` NeilBrown
@ 2013-04-26 21:35   ` Dan Williams
  2013-04-29  1:29     ` NeilBrown
  0 siblings, 1 reply; 17+ messages in thread
From: Dan Williams @ 2013-04-26 21:35 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid, Kumar Sundararajan
On Sun, Apr 21, 2013 at 9:22 PM, NeilBrown <neilb@suse.de> wrote:
> On Wed, 17 Apr 2013 17:56:56 -0700 Dan Williams <djbw@fb.com> wrote:
>
>> From: Kumar Sundararajan <kumar@fb.com>
>>
>> Add support for rmw writes in raid6. This improves small write performance for most workloads.
>> Since there may be some configurations and workloads where rcw always outperforms rmw,
>> this feature is disabled by default. It can be enabled via sysfs. For example,
>>
>>       #echo 1 > /sys/block/md0/md/enable_rmw
>>
>> Signed-off-by: Kumar Sundararajan <kumar@fb.com>
>> Signed-off-by: Dan Williams <djbw@fb.com>
>> ---
>> Hi Neil,
>>
>> We decided to leave the enable in for the few cases where forced-rcw
>> outperformed rmw and there may be other cases out there.
>>
>> Thoughts?
>
> - More commentary would help.  The text at the top should explain enough so
>   when I read the code I am just verifying the text at the top, not trying to
>   figure out how it is supposed to work.
Ok, along the lines of:
"raid5 has supported sub-stripe writes by computing:
P_new = P_old + D0_old + D0_new
For raid6 we add the following:
P_new = P_old + D0_old + D0_new
Q_sub = Q_old + syndrome(D0_old)
Q_sub = Q_old + g^0*D0_old
Q_new = Q_old + Q_sub + g^0*D0_new
This has been verified with check and dual-degraded recovery operations.
> - If 'enable_rmw' really is a good idea, then it is possibly a good idea for
>   RAID5 to and so should be a separate patch and should work for RAID4/5/6.
>   The default for each array type may well be different, but the
>   functionality would be the same.
Yes, although Kumar's testing has not found a test case that makes
significant difference for raid5.  I.e. it makes sense to not
artificially prevent raid5 from disabling rmw if the knob is there for
raid6, but it would need a specific use case to flip it from the
default.
> - Can you  explain *why* rcw is sometimes better than rmw even on large
>   arrays? Even a fairly hand-wavy arguement would help.  And it would go in
>   the comment at the top of the patch that adds enable_rmw.
Hand wavy argument is that rcw guarantees that the drives will be more
busy so small sequential writes have more chance to coalesce into
larger writes.  Kumar, other thoughts?
> - patch looks fairly sane assuming that it works - but I don't really know if
>   it does.  You've reported speeds but haven't told me that you ran 'check'
>   after doing each test and it reported no mismatches.  I suspect you did but
>   I'd like to be told.  I'd also like to be told what role 'spare2' plays.
spare2 is holding the intermediate Q_sub result of subtracting out the
syndrome of the old data block(s)
> There: my complaints are longer than your explanatory text, so I think I
> can stop now :-)
:-) Thanks.
>
> Oh, and the stuff below, that should be above so that it gets captured with
> the patch and remains in the git logs of posterity.
>
> NeilBrown
>
> P.S. a couple of comments further down.
>
>>
>> Here are some numbers from Kumar's testing with a 12 disk array:
>>
>>                                      with rmw   rcw only
>> 4K random write                    887.0 KB/s  94.4 KB/s
>> 64k seq write                      291.5 MB/s 283.2 MB/s
>> 64k random write                    16.4 MB/s   7.7 MB/s
>> 64K mixed read/write                94.4 MB/s  94.0 MB/s
>> (70% random reads/30 % seq writes)   1.6 MB/s   1.8 MB/s
>>
>> --
>> Dan
>>
>>  drivers/md/raid5.c |  169 ++++++++++++++++++++++++++++++++++++++++++++++++----
>>  drivers/md/raid5.h |    4 +
>>  2 files changed, 161 insertions(+), 12 deletions(-)
>>
>> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>> index e25520e..c9b6112 100644
>> --- a/drivers/md/raid5.c
>> +++ b/drivers/md/raid5.c
>> @@ -1015,6 +1015,40 @@ static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
>>       return syndrome_disks;
>>  }
>>
>> +static int set_rmw_syndrome_sources(struct page **srcs, struct stripe_head *sh,
>> +      struct raid5_percpu *percpu, int subtract)
>
> Arg.  Bad intending.
>
> I suspect it would look better if this were two separate functions instead of
> having a 'subtract' arg, but I'm not sure.
Will take a look.  I think I may have asked Kumar to squish it originally.
>
>
>> +{
>> +     int disks = sh->disks;
>> +     int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
>> +     int d0_idx = raid6_d0(sh);
>> +     int count;
>> +     int i;
>> +
>> +     for (i = 0; i < disks; i++)
>> +             srcs[i] = NULL;
>> +
>> +     count = 0;
>> +     i = d0_idx;
>> +     do {
>> +             int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
>> +
>> +             if (subtract) {
>> +                     if (test_bit(R5_Wantdrain, &sh->dev[i].flags))
>> +                             srcs[slot] = sh->dev[i].page;
>> +                     else if (i == sh->pd_idx)
>> +                             srcs[slot] = percpu->spare_page;
>> +                     else if (i == sh->qd_idx)
>> +                             srcs[slot] = percpu->spare_page2;
>> +             } else if (sh->dev[i].written || i == sh->pd_idx ||
>> +                         i == sh->qd_idx)
>> +                     srcs[slot] = sh->dev[i].page;
>> +
>> +             i = raid6_next_disk(i, disks);
>> +     } while (i != d0_idx);
>> +
>> +     return syndrome_disks;
>> +}
>> +
>>  static struct dma_async_tx_descriptor *
>>  ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
>>  {
>> @@ -1401,6 +1435,50 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
>>       async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE,  &submit);
>>  }
>>
>> +static struct dma_async_tx_descriptor *
>> +ops_run_rmw(struct stripe_head *sh, struct raid5_percpu *percpu,
>> +         struct dma_async_tx_descriptor *tx, int subtract)
>> +{
>> +     int pd_idx = sh->pd_idx;
>> +     int qd_idx = sh->qd_idx;
>> +     struct page **blocks = percpu->scribble;
>> +     struct page *xor_dest;
>> +     struct r5dev *dev;
>> +     struct async_submit_ctl submit;
>> +     int count;
>> +
>> +     pr_debug("%s: stripe %llu\n", __func__,
>> +             (unsigned long long)sh->sector);
>> +
>> +     count = set_rmw_syndrome_sources(blocks, sh, percpu, subtract);
>> +
>> +     init_async_submit(&submit, ASYNC_TX_ACK, tx, NULL, NULL,
>> +                       to_addr_conv(sh, percpu));
>> +     tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
>> +
>> +     dev = &sh->dev[pd_idx];
>> +     xor_dest = blocks[0] = subtract ? percpu->spare_page : dev->page;
>> +     blocks[1] = subtract ? dev->page : percpu->spare_page;
>> +
>> +     init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
>> +                       NULL, sh, to_addr_conv(sh, percpu));
>> +     tx = async_xor(xor_dest, blocks, 0, 2, STRIPE_SIZE, &submit);
>> +
>> +     dev = &sh->dev[qd_idx];
>> +     xor_dest = blocks[0] = subtract ? percpu->spare_page2 : dev->page;
>> +     blocks[1] = subtract ? dev->page : percpu->spare_page2;
>> +
>> +     if (!subtract)
>> +             atomic_inc(&sh->count);
>> +
>> +     init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
>> +                       subtract ? NULL : ops_complete_reconstruct, sh,
>> +                       to_addr_conv(sh, percpu));
>> +     tx = async_xor(xor_dest, blocks, 0, 2, STRIPE_SIZE, &submit);
>> +
>> +     return tx;
>
> The continual switching on 'subtract' make this hard to read too.  It is
> probably a bit big to duplication ... Is there anything you can do to make it
> easier to read?
>
Is this any better? Kumar?
static struct dma_async_tx_descriptor *
ops_run_rmw(struct stripe_head *sh, struct raid5_percpu *percpu,
            struct dma_async_tx_descriptor *tx, int subtract)
{
        int pd_idx = sh->pd_idx;
        int qd_idx = sh->qd_idx;
        struct page **blocks = percpu->scribble;
        struct page *xor_dest, *result, *src, *qresult, *qsrc;
        struct async_submit_ctl submit;
        dma_async_tx_callback done_fn;
        int count;
        /* the spare pages hold the intermediate P and Q results */
        if (subtract) {
                result = percpu->spare_page;
                src = sh->dev[pd_idx].page;
                qresult = percpu->spare_page2;
                qsrc = sh->dev[qd_idx].page;
                /* we'll be called once again after new data arrives */
                done_fn = NULL;
        } else {
                /* this is the final stage of the reconstruct */
                result = sh->dev[pd_idx].page;
                src = percpu->spare_page;
                qresult = sh->dev[qd_idx].page;
                qsrc = percpu->spare_page2;
                done_fn = ops_complete_reconstruct;
                atomic_inc(&sh->count);
        }
        pr_debug("%s: stripe %llu\n", __func__,
                (unsigned long long)sh->sector);
        count = set_rmw_syndrome_sources(blocks, sh, percpu, subtract);
        init_async_submit(&submit, ASYNC_TX_ACK, tx, NULL, NULL,
                          to_addr_conv(sh, percpu));
        tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
        xor_dest = blocks[0] = result;
        blocks[1] = src;
        init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
                          NULL, sh, to_addr_conv(sh, percpu));
        tx = async_xor(xor_dest, blocks, 0, 2, STRIPE_SIZE, &submit);
        xor_dest = blocks[0] = qresult;
        blocks[1] = qsrc;
        init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
                          done_fn, sh, to_addr_conv(sh, percpu));
        tx = async_xor(xor_dest, blocks, 0, 2, STRIPE_SIZE, &submit);
        return tx;
}
^ permalink raw reply	[flat|nested] 17+ messages in thread
- * Re: [PATCH] raid5: add support for rmw writes in raid6
  2013-04-26 21:35   ` Dan Williams
@ 2013-04-29  1:29     ` NeilBrown
  2013-04-29  7:10       ` David Brown
                         ` (3 more replies)
  0 siblings, 4 replies; 17+ messages in thread
From: NeilBrown @ 2013-04-29  1:29 UTC (permalink / raw)
  To: Dan Williams; +Cc: linux-raid, Kumar Sundararajan
[-- Attachment #1: Type: text/plain, Size: 7757 bytes --]
On Fri, 26 Apr 2013 14:35:27 -0700 Dan Williams <djbw@fb.com> wrote:
> On Sun, Apr 21, 2013 at 9:22 PM, NeilBrown <neilb@suse.de> wrote:
> > On Wed, 17 Apr 2013 17:56:56 -0700 Dan Williams <djbw@fb.com> wrote:
> >
> >> From: Kumar Sundararajan <kumar@fb.com>
> >>
> >> Add support for rmw writes in raid6. This improves small write performance for most workloads.
> >> Since there may be some configurations and workloads where rcw always outperforms rmw,
> >> this feature is disabled by default. It can be enabled via sysfs. For example,
> >>
> >>       #echo 1 > /sys/block/md0/md/enable_rmw
> >>
> >> Signed-off-by: Kumar Sundararajan <kumar@fb.com>
> >> Signed-off-by: Dan Williams <djbw@fb.com>
> >> ---
> >> Hi Neil,
> >>
> >> We decided to leave the enable in for the few cases where forced-rcw
> >> outperformed rmw and there may be other cases out there.
> >>
> >> Thoughts?
> >
> > - More commentary would help.  The text at the top should explain enough so
> >   when I read the code I am just verifying the text at the top, not trying to
> >   figure out how it is supposed to work.
> 
> Ok, along the lines of:
> 
> "raid5 has supported sub-stripe writes by computing:
> 
> P_new = P_old + D0_old + D0_new
The '0' looks out of place.  All 'D's are treated the same, so I would write
it
   P_new = P_old + D_old + D_new
though that is a small point.
> 
> For raid6 we add the following:
> 
> P_new = P_old + D0_old + D0_new
> 
> Q_sub = Q_old + syndrome(D0_old)
> Q_sub = Q_old + g^0*D0_old
> Q_new = Q_old + Q_sub + g^0*D0_new
But here the different Ds are different, so I would be using "Di" or similar.
  P_new = P_old + Di_old + Di_new
and I assume the two 'Q_sub' lines are meant to mean the same thing, so let's
take the second:
  Q_sub = Q_old + g^i * Di_old
  Q_new = Q_old + Q_sub + g^i * Di_new
which looks odd as when that expands out, we add Q_old to Q_old and as  '+'
is 'xor', it disappears.  Maybe you mean:
  Q_new = Q_sub + g^i * Di_new
??
This is exactly the sort of thing I wanted to see, but I hoped it wouldn't
confuse me like it seems to be doing :-)
> 
> This has been verified with check and dual-degraded recovery operations.
Good, thanks.
> 
> > - If 'enable_rmw' really is a good idea, then it is possibly a good idea for
> >   RAID5 to and so should be a separate patch and should work for RAID4/5/6.
> >   The default for each array type may well be different, but the
> >   functionality would be the same.
> 
> Yes, although Kumar's testing has not found a test case that makes
> significant difference for raid5.  I.e. it makes sense to not
> artificially prevent raid5 from disabling rmw if the knob is there for
> raid6, but it would need a specific use case to flip it from the
> default.
Agreed.
> 
> > - Can you  explain *why* rcw is sometimes better than rmw even on large
> >   arrays? Even a fairly hand-wavy arguement would help.  And it would go in
> >   the comment at the top of the patch that adds enable_rmw.
> 
> Hand wavy argument is that rcw guarantees that the drives will be more
> busy so small sequential writes have more chance to coalesce into
> larger writes.  Kumar, other thoughts?
So it goes faster because it goes slower?  We seem to see that a lot with
RAID5/6 :-(  But that argument seems to work adequately for both.  If there
is a measured difference it would be nice to know where it comes from.  Not
essential, but nice.
I had guessed that rcw spreads the load over more devices which might help on
very busy arrays.
My concern is that rcw being faster might be due to a bug somewhere because
it is counter intuitive.
Or maybe the trade off is different for RAID6 than RAID5.  Or maybe it is
wrong for RAID5 but we haven't noticed.
We currently count the number of devices we will need to read from to satisfy
the demands of rcw and rmw, and choose the fewest.
Maybe it is more expensive to read from a device that we are about to write
to by - say - 50% because it keeps the head in the same region for longer
(or more often). So we could scale the costs appropriately.
Maybe a useful tunable would be "read-before-write penalty" rather than
"allow rmw".
With a high read-before-write penalty it would choose rcw more (as it reads
from different devices to where it writes.  With a low read-before-write
penalty it would be more likely to choose rmw where possible.
> 
> > - patch looks fairly sane assuming that it works - but I don't really know if
> >   it does.  You've reported speeds but haven't told me that you ran 'check'
> >   after doing each test and it reported no mismatches.  I suspect you did but
> >   I'd like to be told.  I'd also like to be told what role 'spare2' plays.
> 
> spare2 is holding the intermediate Q_sub result of subtracting out the
> syndrome of the old data block(s)
That makes sense.  I wonder if we should call it "Q_sub" rather than "spare2".
Maybe there is a better name for spare_page too.
....
> > The continual switching on 'subtract' make this hard to read too.  It is
> > probably a bit big to duplication ... Is there anything you can do to make it
> > easier to read?
> >
> 
> Is this any better? Kumar?
Yes thanks.  Together with the above description of what it is trying to
achieve, I can almost say I understand it :-)
Thanks,
NeilBrown
> 
> static struct dma_async_tx_descriptor *
> ops_run_rmw(struct stripe_head *sh, struct raid5_percpu *percpu,
>             struct dma_async_tx_descriptor *tx, int subtract)
> {
>         int pd_idx = sh->pd_idx;
>         int qd_idx = sh->qd_idx;
>         struct page **blocks = percpu->scribble;
>         struct page *xor_dest, *result, *src, *qresult, *qsrc;
>         struct async_submit_ctl submit;
>         dma_async_tx_callback done_fn;
>         int count;
> 
>         /* the spare pages hold the intermediate P and Q results */
>         if (subtract) {
>                 result = percpu->spare_page;
>                 src = sh->dev[pd_idx].page;
> 
>                 qresult = percpu->spare_page2;
>                 qsrc = sh->dev[qd_idx].page;
> 
>                 /* we'll be called once again after new data arrives */
>                 done_fn = NULL;
>         } else {
>                 /* this is the final stage of the reconstruct */
>                 result = sh->dev[pd_idx].page;
>                 src = percpu->spare_page;
> 
>                 qresult = sh->dev[qd_idx].page;
>                 qsrc = percpu->spare_page2;
>                 done_fn = ops_complete_reconstruct;
>                 atomic_inc(&sh->count);
>         }
> 
>         pr_debug("%s: stripe %llu\n", __func__,
>                 (unsigned long long)sh->sector);
> 
>         count = set_rmw_syndrome_sources(blocks, sh, percpu, subtract);
> 
>         init_async_submit(&submit, ASYNC_TX_ACK, tx, NULL, NULL,
>                           to_addr_conv(sh, percpu));
>         tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
> 
>         xor_dest = blocks[0] = result;
>         blocks[1] = src;
> 
>         init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
>                           NULL, sh, to_addr_conv(sh, percpu));
>         tx = async_xor(xor_dest, blocks, 0, 2, STRIPE_SIZE, &submit);
> 
>         xor_dest = blocks[0] = qresult;
>         blocks[1] = qsrc;
> 
>         init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
>                           done_fn, sh, to_addr_conv(sh, percpu));
>         tx = async_xor(xor_dest, blocks, 0, 2, STRIPE_SIZE, &submit);
> 
>         return tx;
> }
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 828 bytes --]
^ permalink raw reply	[flat|nested] 17+ messages in thread
- * Re: [PATCH] raid5: add support for rmw writes in raid6
  2013-04-29  1:29     ` NeilBrown
@ 2013-04-29  7:10       ` David Brown
  2013-04-29 16:11         ` Kumar Sundararajan
  2013-04-29 17:54       ` Kumar Sundararajan
                         ` (2 subsequent siblings)
  3 siblings, 1 reply; 17+ messages in thread
From: David Brown @ 2013-04-29  7:10 UTC (permalink / raw)
  To: NeilBrown; +Cc: Dan Williams, linux-raid, Kumar Sundararajan
On 29/04/13 03:29, NeilBrown wrote:
> On Fri, 26 Apr 2013 14:35:27 -0700 Dan Williams <djbw@fb.com> wrote:
> 
>> On Sun, Apr 21, 2013 at 9:22 PM, NeilBrown <neilb@suse.de> wrote:
>>> On Wed, 17 Apr 2013 17:56:56 -0700 Dan Williams <djbw@fb.com> wrote:
>>>
>>>> From: Kumar Sundararajan <kumar@fb.com>
>>>>
>>>> Add support for rmw writes in raid6. This improves small write performance for most workloads.
>>>> Since there may be some configurations and workloads where rcw always outperforms rmw,
>>>> this feature is disabled by default. It can be enabled via sysfs. For example,
>>>>
>>>>       #echo 1 > /sys/block/md0/md/enable_rmw
>>>>
>>>> Signed-off-by: Kumar Sundararajan <kumar@fb.com>
>>>> Signed-off-by: Dan Williams <djbw@fb.com>
>>>> ---
>>>> Hi Neil,
>>>>
>>>> We decided to leave the enable in for the few cases where forced-rcw
>>>> outperformed rmw and there may be other cases out there.
>>>>
>>>> Thoughts?
>>>
>>> - More commentary would help.  The text at the top should explain enough so
>>>   when I read the code I am just verifying the text at the top, not trying to
>>>   figure out how it is supposed to work.
>>
>> Ok, along the lines of:
>>
>> "raid5 has supported sub-stripe writes by computing:
>>
>> P_new = P_old + D0_old + D0_new
> 
> The '0' looks out of place.  All 'D's are treated the same, so I would write
> it
>    P_new = P_old + D_old + D_new
> though that is a small point.
> 
>>
>> For raid6 we add the following:
>>
>> P_new = P_old + D0_old + D0_new
Correct (though I agree with Neil that "0" should be replaced by "i").
>>
>> Q_sub = Q_old + syndrome(D0_old)
Wrong - using this terminology, Q_sub = syndrome(D0_old)
>> Q_sub = Q_old + g^0*D0_old
>> Q_new = Q_old + Q_sub + g^0*D0_new
> 
> But here the different Ds are different, so I would be using "Di" or similar.
> 
>   P_new = P_old + Di_old + Di_new
Correct.
> 
> and I assume the two 'Q_sub' lines are meant to mean the same thing, so let's
> take the second:
> 
>   Q_sub = Q_old + g^i * Di_old
>   Q_new = Q_old + Q_sub + g^i * Di_new
> 
You've still got one "Q_old" too many, which means your algebra is wrong
(Q_new reduces here to g^i * Di_new, which is clearly wrong).  But
you've jumped to almost the right conclusion on the next line, proving
that two wrongs sometimes do make a right!
> which looks odd as when that expands out, we add Q_old to Q_old and as  '+'
> is 'xor', it disappears.  Maybe you mean:
> 
>   Q_new = Q_sub + g^i * Di_new
> ??
Q_sub = g^i * Di_old
Q_new = Q_old - Q_sub + g^i * Di_new
      = Q_old + g^i * Di_old + g^i * Di_new   (since "-" = "+")
      = Q_old + g^i * (Di_old + Di_new)
I can't say for sure (I know the maths, not the code), but I would
expect this final version do be the most efficient for the calculations.
 By xor'ing Di_old and Di_new first, then multiplying by g^i, you'll
half the number of costly g^i multiplications.
mvh.,
David
^ permalink raw reply	[flat|nested] 17+ messages in thread
- * Re: [PATCH] raid5: add support for rmw writes in raid6
  2013-04-29  7:10       ` David Brown
@ 2013-04-29 16:11         ` Kumar Sundararajan
  2013-04-29 19:28           ` David Brown
  0 siblings, 1 reply; 17+ messages in thread
From: Kumar Sundararajan @ 2013-04-29 16:11 UTC (permalink / raw)
  To: David Brown, NeilBrown; +Cc: Dan Williams, linux-raid
On 4/29/13 12:10 AM, "David Brown" <david.brown@hesbynett.no> wrote:
>On 29/04/13 03:29, NeilBrown wrote:
>> On Fri, 26 Apr 2013 14:35:27 -0700 Dan Williams <djbw@fb.com> wrote:
>> 
>>> On Sun, Apr 21, 2013 at 9:22 PM, NeilBrown <neilb@suse.de> wrote:
>>>> On Wed, 17 Apr 2013 17:56:56 -0700 Dan Williams <djbw@fb.com> wrote:
>>>>
>>>>> From: Kumar Sundararajan <kumar@fb.com>
>>>>>
>>>>> Add support for rmw writes in raid6. This improves small write
>>>>>performance for most workloads.
>>>>> Since there may be some configurations and workloads where rcw
>>>>>always outperforms rmw,
>>>>> this feature is disabled by default. It can be enabled via sysfs.
>>>>>For example,
>>>>>
>>>>>       #echo 1 > /sys/block/md0/md/enable_rmw
>>>>>
>>>>> Signed-off-by: Kumar Sundararajan <kumar@fb.com>
>>>>> Signed-off-by: Dan Williams <djbw@fb.com>
>>>>> ---
>>>>> Hi Neil,
>>>>>
>>>>> We decided to leave the enable in for the few cases where forced-rcw
>>>>> outperformed rmw and there may be other cases out there.
>>>>>
>>>>> Thoughts?
>>>>
>>>> - More commentary would help.  The text at the top should explain
>>>>enough so
>>>>   when I read the code I am just verifying the text at the top, not
>>>>trying to
>>>>   figure out how it is supposed to work.
>>>
>>> Ok, along the lines of:
>>>
>>> "raid5 has supported sub-stripe writes by computing:
>>>
>>> P_new = P_old + D0_old + D0_new
>> 
>> The '0' looks out of place.  All 'D's are treated the same, so I would
>>write
>> it
>>    P_new = P_old + D_old + D_new
>> though that is a small point.
>> 
>>>
>>> For raid6 we add the following:
>>>
>>> P_new = P_old + D0_old + D0_new
>
>Correct (though I agree with Neil that "0" should be replaced by "i").
>
>>>
>>> Q_sub = Q_old + syndrome(D0_old)
>
>Wrong - using this terminology, Q_sub = syndrome(D0_old)
>
>>> Q_sub = Q_old + g^0*D0_old
>>> Q_new = Q_old + Q_sub + g^0*D0_new
>> 
>> But here the different Ds are different, so I would be using "Di" or
>>similar.
>> 
>>   P_new = P_old + Di_old + Di_new
>
>Correct.
>
>> 
>> and I assume the two 'Q_sub' lines are meant to mean the same thing, so
>>let's
>> take the second:
>> 
>>   Q_sub = Q_old + g^i * Di_old
>>   Q_new = Q_old + Q_sub + g^i * Di_new
>> 
>
>You've still got one "Q_old" too many, which means your algebra is wrong
>(Q_new reduces here to g^i * Di_new, which is clearly wrong).  But
>you've jumped to almost the right conclusion on the next line, proving
>that two wrongs sometimes do make a right!
>
>> which looks odd as when that expands out, we add Q_old to Q_old and as
>>'+'
>> is 'xor', it disappears.  Maybe you mean:
>> 
>>   Q_new = Q_sub + g^i * Di_new
>> ??
>
>Q_sub = g^i * Di_old
>Q_new = Q_old - Q_sub + g^i * Di_new
>      = Q_old + g^i * Di_old + g^i * Di_new   (since "-" = "+")
>      = Q_old + g^i * (Di_old + Di_new)
>
>I can't say for sure (I know the maths, not the code), but I would
>expect this final version do be the most efficient for the calculations.
> By xor'ing Di_old and Di_new first, then multiplying by g^i, you'll
>half the number of costly g^i multiplications.
>
>
Yes, I did consider this. But this scheme is harder to implement with how
the rest of the code works,
especially with writes covering more than one Di.
Kumar
>
^ permalink raw reply	[flat|nested] 17+ messages in thread 
- * Re: [PATCH] raid5: add support for rmw writes in raid6
  2013-04-29 16:11         ` Kumar Sundararajan
@ 2013-04-29 19:28           ` David Brown
  2013-04-30  0:05             ` Dan Williams
  0 siblings, 1 reply; 17+ messages in thread
From: David Brown @ 2013-04-29 19:28 UTC (permalink / raw)
  To: Kumar Sundararajan; +Cc: NeilBrown, Dan Williams, linux-raid
On 29/04/13 18:11, Kumar Sundararajan wrote:
>
>
> On 4/29/13 12:10 AM, "David Brown" <david.brown@hesbynett.no> wrote:
>
>> On 29/04/13 03:29, NeilBrown wrote:
>>> On Fri, 26 Apr 2013 14:35:27 -0700 Dan Williams <djbw@fb.com> wrote:
>>>
>>>> On Sun, Apr 21, 2013 at 9:22 PM, NeilBrown <neilb@suse.de> wrote:
>>>>> On Wed, 17 Apr 2013 17:56:56 -0700 Dan Williams <djbw@fb.com> wrote:
>>>>>
>>>>>> From: Kumar Sundararajan <kumar@fb.com>
>>>>>>
>>>>>> Add support for rmw writes in raid6. This improves small write
>>>>>> performance for most workloads.
>>>>>> Since there may be some configurations and workloads where rcw
>>>>>> always outperforms rmw,
>>>>>> this feature is disabled by default. It can be enabled via sysfs.
>>>>>> For example,
>>>>>>
>>>>>>        #echo 1 > /sys/block/md0/md/enable_rmw
>>>>>>
>>>>>> Signed-off-by: Kumar Sundararajan <kumar@fb.com>
>>>>>> Signed-off-by: Dan Williams <djbw@fb.com>
>>>>>> ---
>>>>>> Hi Neil,
>>>>>>
>>>>>> We decided to leave the enable in for the few cases where forced-rcw
>>>>>> outperformed rmw and there may be other cases out there.
>>>>>>
>>>>>> Thoughts?
>>>>>
>>>>> - More commentary would help.  The text at the top should explain
>>>>> enough so
>>>>>    when I read the code I am just verifying the text at the top, not
>>>>> trying to
>>>>>    figure out how it is supposed to work.
>>>>
>>>> Ok, along the lines of:
>>>>
>>>> "raid5 has supported sub-stripe writes by computing:
>>>>
>>>> P_new = P_old + D0_old + D0_new
>>>
>>> The '0' looks out of place.  All 'D's are treated the same, so I would
>>> write
>>> it
>>>     P_new = P_old + D_old + D_new
>>> though that is a small point.
>>>
>>>>
>>>> For raid6 we add the following:
>>>>
>>>> P_new = P_old + D0_old + D0_new
>>
>> Correct (though I agree with Neil that "0" should be replaced by "i").
>>
>>>>
>>>> Q_sub = Q_old + syndrome(D0_old)
>>
>> Wrong - using this terminology, Q_sub = syndrome(D0_old)
>>
>>>> Q_sub = Q_old + g^0*D0_old
>>>> Q_new = Q_old + Q_sub + g^0*D0_new
>>>
>>> But here the different Ds are different, so I would be using "Di" or
>>> similar.
>>>
>>>    P_new = P_old + Di_old + Di_new
>>
>> Correct.
>>
>>>
>>> and I assume the two 'Q_sub' lines are meant to mean the same thing, so
>>> let's
>>> take the second:
>>>
>>>    Q_sub = Q_old + g^i * Di_old
>>>    Q_new = Q_old + Q_sub + g^i * Di_new
>>>
>>
>> You've still got one "Q_old" too many, which means your algebra is wrong
>> (Q_new reduces here to g^i * Di_new, which is clearly wrong).  But
>> you've jumped to almost the right conclusion on the next line, proving
>> that two wrongs sometimes do make a right!
>>
>>> which looks odd as when that expands out, we add Q_old to Q_old and as
>>> '+'
>>> is 'xor', it disappears.  Maybe you mean:
>>>
>>>    Q_new = Q_sub + g^i * Di_new
>>> ??
>>
>> Q_sub = g^i * Di_old
>> Q_new = Q_old - Q_sub + g^i * Di_new
>>       = Q_old + g^i * Di_old + g^i * Di_new   (since "-" = "+")
>>       = Q_old + g^i * (Di_old + Di_new)
>>
>> I can't say for sure (I know the maths, not the code), but I would
>> expect this final version do be the most efficient for the calculations.
>> By xor'ing Di_old and Di_new first, then multiplying by g^i, you'll
>> half the number of costly g^i multiplications.
>>
>>
>
> Yes, I did consider this. But this scheme is harder to implement with how
> the rest of the code works,
> especially with writes covering more than one Di.
>
>
For each data block you are changing, you will need to remove the old 
g^i * Di_old then add in the new g^i * Di_new, so you can still use this 
simplification to reduce the number of multiplies.  If you want to 
change blocks "i" and "j", you thus do:
Q_new = Q_old + g^i * (Di_old + Di_new) + g^j * (Dj_old + Dj_new)
But as I say, I only know the maths - not the code.
mvh.,
David
^ permalink raw reply	[flat|nested] 17+ messages in thread 
- * Re: [PATCH] raid5: add support for rmw writes in raid6
  2013-04-29 19:28           ` David Brown
@ 2013-04-30  0:05             ` Dan Williams
  2013-04-30  6:48               ` David Brown
  0 siblings, 1 reply; 17+ messages in thread
From: Dan Williams @ 2013-04-30  0:05 UTC (permalink / raw)
  To: David Brown; +Cc: Kumar Sundararajan, NeilBrown, linux-raid
On Mon, Apr 29, 2013 at 12:28 PM, David Brown <david.brown@hesbynett.no> wrote:
> For each data block you are changing, you will need to remove the old g^i *
> Di_old then add in the new g^i * Di_new, so you can still use this
> simplification to reduce the number of multiplies.  If you want to change
> blocks "i" and "j", you thus do:
>
> Q_new = Q_old + g^i * (Di_old + Di_new) + g^j * (Dj_old + Dj_new)
>
> But as I say, I only know the maths - not the code.
The issue is where to store those intermediate Di_old + Di_new results
without doubling the size of the stripe cache.
^ permalink raw reply	[flat|nested] 17+ messages in thread 
- * Re: [PATCH] raid5: add support for rmw writes in raid6
  2013-04-30  0:05             ` Dan Williams
@ 2013-04-30  6:48               ` David Brown
  2013-04-30 16:01                 ` Kumar Sundararajan
  0 siblings, 1 reply; 17+ messages in thread
From: David Brown @ 2013-04-30  6:48 UTC (permalink / raw)
  To: Dan Williams; +Cc: Kumar Sundararajan, NeilBrown, linux-raid
On 30/04/13 02:05, Dan Williams wrote:
> On Mon, Apr 29, 2013 at 12:28 PM, David Brown <david.brown@hesbynett.no> wrote:
>> For each data block you are changing, you will need to remove the old g^i *
>> Di_old then add in the new g^i * Di_new, so you can still use this
>> simplification to reduce the number of multiplies.  If you want to change
>> blocks "i" and "j", you thus do:
>>
>> Q_new = Q_old + g^i * (Di_old + Di_new) + g^j * (Dj_old + Dj_new)
>>
>> But as I say, I only know the maths - not the code.
> 
> The issue is where to store those intermediate Di_old + Di_new results
> without doubling the size of the stripe cache.
> 
(As before, I haven't looked at the code.  I justify my laziness by
claiming that I might come up with fresh ideas without how to implement
things.  But only you folks can say what takes more work, and what is
riskier in the code.)
I don't see that you would need to double the size of the stripe cache.
 You might need an extra few block spaces, but not double the cache.
Also, once you have done this calculation (assuming you did the easy
P_new first), you no longer need to keep Di_old lying around - it's
going to be replaced with the new stripe data.  So maybe you can do the
operation as "Di_old += Di_new" - i.e., in place and without using more
memory.  That is going to be faster too, as it is more cache friendly.
On the other hand, it might involve more locking or other tracking
mechanisms to avoid problems if something else is trying to access the
same caches.
^ permalink raw reply	[flat|nested] 17+ messages in thread 
- * Re: [PATCH] raid5: add support for rmw writes in raid6
  2013-04-30  6:48               ` David Brown
@ 2013-04-30 16:01                 ` Kumar Sundararajan
  2013-04-30 16:10                   ` Kumar Sundararajan
  0 siblings, 1 reply; 17+ messages in thread
From: Kumar Sundararajan @ 2013-04-30 16:01 UTC (permalink / raw)
  To: David Brown, Dan Williams; +Cc: NeilBrown, linux-raid
On 4/29/13 11:48 PM, "David Brown" <david.brown@hesbynett.no> wrote:
>On 30/04/13 02:05, Dan Williams wrote:
>> On Mon, Apr 29, 2013 at 12:28 PM, David Brown
>><david.brown@hesbynett.no> wrote:
>>> For each data block you are changing, you will need to remove the old
>>>g^i *
>>> Di_old then add in the new g^i * Di_new, so you can still use this
>>> simplification to reduce the number of multiplies.  If you want to
>>>change
>>> blocks "i" and "j", you thus do:
>>>
>>> Q_new = Q_old + g^i * (Di_old + Di_new) + g^j * (Dj_old + Dj_new)
>>>
>>> But as I say, I only know the maths - not the code.
>> 
>> The issue is where to store those intermediate Di_old + Di_new results
>> without doubling the size of the stripe cache.
>> 
>
>(As before, I haven't looked at the code.  I justify my laziness by
>claiming that I might come up with fresh ideas without how to implement
>things.  But only you folks can say what takes more work, and what is
>riskier in the code.)
>
>I don't see that you would need to double the size of the stripe cache.
> You might need an extra few block spaces, but not double the cache.
>
>Also, once you have done this calculation (assuming you did the easy
>P_new first), you no longer need to keep Di_old lying around - it's
>going to be replaced with the new stripe data.  So maybe you can do the
>operation as "Di_old += Di_new" - i.e., in place and without using more
>memory.  That is going to be faster too, as it is more cache friendly.
>On the other hand, it might involve more locking or other tracking
>mechanisms to avoid problems if something else is trying to access the
>same caches.
>
Yes, I had seen your earlier mails on this topic and implemented it this
way initially.
However, this required us to either ask for stable pages or allocate an
extra "spare" page
for each disk in the array to hold the intermediate results. We chose to
use the scheme
described earlier since it requires only one extra spare page per array
and we didn't see
any appreciable increase in cpu usage on our setup.
>
^ permalink raw reply	[flat|nested] 17+ messages in thread 
- * Re: [PATCH] raid5: add support for rmw writes in raid6
  2013-04-30 16:01                 ` Kumar Sundararajan
@ 2013-04-30 16:10                   ` Kumar Sundararajan
  2013-04-30 18:39                     ` David Brown
  0 siblings, 1 reply; 17+ messages in thread
From: Kumar Sundararajan @ 2013-04-30 16:10 UTC (permalink / raw)
  To: David Brown, Dan Williams; +Cc: NeilBrown, linux-raid
On 4/30/13 9:01 AM, "Kumar Sundararajan" <kumar@fb.com> wrote:
>
>
>On 4/29/13 11:48 PM, "David Brown" <david.brown@hesbynett.no> wrote:
>
>>On 30/04/13 02:05, Dan Williams wrote:
>>> On Mon, Apr 29, 2013 at 12:28 PM, David Brown
>>><david.brown@hesbynett.no> wrote:
>>>> For each data block you are changing, you will need to remove the old
>>>>g^i *
>>>> Di_old then add in the new g^i * Di_new, so you can still use this
>>>> simplification to reduce the number of multiplies.  If you want to
>>>>change
>>>> blocks "i" and "j", you thus do:
>>>>
>>>> Q_new = Q_old + g^i * (Di_old + Di_new) + g^j * (Dj_old + Dj_new)
>>>>
>>>> But as I say, I only know the maths - not the code.
>>> 
>>> The issue is where to store those intermediate Di_old + Di_new results
>>> without doubling the size of the stripe cache.
>>> 
>>
>>(As before, I haven't looked at the code.  I justify my laziness by
>>claiming that I might come up with fresh ideas without how to implement
>>things.  But only you folks can say what takes more work, and what is
>>riskier in the code.)
>>
>>I don't see that you would need to double the size of the stripe cache.
>> You might need an extra few block spaces, but not double the cache.
>>
>>Also, once you have done this calculation (assuming you did the easy
>>P_new first), you no longer need to keep Di_old lying around - it's
>>going to be replaced with the new stripe data.  So maybe you can do the
>>operation as "Di_old += Di_new" - i.e., in place and without using more
>>memory.  That is going to be faster too, as it is more cache friendly.
>>On the other hand, it might involve more locking or other tracking
>>mechanisms to avoid problems if something else is trying to access the
>>same caches.
>>
>
>Yes, I had seen your earlier mails on this topic and implemented it this
>way initially.
>However, this required us to either ask for stable pages or allocate an
>extra "spare" page
>for each disk in the array to hold the intermediate results.
Sorry, that should read -- one extra "spare" page per cpu for each disk
in the array.
>
>>
>
^ permalink raw reply	[flat|nested] 17+ messages in thread 
- * Re: [PATCH] raid5: add support for rmw writes in raid6
  2013-04-30 16:10                   ` Kumar Sundararajan
@ 2013-04-30 18:39                     ` David Brown
  0 siblings, 0 replies; 17+ messages in thread
From: David Brown @ 2013-04-30 18:39 UTC (permalink / raw)
  To: Kumar Sundararajan; +Cc: Dan Williams, NeilBrown, linux-raid
On 30/04/13 18:10, Kumar Sundararajan wrote:
> 
> 
> On 4/30/13 9:01 AM, "Kumar Sundararajan" <kumar@fb.com> wrote:
> 
>>
>>
>> On 4/29/13 11:48 PM, "David Brown" <david.brown@hesbynett.no> wrote:
>>
>>> On 30/04/13 02:05, Dan Williams wrote:
>>>> On Mon, Apr 29, 2013 at 12:28 PM, David Brown
>>>> <david.brown@hesbynett.no> wrote:
>>>>> For each data block you are changing, you will need to remove the old
>>>>> g^i *
>>>>> Di_old then add in the new g^i * Di_new, so you can still use this
>>>>> simplification to reduce the number of multiplies.  If you want to
>>>>> change
>>>>> blocks "i" and "j", you thus do:
>>>>>
>>>>> Q_new = Q_old + g^i * (Di_old + Di_new) + g^j * (Dj_old + Dj_new)
>>>>>
>>>>> But as I say, I only know the maths - not the code.
>>>>
>>>> The issue is where to store those intermediate Di_old + Di_new results
>>>> without doubling the size of the stripe cache.
>>>>
>>>
>>> (As before, I haven't looked at the code.  I justify my laziness by
>>> claiming that I might come up with fresh ideas without how to implement
>>> things.  But only you folks can say what takes more work, and what is
>>> riskier in the code.)
>>>
>>> I don't see that you would need to double the size of the stripe cache.
>>> You might need an extra few block spaces, but not double the cache.
>>>
>>> Also, once you have done this calculation (assuming you did the easy
>>> P_new first), you no longer need to keep Di_old lying around - it's
>>> going to be replaced with the new stripe data.  So maybe you can do the
>>> operation as "Di_old += Di_new" - i.e., in place and without using more
>>> memory.  That is going to be faster too, as it is more cache friendly.
>>> On the other hand, it might involve more locking or other tracking
>>> mechanisms to avoid problems if something else is trying to access the
>>> same caches.
>>>
>>
>> Yes, I had seen your earlier mails on this topic and implemented it this
>> way initially.
>> However, this required us to either ask for stable pages or allocate an
>> extra "spare" page
>> for each disk in the array to hold the intermediate results.
> 
> Sorry, that should read -- one extra "spare" page per cpu for each disk
> in the array.
> 
Fair enough.  I suppose the "multiple by g^i" operation will be done
with a table lookup, and it's going to be pretty fast on a modern cpu
since the table will be entirely in L1 cache.
^ permalink raw reply	[flat|nested] 17+ messages in thread 
 
 
 
 
 
 
 
- * Re: [PATCH] raid5: add support for rmw writes in raid6
  2013-04-29  1:29     ` NeilBrown
  2013-04-29  7:10       ` David Brown
@ 2013-04-29 17:54       ` Kumar Sundararajan
  2013-04-30 21:32       ` Dan Williams
  2013-05-08 17:42       ` Dan Williams
  3 siblings, 0 replies; 17+ messages in thread
From: Kumar Sundararajan @ 2013-04-29 17:54 UTC (permalink / raw)
  To: NeilBrown, Dan Williams; +Cc: linux-raid
On 4/28/13 6:29 PM, "NeilBrown" <neilb@suse.de> wrote:
>On Fri, 26 Apr 2013 14:35:27 -0700 Dan Williams <djbw@fb.com> wrote:
>
>> On Sun, Apr 21, 2013 at 9:22 PM, NeilBrown <neilb@suse.de> wrote:
>> > On Wed, 17 Apr 2013 17:56:56 -0700 Dan Williams <djbw@fb.com> wrote:
>> >
>> >> From: Kumar Sundararajan <kumar@fb.com>
>> >>
>> >> Add support for rmw writes in raid6. This improves small write
>>performance for most workloads.
>> >> Since there may be some configurations and workloads where rcw
>>always outperforms rmw,
>> >> this feature is disabled by default. It can be enabled via sysfs.
>>For example,
>> >>
>> >>       #echo 1 > /sys/block/md0/md/enable_rmw
>> >>
>> >> Signed-off-by: Kumar Sundararajan <kumar@fb.com>
>> >> Signed-off-by: Dan Williams <djbw@fb.com>
>> >> ---
>> >> Hi Neil,
>> >>
>> >> We decided to leave the enable in for the few cases where forced-rcw
>> >> outperformed rmw and there may be other cases out there.
>> >>
>> >> Thoughts?
>> >
>> > - More commentary would help.  The text at the top should explain
>>enough so
>> >   when I read the code I am just verifying the text at the top, not
>>trying to
>> >   figure out how it is supposed to work.
>> 
>> Ok, along the lines of:
>> 
>> "raid5 has supported sub-stripe writes by computing:
>> 
>> P_new = P_old + D0_old + D0_new
>
>The '0' looks out of place.  All 'D's are treated the same, so I would
>write
>it
>   P_new = P_old + D_old + D_new
>though that is a small point.
>
>> 
>> For raid6 we add the following:
>> 
>> P_new = P_old + D0_old + D0_new
>> 
>> Q_sub = Q_old + syndrome(D0_old)
>> Q_sub = Q_old + g^0*D0_old
>> Q_new = Q_old + Q_sub + g^0*D0_new
>
>But here the different Ds are different, so I would be using "Di" or
>similar.
>
>  P_new = P_old + Di_old + Di_new
>
>and I assume the two 'Q_sub' lines are meant to mean the same thing, so
>let's
>take the second:
>
>  Q_sub = Q_old + g^i * Di_old
>  Q_new = Q_old + Q_sub + g^i * Di_new
>
>which looks odd as when that expands out, we add Q_old to Q_old and as
>'+'
>is 'xor', it disappears.  Maybe you mean:
>
>  Q_new = Q_sub + g^i * Di_new
>??
>
>This is exactly the sort of thing I wanted to see, but I hoped it wouldn't
>confuse me like it seems to be doing :-)
>
>
>> 
>> This has been verified with check and dual-degraded recovery operations.
>
>Good, thanks.
>
>> 
>> > - If 'enable_rmw' really is a good idea, then it is possibly a good
>>idea for
>> >   RAID5 to and so should be a separate patch and should work for
>>RAID4/5/6.
>> >   The default for each array type may well be different, but the
>> >   functionality would be the same.
>> 
>> Yes, although Kumar's testing has not found a test case that makes
>> significant difference for raid5.  I.e. it makes sense to not
>> artificially prevent raid5 from disabling rmw if the knob is there for
>> raid6, but it would need a specific use case to flip it from the
>> default.
>
>Agreed.
>
>> 
>> > - Can you  explain *why* rcw is sometimes better than rmw even on
>>large
>> >   arrays? Even a fairly hand-wavy arguement would help.  And it would
>>go in
>> >   the comment at the top of the patch that adds enable_rmw.
>> 
>> Hand wavy argument is that rcw guarantees that the drives will be more
>> busy so small sequential writes have more chance to coalesce into
>> larger writes.  Kumar, other thoughts?
>
>So it goes faster because it goes slower?  We seem to see that a lot with
>RAID5/6 :-(  But that argument seems to work adequately for both.  If
>there
>is a measured difference it would be nice to know where it comes from.
>Not
>essential, but nice.
>
>I had guessed that rcw spreads the load over more devices which might
>help on
>very busy arrays.
>
>My concern is that rcw being faster might be due to a bug somewhere
>because
>it is counter intuitive.
>Or maybe the trade off is different for RAID6 than RAID5.  Or maybe it is
>wrong for RAID5 but we haven't noticed.
>
>We currently count the number of devices we will need to read from to
>satisfy
>the demands of rcw and rmw, and choose the fewest.
>Maybe it is more expensive to read from a device that we are about to
>write
>to by - say - 50% because it keeps the head in the same region for longer
>(or more often). So we could scale the costs appropriately.
>Maybe a useful tunable would be "read-before-write penalty" rather than
>"allow rmw".
>With a high read-before-write penalty it would choose rcw more (as it
>reads
>from different devices to where it writes.  With a low read-before-write
>penalty it would be more likely to choose rmw where possible.
Performance with rmw + RAID6 also seems to depend on the workload.
With our setup -- 10+2 7200 RPM disks and 256K chunk size + internal
bitmap, we see
70-80% gains for purely random write workloads. With the same setup, 64K
sequential writes
perform about the same while smaller sequential writes are faster with rcw
-- 
the degradation with rmw is about 5-6%. In all cases, disk utilization is
lower with rmw.
Our thinking behind adding an "enable_rmw" flag was that it seems safe to
enable
rmw for raid6 when the write workload is known to be random. With
sequential writes,
the user can either leave it off (the default) or turn it on if they see
any
benefit during testing.
As to why rcw is faster sometimes, one reason for this maybe the higher
coalescing
of smaller writes with rcw that Dan referred to -- we do see higher disk
utilization
and higher merge rates with rcw.
And as you point out, rmw can add extra rotational latency -- in the worst
case,
a full disk rotation is needed to write the data to the modified disk.
This can be
significant for lower RPM disks.
What puzzles me is that these arguments should apply to raid5 too but we
don't
see similar degradation with rmw.
>
>> 
>> > - patch looks fairly sane assuming that it works - but I don't really
>>know if
>> >   it does.  You've reported speeds but haven't told me that you ran
>>'check'
>> >   after doing each test and it reported no mismatches.  I suspect you
>>did but
>> >   I'd like to be told.  I'd also like to be told what role 'spare2'
>>plays.
>> 
>> spare2 is holding the intermediate Q_sub result of subtracting out the
>> syndrome of the old data block(s)
>
>That makes sense.  I wonder if we should call it "Q_sub" rather than
>"spare2".
>Maybe there is a better name for spare_page too.
>
>....
>> > The continual switching on 'subtract' make this hard to read too.  It
>>is
>> > probably a bit big to duplication ... Is there anything you can do to
>>make it
>> > easier to read?
>> >
>> 
>> Is this any better? Kumar?
>
>Yes thanks.  Together with the above description of what it is trying to
>achieve, I can almost say I understand it :-)
>
>Thanks,
>NeilBrown
>
>
>> 
>> static struct dma_async_tx_descriptor *
>> ops_run_rmw(struct stripe_head *sh, struct raid5_percpu *percpu,
>>             struct dma_async_tx_descriptor *tx, int subtract)
>> {
>>         int pd_idx = sh->pd_idx;
>>         int qd_idx = sh->qd_idx;
>>         struct page **blocks = percpu->scribble;
>>         struct page *xor_dest, *result, *src, *qresult, *qsrc;
>>         struct async_submit_ctl submit;
>>         dma_async_tx_callback done_fn;
>>         int count;
>> 
>>         /* the spare pages hold the intermediate P and Q results */
>>         if (subtract) {
>>                 result = percpu->spare_page;
>>                 src = sh->dev[pd_idx].page;
>> 
>>                 qresult = percpu->spare_page2;
>>                 qsrc = sh->dev[qd_idx].page;
>> 
>>                 /* we'll be called once again after new data arrives */
>>                 done_fn = NULL;
>>         } else {
>>                 /* this is the final stage of the reconstruct */
>>                 result = sh->dev[pd_idx].page;
>>                 src = percpu->spare_page;
>> 
>>                 qresult = sh->dev[qd_idx].page;
>>                 qsrc = percpu->spare_page2;
>>                 done_fn = ops_complete_reconstruct;
>>                 atomic_inc(&sh->count);
>>         }
>> 
>>         pr_debug("%s: stripe %llu\n", __func__,
>>                 (unsigned long long)sh->sector);
>> 
>>         count = set_rmw_syndrome_sources(blocks, sh, percpu, subtract);
>> 
>>         init_async_submit(&submit, ASYNC_TX_ACK, tx, NULL, NULL,
>>                           to_addr_conv(sh, percpu));
>>         tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE,
>>&submit);
>> 
>>         xor_dest = blocks[0] = result;
>>         blocks[1] = src;
>> 
>>         init_async_submit(&submit,
>>ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
>>                           NULL, sh, to_addr_conv(sh, percpu));
>>         tx = async_xor(xor_dest, blocks, 0, 2, STRIPE_SIZE, &submit);
>> 
>>         xor_dest = blocks[0] = qresult;
>>         blocks[1] = qsrc;
>> 
>>         init_async_submit(&submit,
>>ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
>>                           done_fn, sh, to_addr_conv(sh, percpu));
>>         tx = async_xor(xor_dest, blocks, 0, 2, STRIPE_SIZE, &submit);
>> 
>>         return tx;
>> }
>
^ permalink raw reply	[flat|nested] 17+ messages in thread
- * Re: [PATCH] raid5: add support for rmw writes in raid6
  2013-04-29  1:29     ` NeilBrown
  2013-04-29  7:10       ` David Brown
  2013-04-29 17:54       ` Kumar Sundararajan
@ 2013-04-30 21:32       ` Dan Williams
  2013-05-01 13:57         ` David Brown
  2013-05-08 17:42       ` Dan Williams
  3 siblings, 1 reply; 17+ messages in thread
From: Dan Williams @ 2013-04-30 21:32 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid, Kumar Sundararajan, David Brown
On Sun, Apr 28, 2013 at 6:29 PM, NeilBrown <neilb@suse.de> wrote:
> On Fri, 26 Apr 2013 14:35:27 -0700 Dan Williams <djbw@fb.com> wrote:
[..]
> which looks odd as when that expands out, we add Q_old to Q_old and as  '+'
> is 'xor', it disappears.  Maybe you mean:
>
>   Q_new = Q_sub + g^i * Di_new
> ??
>
> This is exactly the sort of thing I wanted to see, but I hoped it wouldn't
> confuse me like it seems to be doing :-)
Sorry, yes for completeness the approach includes all the modified
data blocks (Di...Dm) and zero pages for the unmodified blocks in the
gen_syndrome.
Q_sub = Q_old + g^i * Di_old ... + g^m * Dm_old + (zero blocks)
Q_new = Q_sub + g^i *Di_new ... + g^m * Dm_new + (zero blocks)
For David, we're using vector processing (SSE) in gen_syndrome()
rather than individual multiplications by table lookup.
--
Dan
^ permalink raw reply	[flat|nested] 17+ messages in thread 
- * Re: [PATCH] raid5: add support for rmw writes in raid6
  2013-04-30 21:32       ` Dan Williams
@ 2013-05-01 13:57         ` David Brown
  0 siblings, 0 replies; 17+ messages in thread
From: David Brown @ 2013-05-01 13:57 UTC (permalink / raw)
  To: Dan Williams; +Cc: NeilBrown, linux-raid, Kumar Sundararajan
On 30/04/13 23:32, Dan Williams wrote:
> On Sun, Apr 28, 2013 at 6:29 PM, NeilBrown <neilb@suse.de> wrote:
>> On Fri, 26 Apr 2013 14:35:27 -0700 Dan Williams <djbw@fb.com> wrote:
> [..]
>> which looks odd as when that expands out, we add Q_old to Q_old and as  '+'
>> is 'xor', it disappears.  Maybe you mean:
>>
>>    Q_new = Q_sub + g^i * Di_new
>> ??
>>
>> This is exactly the sort of thing I wanted to see, but I hoped it wouldn't
>> confuse me like it seems to be doing :-)
>
> Sorry, yes for completeness the approach includes all the modified
> data blocks (Di...Dm) and zero pages for the unmodified blocks in the
> gen_syndrome.
>
> Q_sub = Q_old + g^i * Di_old ... + g^m * Dm_old + (zero blocks)
> Q_new = Q_sub + g^i *Di_new ... + g^m * Dm_new + (zero blocks)
>
>
> For David, we're using vector processing (SSE) in gen_syndrome()
> rather than individual multiplications by table lookup.
>
Certainly SSE is the best way to go for multiply-by-g, and when doing a 
full Q generation, I believe it is calculated as:
Q = D_0 + g * (D_1 + g * (D_2 + g * ...))
But in this case, if you want to do a rmw of disk 10, you have 9 
multiply-by-g operations in a row.  Is that really faster than doing a 
single multiply-by-g^9 table lookup?
Of course, getting the very fastest implementation is not the most 
important thing.  Avoiding extra disk access will make orders of 
magnitude more difference than the choice of parity algorithm, and I 
could imagine that using the existing SSE system will mean fewer code 
changes.
^ permalink raw reply	[flat|nested] 17+ messages in thread 
 
- * Re: [PATCH] raid5: add support for rmw writes in raid6
  2013-04-29  1:29     ` NeilBrown
                         ` (2 preceding siblings ...)
  2013-04-30 21:32       ` Dan Williams
@ 2013-05-08 17:42       ` Dan Williams
  3 siblings, 0 replies; 17+ messages in thread
From: Dan Williams @ 2013-05-08 17:42 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid, Kumar Sundararajan, Li Shaohua
On Sun, Apr 28, 2013 at 6:29 PM, NeilBrown <neilb@suse.de> wrote:
> On Fri, 26 Apr 2013 14:35:27 -0700 Dan Williams <djbw@fb.com> wrote:
[..]
>>
>> This has been verified with check and dual-degraded recovery operations.
>
> Good, thanks.
However we are running into hangs when running it on top of your
md/for-next branch with the threading and lockless changes, still
looking into it...
--
Dan
^ permalink raw reply	[flat|nested] 17+ messages in thread
 
 
 
end of thread, other threads:[~2013-05-08 17:42 UTC | newest]
Thread overview: 17+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2013-04-18  0:56 [PATCH] raid5: add support for rmw writes in raid6 Dan Williams
2013-04-18 18:40 ` Dan Williams
2013-04-22  4:22 ` NeilBrown
2013-04-26 21:35   ` Dan Williams
2013-04-29  1:29     ` NeilBrown
2013-04-29  7:10       ` David Brown
2013-04-29 16:11         ` Kumar Sundararajan
2013-04-29 19:28           ` David Brown
2013-04-30  0:05             ` Dan Williams
2013-04-30  6:48               ` David Brown
2013-04-30 16:01                 ` Kumar Sundararajan
2013-04-30 16:10                   ` Kumar Sundararajan
2013-04-30 18:39                     ` David Brown
2013-04-29 17:54       ` Kumar Sundararajan
2013-04-30 21:32       ` Dan Williams
2013-05-01 13:57         ` David Brown
2013-05-08 17:42       ` Dan Williams
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).