* [RFC 1/2] raid1: only write mismatch sectors in sync
@ 2012-07-26  8:01 Shaohua Li
  2012-07-27 16:01 ` Jan Ceuleers
  2012-07-31  5:53 ` NeilBrown
  0 siblings, 2 replies; 24+ messages in thread
From: Shaohua Li @ 2012-07-26  8:01 UTC
  To: linux-raid; +Cc: neilb

Writing has several impacts on an SSD:
1. It wears out the flash. Frequent writes speed up the wear.
2. It increases the garbage collection burden of the SSD firmware. If no space
is left for writes, the firmware's garbage collection must free some space first.
3. It slows down subsequent writes. After an SSD has been written to some
extent (for example, the whole disk has been written once), subsequent writes
slow down significantly, because at that point almost every write triggers
garbage collection.

We want to avoid unnecessary writes as much as possible. RAID sync generally
involves a lot of unnecessary writes. For example, even when the two disks
contain no data at all, we still write the full disk size to the second disk.

To reduce writes, we always compare the data of the RAID disks and only write
the parts that mismatch. This means sync does extra read IO and memory
compares, so the scheme is very bad for hard disk RAID, and sometimes for SSD
RAID too, if the mismatching part is the majority. But in some cases it can
reduce writes a lot, and since sync is a rare operation, the extra IO/CPU
usage is then worth paying. People who want to use this feature should
understand the risk first, so the ability is off by default; a sysfs entry
can be used to enable it (an illustrative sketch of the idea follows; a
usage example appears after the patch).
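
To make the idea concrete, here is a minimal userspace sketch of
compare-before-write sync. This is not the patch's code path (the kernel
compares r1_bio pages in process_checks()); the file-based IO, the BLK
chunk size and the command-line arguments are illustrative assumptions:

/* Illustrative only: bring dst in sync with src, writing a chunk to
 * dst only when its contents differ.  This is the compare-before-write
 * trade: every chunk costs an extra read plus a memcmp(), but chunks
 * that already match cost no write at all.
 */
#include <stdio.h>
#include <string.h>

#define BLK 4096

static void sync_mismatch_only(FILE *src, FILE *dst)
{
	char a[BLK], b[BLK];
	size_t n;
	long off = 0;

	while ((n = fread(a, 1, BLK, src)) > 0) {
		size_t m = fread(b, 1, n, dst);

		if (m != n || memcmp(a, b, n) != 0) {
			/* mismatch: rewind and rewrite just this chunk */
			fseek(dst, off, SEEK_SET);
			fwrite(a, 1, n, dst);
			/* a seek is required between a write and a read
			 * on the same update stream */
			fseek(dst, off + n, SEEK_SET);
		}
		off += n;
	}
}

int main(int argc, char **argv)
{
	FILE *src, *dst;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <src> <dst>\n", argv[0]);
		return 1;
	}
	src = fopen(argv[1], "rb");
	dst = fopen(argv[2], "r+b");
	if (!src || !dst) {
		perror("fopen");
		return 1;
	}
	sync_mismatch_only(src, dst);
	fclose(src);
	fclose(dst);
	return 0;
}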

Signed-off-by: Shaohua Li <shli@fusionio.com>
---
 drivers/md/md.c    |   41 +++++++++++++++++++++++++++++++
 drivers/md/md.h    |    3 ++
 drivers/md/raid1.c |   70 +++++++++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 110 insertions(+), 4 deletions(-)

Index: linux/drivers/md/md.h
===================================================================
--- linux.orig/drivers/md/md.h	2012-07-25 13:51:00.353775521 +0800
+++ linux/drivers/md/md.h	2012-07-26 10:36:38.500740552 +0800
@@ -325,6 +325,9 @@ struct mddev {
 #define	MD_RECOVERY_FROZEN	9
 
 	unsigned long			recovery;
+#define MD_RECOVERY_MODE_REPAIR		0
+#define MD_RECOVERY_MODE_DISCARD	1
+	unsigned long			recovery_mode;
 	/* If a RAID personality determines that recovery (of a particular
 	 * device) will fail due to a read error on the source device, it
 	 * takes a copy of this number and does not attempt recovery again
Index: linux/drivers/md/raid1.c
===================================================================
--- linux.orig/drivers/md/raid1.c	2012-07-25 13:51:00.365775374 +0800
+++ linux/drivers/md/raid1.c	2012-07-26 10:34:10.658595244 +0800
@@ -102,7 +102,8 @@ static void * r1buf_pool_alloc(gfp_t gfp
 	 * If this is a user-requested check/repair, allocate
 	 * RESYNC_PAGES for each bio.
 	 */
-	if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery))
+	if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery) ||
+	    test_bit(MD_RECOVERY_MODE_REPAIR, &pi->mddev->recovery_mode))
 		j = pi->raid_disks;
 	else
 		j = 1;
@@ -118,7 +119,8 @@ static void * r1buf_pool_alloc(gfp_t gfp
 		}
 	}
 	/* If not user-requests, copy the page pointers to all bios */
-	if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
+	if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery) &&
+	    !test_bit(MD_RECOVERY_MODE_REPAIR, &pi->mddev->recovery_mode)) {
 		for (i=0; i<RESYNC_PAGES ; i++)
 			for (j=1; j<pi->raid_disks; j++)
 				r1_bio->bios[j]->bi_io_vec[i].bv_page =
@@ -1556,6 +1558,38 @@ static void end_sync_write(struct bio *b
 	}
 }
 
+static void end_repair_read(struct bio *bio, int error, int write)
+{
+	struct r1bio *r1_bio = bio->bi_private;
+	struct r1conf *conf;
+	int i;
+
+	/* process_checks() will re-setup the bio */
+	if (write)
+		bio->bi_end_io = end_sync_write;
+	else
+		bio->bi_end_io = end_sync_read;
+
+	conf = r1_bio->mddev->private;
+	for (i = 0; i < conf->raid_disks * 2; i++)
+		if (r1_bio->bios[i] == bio)
+			break;
+	update_head_pos(i, r1_bio);
+
+	if (atomic_dec_and_test(&r1_bio->remaining))
+		reschedule_retry(r1_bio);
+}
+
+static void end_repair_read_for_write(struct bio *bio, int error)
+{
+	end_repair_read(bio, error, 1);
+}
+
+static void end_repair_read_for_read(struct bio *bio, int error)
+{
+	end_repair_read(bio, error, 0);
+}
+
 static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector,
 			    int sectors, struct page *page, int rw)
 {
@@ -1718,6 +1752,8 @@ static int process_checks(struct r1bio *
 			rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
 			break;
 		}
+	if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+		primary = r1_bio->read_disk;
 	r1_bio->read_disk = primary;
 	vcnt = (r1_bio->sectors + PAGE_SIZE / 512 - 1) >> (PAGE_SHIFT - 9);
 	for (i = 0; i < conf->raid_disks * 2; i++) {
@@ -1726,7 +1762,9 @@ static int process_checks(struct r1bio *
 		struct bio *sbio = r1_bio->bios[i];
 		int size;
 
-		if (r1_bio->bios[i]->bi_end_io != end_sync_read)
+		if (sbio->bi_end_io != end_sync_read &&
+		    !(sbio->bi_end_io == end_sync_write &&
+		      test_bit(MD_RECOVERY_MODE_REPAIR, &mddev->recovery_mode)))
 			continue;
 
 		if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
@@ -1761,6 +1799,7 @@ static int process_checks(struct r1bio *
 		sbio->bi_sector = r1_bio->sector +
 			conf->mirrors[i].rdev->data_offset;
 		sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
+
 		size = sbio->bi_size;
 		for (j = 0; j < vcnt ; j++) {
 			struct bio_vec *bi;
@@ -1793,7 +1832,8 @@ static void sync_request_write(struct md
 		if (!fix_sync_read_error(r1_bio))
 			return;
 
-	if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+	if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) ||
+	    test_bit(MD_RECOVERY_MODE_REPAIR, &mddev->recovery_mode))
 		if (process_checks(r1_bio) < 0)
 			return;
 	/*
@@ -2491,6 +2531,28 @@ static sector_t sync_request(struct mdde
 				md_sync_acct(bio->bi_bdev, nr_sectors);
 				generic_make_request(bio);
 			}
+		}
+	} else if (test_bit(MD_RECOVERY_MODE_REPAIR, &mddev->recovery_mode)) {
+		atomic_set(&r1_bio->remaining, write_targets + 1);
+		for (i = 0; i < conf->raid_disks * 2; i++) {
+			int do_io = 0;
+
+			bio = r1_bio->bios[i];
+			if (bio->bi_end_io == end_sync_write) {
+				bio->bi_rw = READ;
+				bio->bi_end_io = end_repair_read_for_write;
+				do_io = 1;
+			}
+			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
+			    bio->bi_end_io == end_sync_read &&
+			    i != r1_bio->read_disk) {
+				bio->bi_end_io = end_repair_read_for_read;
+				do_io = 1;
+			}
+			if (i == r1_bio->read_disk || do_io) {
+				md_sync_acct(bio->bi_bdev, nr_sectors);
+				generic_make_request(bio);
+			}
 		}
 	} else {
 		atomic_set(&r1_bio->remaining, 1);
Index: linux/drivers/md/md.c
===================================================================
--- linux.orig/drivers/md/md.c	2012-07-25 13:51:00.345775613 +0800
+++ linux/drivers/md/md.c	2012-07-26 10:12:13.123162321 +0800
@@ -4330,9 +4330,49 @@ mismatch_cnt_show(struct mddev *mddev, c
 		       (unsigned long long) mddev->resync_mismatches);
 }
 
+static ssize_t
+recovery_mode_show(struct mddev *mddev, char *page)
+{
+	char *type = "default";
+	if (test_bit(MD_RECOVERY_MODE_REPAIR, &mddev->recovery_mode)) {
+		type = "repair";
+		if (test_bit(MD_RECOVERY_MODE_DISCARD, &mddev->recovery_mode))
+			type = "discard";
+	}
+	return sprintf(page, "%s\n", type);
+}
+
+static ssize_t
+recovery_mode_store(struct mddev *mddev, const char *page, size_t len)
+{
+	if (!mddev->pers || !mddev->pers->sync_request)
+		return -EINVAL;
+
+	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
+		    !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
+		return -EBUSY;
+
+	if (cmd_match(page, "discard")) {
+		set_bit(MD_RECOVERY_MODE_REPAIR, &mddev->recovery_mode);
+		set_bit(MD_RECOVERY_MODE_DISCARD, &mddev->recovery_mode);
+	} else {
+		clear_bit(MD_RECOVERY_MODE_DISCARD, &mddev->recovery_mode);
+		if (cmd_match(page, "repair"))
+			set_bit(MD_RECOVERY_MODE_REPAIR, &mddev->recovery_mode);
+		else {
+			clear_bit(MD_RECOVERY_MODE_REPAIR, &mddev->recovery_mode);
+			if (!cmd_match(page, "default"))
+				return -EINVAL;
+		}
+	}
+	return len;
+}
+
 static struct md_sysfs_entry md_scan_mode =
 __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
 
+static struct md_sysfs_entry md_recovery_mode =
+__ATTR(recovery_mode, S_IRUGO|S_IWUSR, recovery_mode_show, recovery_mode_store);
 
 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
 
@@ -4732,6 +4772,7 @@ static struct attribute *md_default_attr
 
 static struct attribute *md_redundancy_attrs[] = {
 	&md_scan_mode.attr,
+	&md_recovery_mode.attr,
 	&md_mismatches.attr,
 	&md_sync_min.attr,
 	&md_sync_max.attr,

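For completeness, enabling the mode from userspace would look roughly like
the sketch below. This is not part of the patch: /dev/md0 is a placeholder
array, and the path assumes the attribute lands in /sys/block/<md>/md/ next
to sync_action; the accepted values ("default", "repair", "discard") come
from recovery_mode_store() above.

/* Usage sketch: switch an array to compare-before-write sync.
 * Device name and sysfs path are assumptions; valid values come
 * from recovery_mode_store() in the patch above.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/block/md0/md/recovery_mode", "w");

	if (!f) {
		perror("recovery_mode");
		return 1;
	}
	/* "repair": read both disks, write only mismatched sectors */
	fputs("repair\n", f);
	return fclose(f) ? 1 : 0;
}

A subsequent resync (or a "repair" written to sync_action) then runs in the
new mode; writing "default" restores the normal full-write behaviour.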

Thread overview: 24+ messages
2012-07-26  8:01 [RFC 1/2] raid1: only write mismatch sectors in sync Shaohua Li
2012-07-27 16:01 ` Jan Ceuleers
2012-07-30  0:39   ` Shaohua Li
2012-07-30  1:07     ` Roberto Spadim
2012-07-31  5:53 ` NeilBrown
2012-07-31  8:12   ` Shaohua Li
2012-09-11  0:59     ` NeilBrown
2012-09-12  5:29       ` Shaohua Li
2012-09-18  4:57         ` NeilBrown
2012-09-19  5:51           ` Shaohua Li
2012-09-19  7:16             ` NeilBrown
2012-09-20  1:56               ` Shaohua Li
2012-10-17  5:11                 ` Shaohua Li
2012-10-17 22:56                   ` NeilBrown
2012-10-18  1:17                     ` Shaohua Li
2012-10-18  1:29                       ` NeilBrown
2012-10-18  2:01                         ` Shaohua Li
2012-10-18  2:36                           ` NeilBrown
2012-10-21 17:14                             ` Michael Tokarev
2012-10-31  3:25                             ` Shaohua Li
2012-10-31  5:43                               ` NeilBrown
2012-10-31  6:05                                 ` Shaohua Li
2012-10-18  1:30                       ` kedacomkernel
2012-11-20 17:00                     ` Joseph Glanville
