* [patch 0/7 v2] MD linear/0/1/10/5 TRIM support
@ 2012-08-10 2:51 Shaohua Li
2012-08-10 2:51 ` [patch 1/7 v2] block: makes bio_split support bio without data Shaohua Li
` (8 more replies)
0 siblings, 9 replies; 19+ messages in thread
From: Shaohua Li @ 2012-08-10 2:51 UTC (permalink / raw)
To: linux-raid; +Cc: neilb, axboe, shli
This is the patchset to support MD discard, refreshed against the latest
kernel. It's pretty straightforward for raid linear/0/1/10. The raid456
discard support is tricky; please see the logs in the patches for details.

Previously when I posted these patches, people complained about SATA SSD
discard request merge errors. A patch to disable discard request merging
is now in Jens's tree, so we don't have this issue any more.
* [patch 1/7 v2] block: makes bio_split support bio without data
2012-08-10 2:51 [patch 0/7 v2] MD linear/0/1/10/5 TRIM support Shaohua Li
@ 2012-08-10 2:51 ` Shaohua Li
2012-08-10 2:51 ` [patch 2/7 v2] md: linear supports TRIM Shaohua Li
` (7 subsequent siblings)
8 siblings, 0 replies; 19+ messages in thread
From: Shaohua Li @ 2012-08-10 2:51 UTC (permalink / raw)
To: linux-raid; +Cc: neilb, axboe, shli
[-- Attachment #1: bio_split-fix.patch --]
[-- Type: text/plain, Size: 1638 bytes --]
A discard bio has no data attached, and bio_split() hits a BUG_ON for such a
bio. This makes bio_split() work for bios without data.
Signed-off-by: Shaohua Li <shli@fusionio.com>
---
fs/bio.c | 22 ++++++++++++----------
1 file changed, 12 insertions(+), 10 deletions(-)
Index: linux/fs/bio.c
===================================================================
--- linux.orig/fs/bio.c 2012-06-01 10:10:31.626395215 +0800
+++ linux/fs/bio.c 2012-06-04 16:45:17.175941457 +0800
@@ -1500,7 +1500,7 @@ struct bio_pair *bio_split(struct bio *b
trace_block_split(bdev_get_queue(bi->bi_bdev), bi,
bi->bi_sector + first_sectors);
- BUG_ON(bi->bi_vcnt != 1);
+ BUG_ON(bi->bi_vcnt != 1 && bi->bi_vcnt != 0);
BUG_ON(bi->bi_idx != 0);
atomic_set(&bp->cnt, 3);
bp->error = 0;
@@ -1510,17 +1510,19 @@ struct bio_pair *bio_split(struct bio *b
bp->bio2.bi_size -= first_sectors << 9;
bp->bio1.bi_size = first_sectors << 9;
- bp->bv1 = bi->bi_io_vec[0];
- bp->bv2 = bi->bi_io_vec[0];
- bp->bv2.bv_offset += first_sectors << 9;
- bp->bv2.bv_len -= first_sectors << 9;
- bp->bv1.bv_len = first_sectors << 9;
+ if (bi->bi_vcnt != 0) {
+ bp->bv1 = bi->bi_io_vec[0];
+ bp->bv2 = bi->bi_io_vec[0];
+ bp->bv2.bv_offset += first_sectors << 9;
+ bp->bv2.bv_len -= first_sectors << 9;
+ bp->bv1.bv_len = first_sectors << 9;
- bp->bio1.bi_io_vec = &bp->bv1;
- bp->bio2.bi_io_vec = &bp->bv2;
+ bp->bio1.bi_io_vec = &bp->bv1;
+ bp->bio2.bi_io_vec = &bp->bv2;
- bp->bio1.bi_max_vecs = 1;
- bp->bio2.bi_max_vecs = 1;
+ bp->bio1.bi_max_vecs = 1;
+ bp->bio2.bi_max_vecs = 1;
+ }
bp->bio1.bi_end_io = bio_pair_end_1;
bp->bio2.bi_end_io = bio_pair_end_2;
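For context, a rough sketch of why bi_vcnt can be 0 here: a discard bio
carries only a sector range and a byte count, never any data pages. The
construction below is modeled loosely on what blkdev_issue_discard() does in
kernels of this era; the helper name and details are illustrative, not exact
kernel code.

/* Illustrative only: a discard bio describes a range but holds no bio_vecs. */
static struct bio *make_discard_bio(struct block_device *bdev,
				    sector_t sector, sector_t nr_sects)
{
	struct bio *bio = bio_alloc(GFP_KERNEL, 1);

	if (!bio)
		return NULL;
	bio->bi_sector = sector;	/* start of the range to discard */
	bio->bi_bdev = bdev;
	bio->bi_size = nr_sects << 9;	/* byte count, but no payload pages */
	/* bi_vcnt stays 0 because bio_add_page() is never called */
	return bio;
}

bio_split() used to assert bi_vcnt == 1, so a bio like this tripped the
BUG_ON. With the patch, the bio_vec fixup is skipped entirely when there is
no payload and only bi_sector/bi_size are divided between the two halves.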
* [patch 2/7 v2] md: linear supports TRIM
2012-08-10 2:51 [patch 0/7 v2] MD linear/0/1/10/5 TRIM support Shaohua Li
2012-08-10 2:51 ` [patch 1/7 v2] block: makes bio_split support bio without data Shaohua Li
@ 2012-08-10 2:51 ` Shaohua Li
2012-08-10 2:51 ` [patch 3/7 v2] md: raid 0 " Shaohua Li
` (6 subsequent siblings)
8 siblings, 0 replies; 19+ messages in thread
From: Shaohua Li @ 2012-08-10 2:51 UTC (permalink / raw)
To: linux-raid; +Cc: neilb, axboe, shli
[-- Attachment #1: md-linear-discard-support.patch --]
[-- Type: text/plain, Size: 1621 bytes --]
This makes md linear support TRIM.
Signed-off-by: Shaohua Li <shli@fusionio.com>
---
drivers/md/linear.c | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
Index: linux/drivers/md/linear.c
===================================================================
--- linux.orig/drivers/md/linear.c 2012-06-06 09:35:56.478255641 +0800
+++ linux/drivers/md/linear.c 2012-08-10 07:08:52.816281828 +0800
@@ -138,6 +138,7 @@ static struct linear_conf *linear_conf(s
struct linear_conf *conf;
struct md_rdev *rdev;
int i, cnt;
+ bool discard_supported = false;
conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(struct dev_info),
GFP_KERNEL);
@@ -171,6 +172,8 @@ static struct linear_conf *linear_conf(s
conf->array_sectors += rdev->sectors;
cnt++;
+ if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
+ discard_supported = true;
}
if (cnt != raid_disks) {
printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n",
@@ -178,6 +181,11 @@ static struct linear_conf *linear_conf(s
goto out;
}
+ if (!discard_supported)
+ queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+ else
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+
/*
* Here we calculate the device offsets.
*/
@@ -326,6 +334,14 @@ static void linear_make_request(struct m
bio->bi_sector = bio->bi_sector - start_sector
+ tmp_dev->rdev->data_offset;
rcu_read_unlock();
+
+ if (unlikely((bio->bi_rw & REQ_DISCARD) &&
+ !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) {
+ /* Just ignore it */
+ bio_endio(bio, 0);
+ return;
+ }
+
generic_make_request(bio);
}
* [patch 3/7 v2] md: raid 0 supports TRIM
2012-08-10 2:51 [patch 0/7 v2] MD linear/0/1/10/5 TRIM support Shaohua Li
2012-08-10 2:51 ` [patch 1/7 v2] block: makes bio_split support bio without data Shaohua Li
2012-08-10 2:51 ` [patch 2/7 v2] md: linear supports TRIM Shaohua Li
@ 2012-08-10 2:51 ` Shaohua Li
2012-08-10 2:51 ` [patch 4/7 v2] md: raid 1 " Shaohua Li
` (5 subsequent siblings)
8 siblings, 0 replies; 19+ messages in thread
From: Shaohua Li @ 2012-08-10 2:51 UTC (permalink / raw)
To: linux-raid; +Cc: neilb, axboe, shli
[-- Attachment #1: md-raid0-discard-support.patch --]
[-- Type: text/plain, Size: 2433 bytes --]
This makes md raid 0 support TRIM.
Signed-off-by: Shaohua Li <shli@fusionio.com>
---
drivers/md/raid0.c | 19 ++++++++++++++++++-
1 file changed, 18 insertions(+), 1 deletion(-)
Index: linux/drivers/md/raid0.c
===================================================================
--- linux.orig/drivers/md/raid0.c 2012-06-06 09:35:56.446256044 +0800
+++ linux/drivers/md/raid0.c 2012-08-10 07:09:02.504164114 +0800
@@ -88,6 +88,7 @@ static int create_strip_zones(struct mdd
char b[BDEVNAME_SIZE];
char b2[BDEVNAME_SIZE];
struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
+ bool discard_supported = false;
if (!conf)
return -ENOMEM;
@@ -195,6 +196,9 @@ static int create_strip_zones(struct mdd
if (!smallest || (rdev1->sectors < smallest->sectors))
smallest = rdev1;
cnt++;
+
+ if (blk_queue_discard(bdev_get_queue(rdev1->bdev)))
+ discard_supported = true;
}
if (cnt != mddev->raid_disks) {
printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - "
@@ -272,6 +276,11 @@ static int create_strip_zones(struct mdd
blk_queue_io_opt(mddev->queue,
(mddev->chunk_sectors << 9) * mddev->raid_disks);
+ if (!discard_supported)
+ queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+ else
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+
pr_debug("md/raid0:%s: done.\n", mdname(mddev));
*private_conf = conf;
@@ -422,6 +431,7 @@ static int raid0_run(struct mddev *mddev
if (md_check_no_bitmap(mddev))
return -EINVAL;
blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
+ blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
/* if private is not null, we are here after takeover */
if (mddev->private == NULL) {
@@ -509,7 +519,7 @@ static void raid0_make_request(struct md
sector_t sector = bio->bi_sector;
struct bio_pair *bp;
/* Sanity check -- queue functions should prevent this happening */
- if (bio->bi_vcnt != 1 ||
+ if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) ||
bio->bi_idx != 0)
goto bad_map;
/* This is a one page bio that upper layers
@@ -535,6 +545,13 @@ static void raid0_make_request(struct md
bio->bi_sector = sector_offset + zone->dev_start +
tmp_dev->data_offset;
+ if (unlikely((bio->bi_rw & REQ_DISCARD) &&
+ !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) {
+ /* Just ignore it */
+ bio_endio(bio, 0);
+ return;
+ }
+
generic_make_request(bio);
return;
* [patch 4/7 v2] md: raid 1 supports TRIM
2012-08-10 2:51 [patch 0/7 v2] MD linear/0/1/10/5 TRIM support Shaohua Li
` (2 preceding siblings ...)
2012-08-10 2:51 ` [patch 3/7 v2] md: raid 0 " Shaohua Li
@ 2012-08-10 2:51 ` Shaohua Li
2012-08-10 2:51 ` [patch 5/7 v2] md: raid 10 " Shaohua Li
` (4 subsequent siblings)
8 siblings, 0 replies; 19+ messages in thread
From: Shaohua Li @ 2012-08-10 2:51 UTC (permalink / raw)
To: linux-raid; +Cc: neilb, axboe, shli
[-- Attachment #1: md-raid1-discard-support.patch --]
[-- Type: text/plain, Size: 3102 bytes --]
This makes md raid 1 support TRIM.
If one disk supports discard and another doesn't, or one has
discard_zeroes_data and another doesn't, the data on such disks could become
inconsistent after a discard. But this should not matter: discarded data is
useless anyway. It does add extra copying during rebuild, though.
Signed-off-by: Shaohua Li <shli@fusionio.com>
---
drivers/md/raid1.c | 20 ++++++++++++++++++--
1 file changed, 18 insertions(+), 2 deletions(-)
Index: linux/drivers/md/raid1.c
===================================================================
--- linux.orig/drivers/md/raid1.c 2012-08-10 06:55:23.914455183 +0800
+++ linux/drivers/md/raid1.c 2012-08-10 07:10:53.810764795 +0800
@@ -781,7 +781,12 @@ static void flush_pending_writes(struct
while (bio) { /* submit pending writes */
struct bio *next = bio->bi_next;
bio->bi_next = NULL;
- generic_make_request(bio);
+ if (unlikely((bio->bi_rw & REQ_DISCARD) &&
+ !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
+ /* Just ignore it */
+ bio_endio(bio, 0);
+ else
+ generic_make_request(bio);
bio = next;
}
} else
@@ -994,6 +999,7 @@ static void make_request(struct mddev *m
const int rw = bio_data_dir(bio);
const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
+ const unsigned long do_discard = (bio->bi_rw & (REQ_DISCARD | REQ_SECURE));
struct md_rdev *blocked_rdev;
struct blk_plug_cb *cb;
struct raid1_plug_cb *plug = NULL;
@@ -1295,7 +1301,7 @@ read_again:
conf->mirrors[i].rdev->data_offset);
mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
mbio->bi_end_io = raid1_end_write_request;
- mbio->bi_rw = WRITE | do_flush_fua | do_sync;
+ mbio->bi_rw = WRITE | do_flush_fua | do_sync | do_discard;
mbio->bi_private = r1_bio;
atomic_inc(&r1_bio->remaining);
@@ -1549,6 +1555,8 @@ static int raid1_add_disk(struct mddev *
clear_bit(Unmerged, &rdev->flags);
}
md_integrity_add_rdev(rdev, mddev);
+ if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
print_conf(conf);
return err;
}
@@ -2783,6 +2791,7 @@ static int run(struct mddev *mddev)
int i;
struct md_rdev *rdev;
int ret;
+ bool discard_supported = false;
if (mddev->level != 1) {
printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n",
@@ -2812,6 +2821,8 @@ static int run(struct mddev *mddev)
continue;
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
+ if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
+ discard_supported = true;
}
mddev->degraded = 0;
@@ -2846,6 +2857,11 @@ static int run(struct mddev *mddev)
mddev->queue->backing_dev_info.congested_fn = raid1_congested;
mddev->queue->backing_dev_info.congested_data = mddev;
blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec);
+
+ if (discard_supported)
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+ else
+ queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
}
ret = md_integrity_register(mddev);
* [patch 5/7 v2] md: raid 10 supports TRIM
2012-08-10 2:51 [patch 0/7 v2] MD linear/0/1/10/5 TRIM support Shaohua Li
` (3 preceding siblings ...)
2012-08-10 2:51 ` [patch 4/7 v2] md: raid 1 " Shaohua Li
@ 2012-08-10 2:51 ` Shaohua Li
2012-08-10 2:51 ` [patch 6/7 v2] MD: raid5 trim support Shaohua Li
` (3 subsequent siblings)
8 siblings, 0 replies; 19+ messages in thread
From: Shaohua Li @ 2012-08-10 2:51 UTC (permalink / raw)
To: linux-raid; +Cc: neilb, axboe, shli
[-- Attachment #1: md-raid10-discard-support.patch --]
[-- Type: text/plain, Size: 3897 bytes --]
This makes md raid 10 support TRIM.
If one disk supports discard and another doesn't, or one has
discard_zeroes_data and another doesn't, the data on such disks could become
inconsistent after a discard. But this should not matter: discarded data is
useless anyway. It does add extra copying during rebuild, though.
Signed-off-by: Shaohua Li <shli@fusionio.com>
---
drivers/md/raid10.c | 27 +++++++++++++++++++++++----
1 file changed, 23 insertions(+), 4 deletions(-)
Index: linux/drivers/md/raid10.c
===================================================================
--- linux.orig/drivers/md/raid10.c 2012-08-10 06:55:23.902455333 +0800
+++ linux/drivers/md/raid10.c 2012-08-10 07:15:04.759610157 +0800
@@ -907,7 +907,12 @@ static void flush_pending_writes(struct
while (bio) { /* submit pending writes */
struct bio *next = bio->bi_next;
bio->bi_next = NULL;
- generic_make_request(bio);
+ if (unlikely((bio->bi_rw & REQ_DISCARD) &&
+ !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
+ /* Just ignore it */
+ bio_endio(bio, 0);
+ else
+ generic_make_request(bio);
bio = next;
}
} else
@@ -1057,6 +1062,7 @@ static void make_request(struct mddev *m
const int rw = bio_data_dir(bio);
const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
+ const unsigned long do_discard = (bio->bi_rw & (REQ_DISCARD | REQ_SECURE));
unsigned long flags;
struct md_rdev *blocked_rdev;
int sectors_handled;
@@ -1077,7 +1083,7 @@ static void make_request(struct mddev *m
|| conf->prev.near_copies < conf->prev.raid_disks))) {
struct bio_pair *bp;
/* Sanity check -- queue functions should prevent this happening */
- if (bio->bi_vcnt != 1 ||
+ if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) ||
bio->bi_idx != 0)
goto bad_map;
/* This is a one page bio that upper layers
@@ -1406,7 +1412,7 @@ retry_write:
conf->mirrors[d].rdev));
mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
mbio->bi_end_io = raid10_end_write_request;
- mbio->bi_rw = WRITE | do_sync | do_fua;
+ mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
mbio->bi_private = r10_bio;
atomic_inc(&r10_bio->remaining);
@@ -1435,7 +1441,7 @@ retry_write:
conf->mirrors[d].replacement));
mbio->bi_bdev = conf->mirrors[d].replacement->bdev;
mbio->bi_end_io = raid10_end_write_request;
- mbio->bi_rw = WRITE | do_sync | do_fua;
+ mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
mbio->bi_private = r10_bio;
atomic_inc(&r10_bio->remaining);
@@ -1719,6 +1725,9 @@ static int raid10_add_disk(struct mddev
clear_bit(Unmerged, &rdev->flags);
}
md_integrity_add_rdev(rdev, mddev);
+ if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+
print_conf(conf);
return err;
}
@@ -3476,6 +3485,7 @@ static int run(struct mddev *mddev)
sector_t size;
sector_t min_offset_diff = 0;
int first = 1;
+ bool discard_supported = false;
if (mddev->private == NULL) {
conf = setup_conf(mddev);
@@ -3490,6 +3500,7 @@ static int run(struct mddev *mddev)
mddev->thread = conf->thread;
conf->thread = NULL;
+ blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
chunk_size = mddev->chunk_sectors << 9;
if (mddev->queue) {
blk_queue_io_min(mddev->queue, chunk_size);
@@ -3537,8 +3548,16 @@ static int run(struct mddev *mddev)
rdev->data_offset << 9);
disk->head_position = 0;
+
+ if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
+ discard_supported = true;
}
+ if (discard_supported)
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+ else
+ queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+
/* need to check that every block has at least one working mirror */
if (!enough(conf, -1)) {
printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
* [patch 6/7 v2] MD: raid5 trim support
2012-08-10 2:51 [patch 0/7 v2] MD linear/0/1/10/5 TRIM support Shaohua Li
` (4 preceding siblings ...)
2012-08-10 2:51 ` [patch 5/7 v2] md: raid 10 " Shaohua Li
@ 2012-08-10 2:51 ` Shaohua Li
2012-08-13 1:50 ` NeilBrown
2012-08-10 2:51 ` [patch 7/7 v2] MD: raid5 avoid unnecessary zero page for trim Shaohua Li
` (2 subsequent siblings)
8 siblings, 1 reply; 19+ messages in thread
From: Shaohua Li @ 2012-08-10 2:51 UTC (permalink / raw)
To: linux-raid; +Cc: neilb, axboe, shli
[-- Attachment #1: md-raid5-discard-support.patch --]
[-- Type: text/plain, Size: 10278 bytes --]
Discard for raid4/5/6 has limitations. If the discard request size is small,
we do the discard for one disk, but we then need to recalculate parity and
write the parity disk. To calculate parity correctly, zero_after_discard must
be guaranteed. Even when it is, we would be discarding one disk while writing
the others, which makes the parity disks wear out fast, and that doesn't make
sense. So an efficient discard for raid4/5/6 should discard all data disks
and parity disks together, which requires the write pattern to be
(A, A+chunk_size, A+chunk_size*2, ...). If A's size is smaller than
chunk_size, such a pattern is almost impossible in practice. So in this patch
I only handle the case where A's size equals chunk_size: the discard request
must be aligned to the stripe size and its size must be a multiple of the
stripe size.

Since we can only handle requests with this specific alignment and size (or
the part of a request that fits whole stripes), we can't guarantee
zero_after_discard even if zero_after_discard is true in the low-level
drives.

The block layer doesn't send down correctly aligned requests even when the
correct discard alignment is set, so I must filter them out here.

For raid4/5/6 parity calculation, if the data is 0, the parity is 0. So if
zero_after_discard is true for all disks, the data is consistent after a
discard. Otherwise, data might be lost. Consider this scenario: discard a
stripe, then write data to one disk and write the parity disk. Until then the
stripe can still be inconsistent, depending on whether data from the other
data disks or from the parity disk is used to calculate the new parity. If
that disk then breaks, we can't restore its data. So in this patch we only
enable discard support if all disks have zero_after_discard.

If a discard fails on one disk, we face a similar inconsistency issue. The
patch makes discard follow the same path as a normal write request: if the
discard fails, a resync is scheduled to make the data consistent again.
Having extra writes isn't good, but data consistency is important.

If a subsequent read/write request hits the raid5 cache of a discarded
stripe, the discarded dev pages should be zero-filled so that the data stays
consistent. This patch always zeroes the dev pages of a discarded request's
stripes. That isn't optimal because a discard request doesn't need such a
payload; the next patch avoids it.
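To make the alignment rule concrete, here is a hedged sketch of the filtering
make_request() performs below. The start of the discard range is rounded up
and the end rounded down to a whole stripe, so only complete stripes are
discarded and any unaligned remainder is silently ignored. The helper name is
invented for illustration, and plain '/' stands in for the kernel's
sector_div():

/* Illustrative sketch, not the exact kernel code. */
static void clamp_discard_to_stripes(sector_t *start, sector_t *end,
				     int chunk_sectors, int data_disks)
{
	/* one full stripe covers chunk_sectors on every data disk */
	sector_t stripe_sectors = (sector_t)chunk_sectors * data_disks;

	/* round the start up and the end down to stripe boundaries */
	*start = ((*start + stripe_sectors - 1) / stripe_sectors) *
		 stripe_sectors;
	*end = (*end / stripe_sectors) * stripe_sectors;
	/* if *start >= *end, no whole stripe is covered: discard nothing */
}

For example, with a 512k chunk (1024 sectors) and 4 data disks,
stripe_sectors is 4096; a discard covering sectors 1000..9000 is clamped to
4096..8192.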
Signed-off-by: Shaohua Li <shli@fusionio.com>
---
drivers/md/raid5.c | 130 +++++++++++++++++++++++++++++++++++++++++++++++++++--
drivers/md/raid5.h | 1
2 files changed, 127 insertions(+), 4 deletions(-)
Index: linux/drivers/md/raid5.c
===================================================================
--- linux.orig/drivers/md/raid5.c 2012-08-10 06:55:23.000000000 +0800
+++ linux/drivers/md/raid5.c 2012-08-10 07:38:49.601697576 +0800
@@ -547,6 +547,8 @@ static void ops_run_io(struct stripe_hea
rw = WRITE_FUA;
else
rw = WRITE;
+ if (test_and_clear_bit(R5_Discard, &sh->dev[i].flags))
+ rw |= REQ_DISCARD;
} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
rw = READ;
else if (test_and_clear_bit(R5_WantReplace,
@@ -1170,8 +1172,13 @@ ops_run_biodrain(struct stripe_head *sh,
set_bit(R5_WantFUA, &dev->flags);
if (wbi->bi_rw & REQ_SYNC)
set_bit(R5_SyncIO, &dev->flags);
- tx = async_copy_data(1, wbi, dev->page,
- dev->sector, tx);
+ if (wbi->bi_rw & REQ_DISCARD) {
+ memset(page_address(dev->page), 0,
+ STRIPE_SECTORS << 9);
+ set_bit(R5_Discard, &dev->flags);
+ } else
+ tx = async_copy_data(1, wbi, dev->page,
+ dev->sector, tx);
wbi = r5_next_bio(wbi, dev->sector);
}
}
@@ -1237,6 +1244,20 @@ ops_run_reconstruct5(struct stripe_head
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
+ for (i = 0; i < sh->disks; i++) {
+ if (pd_idx == i)
+ continue;
+ if (!test_bit(R5_Discard, &sh->dev[i].flags))
+ break;
+ }
+ if (i >= sh->disks) {
+ atomic_inc(&sh->count);
+ memset(page_address(sh->dev[pd_idx].page), 0,
+ STRIPE_SECTORS << 9);
+ set_bit(R5_Discard, &sh->dev[pd_idx].flags);
+ ops_complete_reconstruct(sh);
+ return;
+ }
/* check if prexor is active which means only process blocks
* that are part of a read-modify-write (written)
*/
@@ -1281,10 +1302,28 @@ ops_run_reconstruct6(struct stripe_head
{
struct async_submit_ctl submit;
struct page **blocks = percpu->scribble;
- int count;
+ int count, i;
pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
+ for (i = 0; i < sh->disks; i++) {
+ if (sh->pd_idx == i || sh->qd_idx == i)
+ continue;
+ if (!test_bit(R5_Discard, &sh->dev[i].flags))
+ break;
+ }
+ if (i >= sh->disks) {
+ atomic_inc(&sh->count);
+ memset(page_address(sh->dev[sh->pd_idx].page), 0,
+ STRIPE_SECTORS << 9);
+ memset(page_address(sh->dev[sh->qd_idx].page), 0,
+ STRIPE_SECTORS << 9);
+ set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
+ set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
+ ops_complete_reconstruct(sh);
+ return;
+ }
+
count = set_syndrome_sources(blocks, sh);
atomic_inc(&sh->count);
@@ -2353,7 +2392,7 @@ schedule_reconstruction(struct stripe_he
*/
static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
{
- struct bio **bip;
+ struct bio **bip, *orig_bi = bi;
struct r5conf *conf = sh->raid_conf;
int firstwrite=0;
@@ -2370,6 +2409,23 @@ static int add_stripe_bio(struct stripe_
* protect it.
*/
spin_lock_irq(&sh->stripe_lock);
+
+ if (bi->bi_rw & REQ_DISCARD) {
+ int i;
+ dd_idx = -1;
+ for (i = 0; i < sh->disks; i++) {
+ if (i == sh->pd_idx || i == sh->qd_idx)
+ continue;
+ if (dd_idx == -1)
+ dd_idx = i;
+ if (sh->dev[i].towrite) {
+ dd_idx = i;
+ goto overlap;
+ }
+ }
+ }
+
+again:
if (forwrite) {
bip = &sh->dev[dd_idx].towrite;
if (*bip == NULL)
@@ -2403,6 +2459,15 @@ static int add_stripe_bio(struct stripe_
if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
}
+
+ bi = orig_bi;
+ if (bi->bi_rw & REQ_DISCARD) {
+ dd_idx++;
+ while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
+ dd_idx++;
+ if (dd_idx < sh->disks)
+ goto again;
+ }
spin_unlock_irq(&sh->stripe_lock);
pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
@@ -4094,6 +4159,19 @@ static void make_request(struct mddev *m
bi->bi_next = NULL;
bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
+ /* block layer doesn't correctly do alignment even we set correct alignment */
+ if (unlikely(bi->bi_rw & REQ_DISCARD)) {
+ int stripe_sectors = conf->chunk_sectors *
+ (conf->raid_disks - conf->max_degraded);
+
+ logical_sector = (logical_sector + stripe_sectors - 1);
+ sector_div(logical_sector, stripe_sectors);
+ sector_div(last_sector, stripe_sectors);
+
+ logical_sector *= stripe_sectors;
+ last_sector *= stripe_sectors;
+ }
+
for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
DEFINE_WAIT(w);
int previous;
@@ -4114,6 +4192,11 @@ static void make_request(struct mddev *m
if (mddev->reshape_backwards
? logical_sector < conf->reshape_progress
: logical_sector >= conf->reshape_progress) {
+ /* The stripe will be reshaped soon, ignore it */
+ if (bi->bi_rw & REQ_DISCARD) {
+ spin_unlock_irq(&conf->device_lock);
+ goto next_stripe;
+ }
previous = 1;
} else {
if (mddev->reshape_backwards
@@ -4202,6 +4285,12 @@ static void make_request(struct mddev *m
finish_wait(&conf->wait_for_overlap, &w);
break;
}
+next_stripe:
+ /* For discard, we always discard one stripe */
+ if (unlikely((bi->bi_rw & REQ_DISCARD) &&
+ !((logical_sector + STRIPE_SECTORS) % conf->chunk_sectors)))
+ logical_sector += conf->chunk_sectors *
+ (conf->raid_disks - conf->max_degraded - 1);
}
remaining = raid5_dec_bi_active_stripes(bi);
@@ -5361,6 +5450,7 @@ static int run(struct mddev *mddev)
if (mddev->queue) {
int chunk_size;
+ bool discard_supported = true;
/* read-ahead size must cover two whole stripes, which
* is 2 * (datadisks) * chunksize where 'n' is the
* number of raid devices
@@ -5380,13 +5470,45 @@ static int run(struct mddev *mddev)
blk_queue_io_min(mddev->queue, chunk_size);
blk_queue_io_opt(mddev->queue, chunk_size *
(conf->raid_disks - conf->max_degraded));
+ /*
+ * We can only discard a whole stripe. It doesn't make sense to
+ * discard data disk but write parity disk
+ */
+ stripe = stripe * PAGE_SIZE;
+ mddev->queue->limits.discard_alignment = stripe;
+ mddev->queue->limits.discard_granularity = stripe;
+ /*
+ * unaligned part of discard request will be ignored, so can't
+ * guarantee discard_zerors_data
+ */
+ mddev->queue->limits.discard_zeroes_data = 0;
rdev_for_each(rdev, mddev) {
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->new_data_offset << 9);
+ /*
+ * discard_zeroes_data is required, otherwise data
+ * could be lost. Consider a scenario: discard a stripe
+ * (the stripe could be inconsistent if
+ * discard_zeroes_data is 0); write one disk of the
+ * stripe (the stripe could be inconsistent again
+ * depending on which disks are used to calculate
+ * parity); the disk is broken; The stripe data of this
+ * disk is lost.
+ */
+ if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) ||
+ !bdev_get_queue(rdev->bdev)->limits.discard_zeroes_data)
+ discard_supported = false;
}
+
+ if (discard_supported &&
+ mddev->queue->limits.max_discard_sectors >= stripe &&
+ mddev->queue->limits.discard_granularity >= stripe)
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+ else
+ queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
}
return 0;
Index: linux/drivers/md/raid5.h
===================================================================
--- linux.orig/drivers/md/raid5.h 2012-08-10 06:55:23.000000000 +0800
+++ linux/drivers/md/raid5.h 2012-08-10 07:38:49.601697576 +0800
@@ -298,6 +298,7 @@ enum r5dev_flags {
R5_WantReplace, /* We need to update the replacement, we have read
* data in, and now is a good time to write it out.
*/
+ R5_Discard, /* Discard the stripe */
};
/*
* [patch 7/7 v2] MD: raid5 avoid unnecessary zero page for trim
2012-08-10 2:51 [patch 0/7 v2] MD linear/0/1/10/5 TRIM support Shaohua Li
` (5 preceding siblings ...)
2012-08-10 2:51 ` [patch 6/7 v2] MD: raid5 trim support Shaohua Li
@ 2012-08-10 2:51 ` Shaohua Li
2012-08-13 1:51 ` [patch 0/7 v2] MD linear/0/1/10/5 TRIM support NeilBrown
2012-08-29 18:58 ` Holger Kiehl
8 siblings, 0 replies; 19+ messages in thread
From: Shaohua Li @ 2012-08-10 2:51 UTC (permalink / raw)
To: linux-raid; +Cc: neilb, axboe, shli
[-- Attachment #1: md-raid5-remove-unnecessary-memset.patch --]
[-- Type: text/plain, Size: 7593 bytes --]
We want to avoid zeroing the discarded dev pages, because the zeroing is
useless for the discard itself. But if we don't zero them, a later read/write
that hits such a page in the cache will see inconsistent data. To defer the
zeroing, we set R5_WantZeroFill on each discarded dev page. Whenever the page
is about to be accessed and the flag is set, we zero the page and clear the
flag. If the page is about to be drained or computed (overwritten), we just
clear the flag. This way the dev page data is always consistent, and since
the chance that discarded data is accessed again soon is low, zeroing the
discarded dev pages is largely avoided.
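Condensed, the pattern the patch repeats at every read path is the
test_and_clear_bit()/memset() pair from the diff below, pulled out here as a
helper purely for readability (the helper name is invented; the patch
open-codes it at each use site):

static inline void zero_fill_if_wanted(struct r5dev *dev)
{
	/*
	 * test_and_clear_bit() is atomic, so the page is zeroed at most
	 * once even if several consumers race on the same stripe.
	 */
	if (test_and_clear_bit(R5_WantZeroFill, &dev->flags))
		memset(page_address(dev->page), 0, STRIPE_SIZE);
}

Paths that are about to overwrite the page completely (draining new write
data, or storing a computed parity result) just clear the flag instead, since
zeroing first would be wasted work.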
Signed-off-by: Shaohua Li <shli@fusionio.com>
---
drivers/md/raid5.c | 83 +++++++++++++++++++++++++++++++++++++++++++----------
drivers/md/raid5.h | 1
2 files changed, 69 insertions(+), 15 deletions(-)
Index: linux/drivers/md/raid5.c
===================================================================
--- linux.orig/drivers/md/raid5.c 2012-08-10 07:38:49.000000000 +0800
+++ linux/drivers/md/raid5.c 2012-08-10 07:40:02.544779350 +0800
@@ -824,6 +824,10 @@ static void ops_run_biofill(struct strip
dev->read = rbi = dev->toread;
dev->toread = NULL;
spin_unlock_irq(&sh->stripe_lock);
+
+ if (test_and_clear_bit(R5_WantZeroFill, &dev->flags))
+ memset(page_address(dev->page), 0, STRIPE_SIZE);
+
while (rbi && rbi->bi_sector <
dev->sector + STRIPE_SECTORS) {
tx = async_copy_data(0, rbi, dev->page,
@@ -893,9 +897,16 @@ ops_run_compute5(struct stripe_head *sh,
__func__, (unsigned long long)sh->sector, target);
BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
- for (i = disks; i--; )
- if (i != target)
+ for (i = disks; i--; ) {
+ if (i != target) {
xor_srcs[count++] = sh->dev[i].page;
+ if (test_and_clear_bit(R5_WantZeroFill,
+ &sh->dev[i].flags))
+ memset(page_address(sh->dev[i].page), 0,
+ STRIPE_SIZE);
+ }
+ clear_bit(R5_WantZeroFill, &sh->dev[i].flags);
+ }
atomic_inc(&sh->count);
@@ -972,6 +983,10 @@ ops_run_compute6_1(struct stripe_head *s
atomic_inc(&sh->count);
+ for (i = 0; i < sh->disks; i++)
+ if (test_and_clear_bit(R5_WantZeroFill, &sh->dev[i].flags))
+ memset(page_address(sh->dev[i].page), 0, STRIPE_SIZE);
+
if (target == qd_idx) {
count = set_syndrome_sources(blocks, sh);
blocks[count] = NULL; /* regenerating p is not necessary */
@@ -1022,8 +1037,11 @@ ops_run_compute6_2(struct stripe_head *s
/* we need to open-code set_syndrome_sources to handle the
* slot number conversion for 'faila' and 'failb'
*/
- for (i = 0; i < disks ; i++)
+ for (i = 0; i < disks ; i++) {
blocks[i] = NULL;
+ if (test_and_clear_bit(R5_WantZeroFill, &sh->dev[i].flags))
+ memset(page_address(sh->dev[i].page), 0, STRIPE_SIZE);
+ }
count = 0;
i = d0_idx;
do {
@@ -1134,6 +1152,9 @@ ops_run_prexor(struct stripe_head *sh, s
/* Only process blocks that are known to be uptodate */
if (test_bit(R5_Wantdrain, &dev->flags))
xor_srcs[count++] = dev->page;
+ if ((i == pd_idx || test_bit(R5_Wantdrain, &dev->flags)) &&
+ test_and_clear_bit(R5_WantZeroFill, &dev->flags))
+ memset(page_address(dev->page), 0, STRIPE_SIZE);
}
init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
@@ -1173,12 +1194,13 @@ ops_run_biodrain(struct stripe_head *sh,
if (wbi->bi_rw & REQ_SYNC)
set_bit(R5_SyncIO, &dev->flags);
if (wbi->bi_rw & REQ_DISCARD) {
- memset(page_address(dev->page), 0,
- STRIPE_SECTORS << 9);
+ set_bit(R5_WantZeroFill, &dev->flags);
set_bit(R5_Discard, &dev->flags);
- } else
+ } else {
+ clear_bit(R5_WantZeroFill, &dev->flags);
tx = async_copy_data(1, wbi, dev->page,
dev->sector, tx);
+ }
wbi = r5_next_bio(wbi, dev->sector);
}
}
@@ -1252,8 +1274,7 @@ ops_run_reconstruct5(struct stripe_head
}
if (i >= sh->disks) {
atomic_inc(&sh->count);
- memset(page_address(sh->dev[pd_idx].page), 0,
- STRIPE_SECTORS << 9);
+ set_bit(R5_WantZeroFill, &sh->dev[pd_idx].flags);
set_bit(R5_Discard, &sh->dev[pd_idx].flags);
ops_complete_reconstruct(sh);
return;
@@ -1268,13 +1289,21 @@ ops_run_reconstruct5(struct stripe_head
struct r5dev *dev = &sh->dev[i];
if (dev->written)
xor_srcs[count++] = dev->page;
+ if ((i == pd_idx || dev->written) &&
+ test_and_clear_bit(R5_WantZeroFill, &dev->flags))
+ memset(page_address(dev->page), 0, STRIPE_SIZE);
}
} else {
xor_dest = sh->dev[pd_idx].page;
+ clear_bit(R5_WantZeroFill, &sh->dev[pd_idx].flags);
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
- if (i != pd_idx)
+ if (i != pd_idx) {
xor_srcs[count++] = dev->page;
+ if (test_and_clear_bit(R5_WantZeroFill, &dev->flags))
+ memset(page_address(dev->page), 0,
+ STRIPE_SIZE);
+ }
}
}
@@ -1314,16 +1343,23 @@ ops_run_reconstruct6(struct stripe_head
}
if (i >= sh->disks) {
atomic_inc(&sh->count);
- memset(page_address(sh->dev[sh->pd_idx].page), 0,
- STRIPE_SECTORS << 9);
- memset(page_address(sh->dev[sh->qd_idx].page), 0,
- STRIPE_SECTORS << 9);
+ set_bit(R5_WantZeroFill, &sh->dev[sh->pd_idx].flags);
set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
+ set_bit(R5_WantZeroFill, &sh->dev[sh->qd_idx].flags);
set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
ops_complete_reconstruct(sh);
return;
}
+ for (i = 0; i < sh->disks; i++) {
+ if (sh->pd_idx == i || sh->qd_idx == i) {
+ clear_bit(R5_WantZeroFill, &sh->dev[i].flags);
+ continue;
+ }
+ if (test_and_clear_bit(R5_WantZeroFill, &sh->dev[i].flags))
+ memset(page_address(sh->dev[i].page), 0, STRIPE_SIZE);
+ }
+
count = set_syndrome_sources(blocks, sh);
atomic_inc(&sh->count);
@@ -1364,8 +1400,13 @@ static void ops_run_check_p(struct strip
xor_dest = sh->dev[pd_idx].page;
xor_srcs[count++] = xor_dest;
for (i = disks; i--; ) {
- if (i == pd_idx || i == qd_idx)
+ if (i != qd_idx &&
+ test_and_clear_bit(R5_WantZeroFill, &sh->dev[i].flags))
+ memset(page_address(sh->dev[i].page), 0, STRIPE_SIZE);
+ if (i == pd_idx || i == qd_idx) {
+ clear_bit(R5_WantZeroFill, &sh->dev[i].flags);
continue;
+ }
xor_srcs[count++] = sh->dev[i].page;
}
@@ -1383,11 +1424,20 @@ static void ops_run_check_pq(struct stri
{
struct page **srcs = percpu->scribble;
struct async_submit_ctl submit;
- int count;
+ int count, i;
pr_debug("%s: stripe %llu checkp: %d\n", __func__,
(unsigned long long)sh->sector, checkp);
+ for (i = 0; i < sh->disks; i++) {
+ if (sh->pd_idx == i || sh->qd_idx == i) {
+ clear_bit(R5_WantZeroFill, &sh->dev[i].flags);
+ continue;
+ }
+ if (test_and_clear_bit(R5_WantZeroFill, &sh->dev[i].flags))
+ memset(page_address(sh->dev[i].page), 0, STRIPE_SIZE);
+ }
+
count = set_syndrome_sources(srcs, sh);
if (!checkp)
srcs[count] = NULL;
@@ -3213,6 +3263,9 @@ static void handle_stripe_expansion(stru
release_stripe(sh2);
continue;
}
+ if (test_and_clear_bit(R5_WantZeroFill, &sh->dev[i].flags))
+ memset(page_address(sh->dev[i].page),
+ 0, STRIPE_SIZE);
/* place all the copies on one channel */
init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
Index: linux/drivers/md/raid5.h
===================================================================
--- linux.orig/drivers/md/raid5.h 2012-08-10 07:38:49.000000000 +0800
+++ linux/drivers/md/raid5.h 2012-08-10 07:40:02.544779350 +0800
@@ -299,6 +299,7 @@ enum r5dev_flags {
* data in, and now is a good time to write it out.
*/
R5_Discard, /* Discard the stripe */
+ R5_WantZeroFill, /* should be zero filled before read */
};
/*
* Re: [patch 6/7 v2] MD: raid5 trim support
2012-08-10 2:51 ` [patch 6/7 v2] MD: raid5 trim support Shaohua Li
@ 2012-08-13 1:50 ` NeilBrown
2012-08-13 2:04 ` Shaohua Li
0 siblings, 1 reply; 19+ messages in thread
From: NeilBrown @ 2012-08-13 1:50 UTC (permalink / raw)
To: Shaohua Li; +Cc: linux-raid, axboe, shli
On Fri, 10 Aug 2012 10:51:19 +0800 Shaohua Li <shli@fusionio.com> wrote:
> @@ -4094,6 +4159,19 @@ static void make_request(struct mddev *m
> bi->bi_next = NULL;
> bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
>
> + /* block layer doesn't correctly do alignment even we set correct alignment */
> + if (unlikely(bi->bi_rw & REQ_DISCARD)) {
> + int stripe_sectors = conf->chunk_sectors *
> + (conf->raid_disks - conf->max_degraded);
This isn't right when an array is being reshaped.
I suspect that during a reshape we should only attempt DISCARD on the part of
the array which has already been reshaped. On the other section we can
either fail the discard (is that a good idea?) or write zeros.
> +
> + logical_sector = (logical_sector + stripe_sectors - 1);
> + sector_div(logical_sector, stripe_sectors);
This would look better with DIV_ROUND_UP_SECTOR_T().
> + sector_div(last_sector, stripe_sectors);
> +
> + logical_sector *= stripe_sectors;
> + last_sector *= stripe_sectors;
> + }
> +
> for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
> DEFINE_WAIT(w);
> int previous;
> @@ -4114,6 +4192,11 @@ static void make_request(struct mddev *m
> if (mddev->reshape_backwards
> ? logical_sector < conf->reshape_progress
> : logical_sector >= conf->reshape_progress) {
> + /* The stripe will be reshaped soon, ignore it */
> + if (bi->bi_rw & REQ_DISCARD) {
> + spin_unlock_irq(&conf->device_lock);
> + goto next_stripe;
> + }
> previous = 1;
> } else {
> if (mddev->reshape_backwards
> @@ -4202,6 +4285,12 @@ static void make_request(struct mddev *m
> finish_wait(&conf->wait_for_overlap, &w);
> break;
> }
> +next_stripe:
> + /* For discard, we always discard one stripe */
> + if (unlikely((bi->bi_rw & REQ_DISCARD) &&
> + !((logical_sector + STRIPE_SECTORS) % conf->chunk_sectors)))
You are using '%' on a sector_t value. That isn't right; you need to use
sector_div().
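For readers unfamiliar with the constraint: sector_t can be a 64-bit type on
32-bit builds, where a plain '/' or '%' makes the compiler emit a call to a
64-bit division helper (__udivdi3/__umoddi3) that the kernel does not
provide. sector_div(n, base) divides n in place and returns the remainder. A
minimal before/after using the expression from the patch:

	/* broken on 32-bit: '%' on a 64-bit sector_t */
	if (!((logical_sector + STRIPE_SECTORS) % conf->chunk_sectors))
		...

	/* ok: sector_div() divides tmp in place and returns the remainder */
	sector_t tmp = logical_sector + STRIPE_SECTORS;
	if (!sector_div(tmp, conf->chunk_sectors))
		...

This is exactly the tmp/sector_div() form the v3 repost below adopts.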
Thanks,
NeilBrown
* Re: [patch 0/7 v2] MD linear/0/1/10/5 TRIM support
2012-08-10 2:51 [patch 0/7 v2] MD linear/0/1/10/5 TRIM support Shaohua Li
` (6 preceding siblings ...)
2012-08-10 2:51 ` [patch 7/7 v2] MD: raid5 avoid unnecessary zero page for trim Shaohua Li
@ 2012-08-13 1:51 ` NeilBrown
2012-08-29 18:58 ` Holger Kiehl
8 siblings, 0 replies; 19+ messages in thread
From: NeilBrown @ 2012-08-13 1:51 UTC (permalink / raw)
To: Shaohua Li; +Cc: linux-raid, axboe, shli
On Fri, 10 Aug 2012 10:51:13 +0800 Shaohua Li <shli@fusionio.com> wrote:
> This is the patchset to support MD discard, refreshed against the latest
> kernel. It's pretty straightforward for raid linear/0/1/10. The raid456
> discard support is tricky; please see the logs in the patches for details.
>
> Previously when I posted these patches, people complained about SATA SSD
> discard request merge errors. A patch to disable discard request merging
> is now in Jens's tree, so we don't have this issue any more.
I've applied 1 to 5 (linear, raid0, raid1, raid10), though '1' will probably
need to go through Jens' tree.
The RAID5 changes need a bit more work as described in a separate email.
Thanks,
NeilBrown
* Re: [patch 6/7 v2] MD: raid5 trim support
2012-08-13 1:50 ` NeilBrown
@ 2012-08-13 2:04 ` Shaohua Li
2012-08-13 3:58 ` NeilBrown
0 siblings, 1 reply; 19+ messages in thread
From: Shaohua Li @ 2012-08-13 2:04 UTC (permalink / raw)
To: NeilBrown; +Cc: Shaohua Li, linux-raid, axboe
On Mon, Aug 13, 2012 at 11:50:51AM +1000, NeilBrown wrote:
> On Fri, 10 Aug 2012 10:51:19 +0800 Shaohua Li <shli@fusionio.com> wrote:
>
> > @@ -4094,6 +4159,19 @@ static void make_request(struct mddev *m
> > bi->bi_next = NULL;
> > bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
> >
> > + /* block layer doesn't correctly do alignment even we set correct alignment */
> > + if (unlikely(bi->bi_rw & REQ_DISCARD)) {
> > + int stripe_sectors = conf->chunk_sectors *
> > + (conf->raid_disks - conf->max_degraded);
>
> This isn't right when an array is being reshaped.
> I suspect that during a reshape we should only attempt DISCARD on the part of
> the array which has already been reshaped. On the other section we can
> either fail the discard (is that a good idea?) or write zeros.
I have a check in the for-loop below for the reshape case; is it enough? If
not, I'd rather just ignore discard requests during a reshape. We force
discard_zeroes_data to be 0, so it should be ok.

I'll fix the other two issues and repost the raid5 discard patches later.
Thanks,
Shaohua
* Re: [patch 6/7 v2] MD: raid5 trim support
2012-08-13 2:04 ` Shaohua Li
@ 2012-08-13 3:58 ` NeilBrown
2012-08-13 5:43 ` Shaohua Li
0 siblings, 1 reply; 19+ messages in thread
From: NeilBrown @ 2012-08-13 3:58 UTC (permalink / raw)
To: Shaohua Li; +Cc: Shaohua Li, linux-raid, axboe
On Mon, 13 Aug 2012 10:04:54 +0800 Shaohua Li <shli@kernel.org> wrote:
> On Mon, Aug 13, 2012 at 11:50:51AM +1000, NeilBrown wrote:
> > On Fri, 10 Aug 2012 10:51:19 +0800 Shaohua Li <shli@fusionio.com> wrote:
> >
> > > @@ -4094,6 +4159,19 @@ static void make_request(struct mddev *m
> > > bi->bi_next = NULL;
> > > bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
> > >
> > > + /* block layer doesn't correctly do alignment even we set correct alignment */
> > > + if (unlikely(bi->bi_rw & REQ_DISCARD)) {
> > > + int stripe_sectors = conf->chunk_sectors *
> > > + (conf->raid_disks - conf->max_degraded);
> >
> > This isn't right when an array is being reshaped.
> > I suspect that during a reshape we should only attempt DISCARD on the part of
> > the array which has already been reshaped. On the other section we can
> > either fail the discard (is that a good idea?) or write zeros.
>
> I have a check in the for-loop below for the reshape case; is it enough? If
> not, I'd rather just ignore discard requests during a reshape. We force
> discard_zeroes_data to be 0, so it should be ok.
Yes, you are right - that is sufficient. I hadn't read it properly.
>
> I'll fix the other two issues and repost the raid5 discard patches later.
thanks,
NeilBrown
* Re: [patch 6/7 v2] MD: raid5 trim support
2012-08-13 3:58 ` NeilBrown
@ 2012-08-13 5:43 ` Shaohua Li
2012-09-11 4:10 ` NeilBrown
0 siblings, 1 reply; 19+ messages in thread
From: Shaohua Li @ 2012-08-13 5:43 UTC (permalink / raw)
To: NeilBrown; +Cc: Shaohua Li, linux-raid, axboe
On Mon, Aug 13, 2012 at 01:58:31PM +1000, NeilBrown wrote:
> On Mon, 13 Aug 2012 10:04:54 +0800 Shaohua Li <shli@kernel.org> wrote:
>
> > On Mon, Aug 13, 2012 at 11:50:51AM +1000, NeilBrown wrote:
> > > On Fri, 10 Aug 2012 10:51:19 +0800 Shaohua Li <shli@fusionio.com> wrote:
> > >
> > > > @@ -4094,6 +4159,19 @@ static void make_request(struct mddev *m
> > > > bi->bi_next = NULL;
> > > > bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
> > > >
> > > > + /* block layer doesn't correctly do alignment even we set correct alignment */
> > > > + if (unlikely(bi->bi_rw & REQ_DISCARD)) {
> > > > + int stripe_sectors = conf->chunk_sectors *
> > > > + (conf->raid_disks - conf->max_degraded);
> > >
> > > This isn't right when an array is being reshaped.
> > > I suspect that during a reshape we should only attempt DISCARD on the part of
> > > the array which has already been reshaped. On the other section we can
> > > either fail the discard (is that a good idea?) or write zeros.
> >
> > I have a check in the for-loop below for the reshape case; is it enough?
> > If not, I'd rather just ignore discard requests during a reshape. We
> > force discard_zeroes_data to be 0, so it should be ok.
>
> Yes, you are right - that is sufficient. I hadn't read it properly.
>
> >
> > I'll fix the other two issues and repost the raid5 discard patches later.
Here is the updated version. The last patch still applies cleanly on top.
Subject: MD: raid5 trim support
Discard for raid4/5/6 has limitations. If the discard request size is small,
we do the discard for one disk, but we then need to recalculate parity and
write the parity disk. To calculate parity correctly, zero_after_discard must
be guaranteed. Even when it is, we would be discarding one disk while writing
the others, which makes the parity disks wear out fast, and that doesn't make
sense. So an efficient discard for raid4/5/6 should discard all data disks
and parity disks together, which requires the write pattern to be
(A, A+chunk_size, A+chunk_size*2, ...). If A's size is smaller than
chunk_size, such a pattern is almost impossible in practice. So in this patch
I only handle the case where A's size equals chunk_size: the discard request
must be aligned to the stripe size and its size must be a multiple of the
stripe size.

Since we can only handle requests with this specific alignment and size (or
the part of a request that fits whole stripes), we can't guarantee
zero_after_discard even if zero_after_discard is true in the low-level
drives.

The block layer doesn't send down correctly aligned requests even when the
correct discard alignment is set, so I must filter them out here.

For raid4/5/6 parity calculation, if the data is 0, the parity is 0. So if
zero_after_discard is true for all disks, the data is consistent after a
discard. Otherwise, data might be lost. Consider this scenario: discard a
stripe, then write data to one disk and write the parity disk. Until then the
stripe can still be inconsistent, depending on whether data from the other
data disks or from the parity disk is used to calculate the new parity. If
that disk then breaks, we can't restore its data. So in this patch we only
enable discard support if all disks have zero_after_discard.

If a discard fails on one disk, we face a similar inconsistency issue. The
patch makes discard follow the same path as a normal write request: if the
discard fails, a resync is scheduled to make the data consistent again.
Having extra writes isn't good, but data consistency is important.

If a subsequent read/write request hits the raid5 cache of a discarded
stripe, the discarded dev pages should be zero-filled so that the data stays
consistent. This patch always zeroes the dev pages of a discarded request's
stripes. That isn't optimal because a discard request doesn't need such a
payload; the next patch avoids it.
Signed-off-by: Shaohua Li <shli@fusionio.com>
---
drivers/md/raid5.c | 135 +++++++++++++++++++++++++++++++++++++++++++++++++++--
drivers/md/raid5.h | 1
2 files changed, 132 insertions(+), 4 deletions(-)
Index: linux/drivers/md/raid5.c
===================================================================
--- linux.orig/drivers/md/raid5.c 2012-08-13 10:25:22.325095834 +0800
+++ linux/drivers/md/raid5.c 2012-08-13 12:01:28.628603083 +0800
@@ -547,6 +547,8 @@ static void ops_run_io(struct stripe_hea
rw = WRITE_FUA;
else
rw = WRITE;
+ if (test_and_clear_bit(R5_Discard, &sh->dev[i].flags))
+ rw |= REQ_DISCARD;
} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
rw = READ;
else if (test_and_clear_bit(R5_WantReplace,
@@ -1170,8 +1172,13 @@ ops_run_biodrain(struct stripe_head *sh,
set_bit(R5_WantFUA, &dev->flags);
if (wbi->bi_rw & REQ_SYNC)
set_bit(R5_SyncIO, &dev->flags);
- tx = async_copy_data(1, wbi, dev->page,
- dev->sector, tx);
+ if (wbi->bi_rw & REQ_DISCARD) {
+ memset(page_address(dev->page), 0,
+ STRIPE_SECTORS << 9);
+ set_bit(R5_Discard, &dev->flags);
+ } else
+ tx = async_copy_data(1, wbi, dev->page,
+ dev->sector, tx);
wbi = r5_next_bio(wbi, dev->sector);
}
}
@@ -1237,6 +1244,20 @@ ops_run_reconstruct5(struct stripe_head
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
+ for (i = 0; i < sh->disks; i++) {
+ if (pd_idx == i)
+ continue;
+ if (!test_bit(R5_Discard, &sh->dev[i].flags))
+ break;
+ }
+ if (i >= sh->disks) {
+ atomic_inc(&sh->count);
+ memset(page_address(sh->dev[pd_idx].page), 0,
+ STRIPE_SECTORS << 9);
+ set_bit(R5_Discard, &sh->dev[pd_idx].flags);
+ ops_complete_reconstruct(sh);
+ return;
+ }
/* check if prexor is active which means only process blocks
* that are part of a read-modify-write (written)
*/
@@ -1281,10 +1302,28 @@ ops_run_reconstruct6(struct stripe_head
{
struct async_submit_ctl submit;
struct page **blocks = percpu->scribble;
- int count;
+ int count, i;
pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
+ for (i = 0; i < sh->disks; i++) {
+ if (sh->pd_idx == i || sh->qd_idx == i)
+ continue;
+ if (!test_bit(R5_Discard, &sh->dev[i].flags))
+ break;
+ }
+ if (i >= sh->disks) {
+ atomic_inc(&sh->count);
+ memset(page_address(sh->dev[sh->pd_idx].page), 0,
+ STRIPE_SECTORS << 9);
+ memset(page_address(sh->dev[sh->qd_idx].page), 0,
+ STRIPE_SECTORS << 9);
+ set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
+ set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
+ ops_complete_reconstruct(sh);
+ return;
+ }
+
count = set_syndrome_sources(blocks, sh);
atomic_inc(&sh->count);
@@ -2353,7 +2392,7 @@ schedule_reconstruction(struct stripe_he
*/
static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
{
- struct bio **bip;
+ struct bio **bip, *orig_bi = bi;
struct r5conf *conf = sh->raid_conf;
int firstwrite=0;
@@ -2370,6 +2409,23 @@ static int add_stripe_bio(struct stripe_
* protect it.
*/
spin_lock_irq(&sh->stripe_lock);
+
+ if (bi->bi_rw & REQ_DISCARD) {
+ int i;
+ dd_idx = -1;
+ for (i = 0; i < sh->disks; i++) {
+ if (i == sh->pd_idx || i == sh->qd_idx)
+ continue;
+ if (dd_idx == -1)
+ dd_idx = i;
+ if (sh->dev[i].towrite) {
+ dd_idx = i;
+ goto overlap;
+ }
+ }
+ }
+
+again:
if (forwrite) {
bip = &sh->dev[dd_idx].towrite;
if (*bip == NULL)
@@ -2403,6 +2459,15 @@ static int add_stripe_bio(struct stripe_
if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
}
+
+ bi = orig_bi;
+ if (bi->bi_rw & REQ_DISCARD) {
+ dd_idx++;
+ while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
+ dd_idx++;
+ if (dd_idx < sh->disks)
+ goto again;
+ }
spin_unlock_irq(&sh->stripe_lock);
pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
@@ -4091,12 +4156,26 @@ static void make_request(struct mddev *m
logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
last_sector = bi->bi_sector + (bi->bi_size>>9);
+
+ /* block layer doesn't align discards correctly even when we set the alignment */
+ if (unlikely(bi->bi_rw & REQ_DISCARD)) {
+ int stripe_sectors = conf->chunk_sectors *
+ (conf->raid_disks - conf->max_degraded);
+ logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
+ stripe_sectors);
+ sector_div(last_sector, stripe_sectors);
+
+ logical_sector *= stripe_sectors;
+ last_sector *= stripe_sectors;
+ }
+
bi->bi_next = NULL;
bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
DEFINE_WAIT(w);
int previous;
+ sector_t tmp;
retry:
previous = 0;
@@ -4114,6 +4193,11 @@ static void make_request(struct mddev *m
if (mddev->reshape_backwards
? logical_sector < conf->reshape_progress
: logical_sector >= conf->reshape_progress) {
+ /* The stripe will be reshaped soon, ignore it */
+ if (bi->bi_rw & REQ_DISCARD) {
+ spin_unlock_irq(&conf->device_lock);
+ goto next_stripe;
+ }
previous = 1;
} else {
if (mddev->reshape_backwards
@@ -4202,6 +4286,13 @@ static void make_request(struct mddev *m
finish_wait(&conf->wait_for_overlap, &w);
break;
}
+next_stripe:
+ /* For discard, we always discard one stripe */
+ tmp = logical_sector + STRIPE_SECTORS;
+ if (unlikely((bi->bi_rw & REQ_DISCARD) &&
+ !sector_div(tmp, conf->chunk_sectors)))
+ logical_sector += conf->chunk_sectors *
+ (conf->raid_disks - conf->max_degraded - 1);
}
remaining = raid5_dec_bi_active_stripes(bi);
@@ -5361,6 +5452,7 @@ static int run(struct mddev *mddev)
if (mddev->queue) {
int chunk_size;
+ bool discard_supported = true;
/* read-ahead size must cover two whole stripes, which
* is 2 * (datadisks) * chunksize where 'n' is the
* number of raid devices
@@ -5380,13 +5472,48 @@ static int run(struct mddev *mddev)
blk_queue_io_min(mddev->queue, chunk_size);
blk_queue_io_opt(mddev->queue, chunk_size *
(conf->raid_disks - conf->max_degraded));
+ /*
+ * We can only discard a whole stripe. It doesn't make sense to
+ * discard data disk but write parity disk
+ */
+ stripe = stripe * PAGE_SIZE;
+ mddev->queue->limits.discard_alignment = stripe;
+ mddev->queue->limits.discard_granularity = stripe;
+ /*
+ * unaligned part of discard request will be ignored, so can't
+ * guarantee discard_zeroes_data
+ */
+ mddev->queue->limits.discard_zeroes_data = 0;
rdev_for_each(rdev, mddev) {
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->new_data_offset << 9);
+ /*
+ * discard_zeroes_data is required, otherwise data
+ * could be lost. Consider a scenario: discard a stripe
+ * (the stripe could be inconsistent if
+ * discard_zeroes_data is 0); write one disk of the
+ * stripe (the stripe could be inconsistent again
+ * depending on which disks are used to calculate
+ * parity); the disk is broken; The stripe data of this
+ * disk is lost.
+ */
+ if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) ||
+ !bdev_get_queue(rdev->bdev)->
+ limits.discard_zeroes_data)
+ discard_supported = false;
}
+
+ if (discard_supported &&
+ mddev->queue->limits.max_discard_sectors >= stripe &&
+ mddev->queue->limits.discard_granularity >= stripe)
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
+ mddev->queue);
+ else
+ queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
+ mddev->queue);
}
return 0;
Index: linux/drivers/md/raid5.h
===================================================================
--- linux.orig/drivers/md/raid5.h 2012-08-13 10:25:22.337095684 +0800
+++ linux/drivers/md/raid5.h 2012-08-13 11:58:44.082670917 +0800
@@ -298,6 +298,7 @@ enum r5dev_flags {
R5_WantReplace, /* We need to update the replacement, we have read
* data in, and now is a good time to write it out.
*/
+ R5_Discard, /* Discard the stripe */
};
/*
* Re: [patch 0/7 v2] MD linear/0/1/10/5 TRIM support
2012-08-10 2:51 [patch 0/7 v2] MD linear/0/1/10/5 TRIM support Shaohua Li
` (7 preceding siblings ...)
2012-08-13 1:51 ` [patch 0/7 v2] MD linear/0/1/10/5 TRIM support NeilBrown
@ 2012-08-29 18:58 ` Holger Kiehl
2012-08-29 20:19 ` Martin K. Petersen
2012-08-30 0:45 ` Shaohua Li
8 siblings, 2 replies; 19+ messages in thread
From: Holger Kiehl @ 2012-08-29 18:58 UTC (permalink / raw)
To: Shaohua Li; +Cc: linux-raid, neilb, axboe, shli
Hello,
On Fri, 10 Aug 2012, Shaohua Li wrote:
> This is the patchset to support MD discard, refreshed against the latest
> kernel. It's pretty straightforward for raid linear/0/1/10. The raid456
> discard support is tricky; please see the logs in the patches for details.
>
> Previously when I posted these patches, people complained about SATA SSD
> discard request merge errors. A patch to disable discard request merging
> is now in Jens's tree, so we don't have this issue any more.
>
First, many thanks for sending those updated patches. And also many thanks
to Neil for still maintaining MD for Linux! Thank you for all the great
work!

I have tried the raid linear/0/1/10 patches on 3.6.0-rc3 and I sometimes
get this in /var/log/messages:
Aug 27 21:44:53 yoda kernel: [10576.964134] request botched: dev sda: type=1, flags=916c081
Aug 27 21:44:53 yoda kernel: [10576.964138] sector 316197889, nr/cnr 0/256
Aug 27 21:44:53 yoda kernel: [10576.964140] bio ffff8803741b8600, biotail ffff8803741b8600, buffer (null), len 0
Aug 27 21:44:53 yoda kernel: [10576.971023] request botched: dev sdb: type=1, flags=917c081
Aug 27 21:44:53 yoda kernel: [10576.971026] sector 320574465, nr/cnr 0/384
Aug 27 21:44:53 yoda kernel: [10576.971029] bio ffff8803741b9e00, biotail ffff8803741b9e00, buffer (null), len 0
Aug 27 21:44:53 yoda kernel: [10576.972736] request botched: dev sdb: type=1, flags=916c081
Aug 27 21:44:53 yoda kernel: [10576.972740] sector 320575489, nr/cnr 0/288
Aug 27 21:44:53 yoda kernel: [10576.972742] bio ffff8803ff80e840, biotail ffff8803ff80e840, buffer (null), len 0
Aug 27 21:44:53 yoda kernel: [10576.973851] request botched: dev sda: type=1, flags=917c081
Aug 27 21:44:53 yoda kernel: [10576.973854] sector 320575105, nr/cnr 0/640
Aug 27 21:44:53 yoda kernel: [10576.973857] bio ffff8803741b9e80, biotail ffff8803741b9e80, buffer (null), len 0
This happened on a RAID0 when I deleted lots of files (make clean in a
kernel directory). The RAID0 is across two SSDs:
/dev/sda Crucial M4-CT512M4SSD2 Firmware Revision: 000F
/dev/sdb Crucial M4-CT256M4SSD2 Firmware Revision: 000F
cat /proc/mounts for the partition looks as follows:
/dev/md1 / ext4 rw,noatime,discard,journal_checksum,journal_async_commit,commit=600,stripe=256,data=ordered 0 0
And here is cat /proc/mdstat:
Personalities : [raid0]
md1 : active raid0 sdb3[1] sda3[0]
413851648 blocks super 1.2 512k chunks
unused devices: <none>
However, when I apply this patch, which Shaohua Li sent to me on 13 March
(http://lkml.indiana.edu/hypermail/linux/kernel/1203.1/03323.html), these
messages do NOT show up:
--- linux-3.6-rc3/include/linux/blkdev.h.original 2012-08-22 22:29:06.000000000 +0200
+++ linux-3.6-rc3/include/linux/blkdev.h 2012-08-27 21:42:19.979566802 +0200
@@ -601,7 +601,7 @@
* it already be started by driver.
*/
#define RQ_NOMERGE_FLAGS \
- (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA)
+ (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA | REQ_DISCARD)
#define rq_mergeable(rq) \
(!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && \
(((rq)->cmd_flags & REQ_DISCARD) || \
Regards,
Holger
* Re: [patch 0/7 v2] MD linear/0/1/10/5 TRIM support
2012-08-29 18:58 ` Holger Kiehl
@ 2012-08-29 20:19 ` Martin K. Petersen
2012-08-30 0:45 ` Shaohua Li
1 sibling, 0 replies; 19+ messages in thread
From: Martin K. Petersen @ 2012-08-29 20:19 UTC (permalink / raw)
To: Holger Kiehl; +Cc: Shaohua Li, linux-raid, neilb, axboe, shli
>>>>> "Holger" == Holger Kiehl <Holger.Kiehl@dwd.de> writes:
>> Previously when I posted the patches out, people complain SATA SSD
>> discard request merge error. Now a patch to disable discard request
>> merge is in Jens's tree. We don't have this issue any more.
And fwiw I have been working on discard/write same merging the last
couple of days. Almost there...
--
Martin K. Petersen Oracle Linux Engineering
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [patch 0/7 v2] MD linear/0/1/10/5 TRIM support
2012-08-29 18:58 ` Holger Kiehl
2012-08-29 20:19 ` Martin K. Petersen
@ 2012-08-30 0:45 ` Shaohua Li
1 sibling, 0 replies; 19+ messages in thread
From: Shaohua Li @ 2012-08-30 0:45 UTC (permalink / raw)
To: Holger Kiehl; +Cc: linux-raid, neilb, axboe
2012/8/30 Holger Kiehl <Holger.Kiehl@dwd.de>:
> Hello,
>
>
> On Fri, 10 Aug 2012, Shaohua Li wrote:
>
>> This is the patchset to support MD discard, which I refresh against latest
>> kernel. It's pretty straightforward for raid linear/0/1/10. The raid456
>> discard
>> support is tricky, please see the log in the patches for details.
>>
>> Previously when I posted the patches out, people complain SATA SSD discard
>> request merge error. Now a patch to disable discard request merge is in
>> Jens's
>> tree. We don't have this issue any more.
>>
> First, many thanks for sending those updated patches. And also many thanks
> to Neil for still maintaining MD for Linux! Thank you for all the great
> work!
>
> I have tried the raid linear/0/1/10 patches on 3.6.0-rc3 and I sometimes
> get this in /var/log/messages:
>
> Aug 27 21:44:53 yoda kernel: [10576.964134] request botched: dev sda:
> type=1, flags=916c081
> Aug 27 21:44:53 yoda kernel: [10576.964138] sector 316197889, nr/cnr
> 0/256
> Aug 27 21:44:53 yoda kernel: [10576.964140] bio ffff8803741b8600,
> biotail ffff8803741b8600, buffer (null), len 0
> Aug 27 21:44:53 yoda kernel: [10576.971023] request botched: dev sdb:
> type=1, flags=917c081
> Aug 27 21:44:53 yoda kernel: [10576.971026] sector 320574465, nr/cnr
> 0/384
> Aug 27 21:44:53 yoda kernel: [10576.971029] bio ffff8803741b9e00,
> biotail ffff8803741b9e00, buffer (null), len 0
> Aug 27 21:44:53 yoda kernel: [10576.972736] request botched: dev sdb:
> type=1, flags=916c081
> Aug 27 21:44:53 yoda kernel: [10576.972740] sector 320575489, nr/cnr
> 0/288
> Aug 27 21:44:53 yoda kernel: [10576.972742] bio ffff8803ff80e840,
> biotail ffff8803ff80e840, buffer (null), len 0
> Aug 27 21:44:53 yoda kernel: [10576.973851] request botched: dev sda:
> type=1, flags=917c081
> Aug 27 21:44:53 yoda kernel: [10576.973854] sector 320575105, nr/cnr
> 0/640
> Aug 27 21:44:53 yoda kernel: [10576.973857] bio ffff8803741b9e80,
> biotail ffff8803741b9e80, buffer (null), len 0
>
> This happened on a RAID0 when I deleted lots of files (a make clean in the
> kernel source directory). The RAID0 is across two SSDs:
>
> /dev/sda Crucial M4-CT512M4SSD2 Firmware Revision: 000F
> /dev/sdb Crucial M4-CT256M4SSD2 Firmware Revision: 000F
>
> cat /proc/mounts for the partition looks as follows:
>
> /dev/md1 / ext4
> rw,noatime,discard,journal_checksum,journal_async_commit,commit=600,stripe=256,data=ordered
> 0 0
>
> And here cat /proc/mdstat:
>
> Personalities : [raid0]
> md1 : active raid0 sdb3[1] sda3[0]
> 413851648 blocks super 1.2 512k chunks
>
> unused devices: <none>
>
> However, when I apply this patch, which Shaohua Li sent to me on 13th March
> (http://lkml.indiana.edu/hypermail/linux/kernel/1203.1/03323.html), these
> messages do NOT show up:
>
> --- linux-3.6-rc3/include/linux/blkdev.h.original 2012-08-22
> 22:29:06.000000000 +0200
> +++ linux-3.6-rc3/include/linux/blkdev.h 2012-08-27
> 21:42:19.979566802 +0200
> @@ -601,7 +601,7 @@
> * it already be started by driver.
> */
> #define RQ_NOMERGE_FLAGS \
> - (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA)
> + (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA |
> REQ_DISCARD)
> #define rq_mergeable(rq) \
> (!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && \
> (((rq)->cmd_flags & REQ_DISCARD) || \
>
Yes, this patch is in Jens's tree. We disabled discard request merging
temporarily until Martin's discard request merging work is ready.
Thanks,
Shaohua
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [patch 6/7 v2] MD: raid5 trim support
2012-08-13 5:43 ` Shaohua Li
@ 2012-09-11 4:10 ` NeilBrown
2012-09-12 4:09 ` Shaohua Li
0 siblings, 1 reply; 19+ messages in thread
From: NeilBrown @ 2012-09-11 4:10 UTC (permalink / raw)
To: Shaohua Li; +Cc: Shaohua Li, linux-raid, axboe
[-- Attachment #1: Type: text/plain, Size: 9216 bytes --]
On Mon, 13 Aug 2012 13:43:47 +0800 Shaohua Li <shli@kernel.org> wrote:
> On Mon, Aug 13, 2012 at 01:58:31PM +1000, NeilBrown wrote:
> > On Mon, 13 Aug 2012 10:04:54 +0800 Shaohua Li <shli@kernel.org> wrote:
> >
> > > On Mon, Aug 13, 2012 at 11:50:51AM +1000, NeilBrown wrote:
> > > > On Fri, 10 Aug 2012 10:51:19 +0800 Shaohua Li <shli@fusionio.com> wrote:
> > > >
> > > > > @@ -4094,6 +4159,19 @@ static void make_request(struct mddev *m
> > > > > bi->bi_next = NULL;
> > > > > bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
> > > > >
> > > > > + /* block layer doesn't correctly do alignment even if we set correct alignment */
> > > > > + if (unlikely(bi->bi_rw & REQ_DISCARD)) {
> > > > > + int stripe_sectors = conf->chunk_sectors *
> > > > > + (conf->raid_disks - conf->max_degraded);
> > > >
> > > > This isn't right when an array is being reshaped.
> > > > I suspect that during a reshape we should only attempt DISCARD on the part of
> > > > the array which has already been reshaped. On the other section we can
> > > > either fail the discard (is that a good idea?) or write zeros.
> > >
> > > I had a check in the for-loop below for reshape; is it enough? If not, I'd
> > > just ignore discard requests during reshape. We force discard_zeroes_data to
> > > be 0, so it should be ok.
> >
> > Yes, you are right - that is sufficient. I hadn't read it properly.
> >
> > >
> > > I'll fix the other two issues and repost the raid5 discard patches later.
>
> Here is the updated version. The last patch can still be applied cleanly.
>
>
> Subject: MD: raid5 trim support
>
> Discard for raid4/5/6 has limitations. If the discard request size is small,
> we do a discard for one disk, but we need to calculate parity and write the
> parity disk. To correctly calculate parity, zero_after_discard must be
> guaranteed. Even if it's true, we would discard one disk but write the other
> disks, which makes the parity disks wear out fast. This doesn't make sense.
> So an efficient discard for raid4/5/6 should discard all data disks and
> parity disks, which requires the write pattern to be (A, A+chunk_size,
> A+chunk_size*2...). If A's size is smaller than chunk_size, such a pattern is
> almost impossible in practice. So in this patch, I only handle the case where
> A's size equals chunk_size. That is, the discard request should be aligned to
> the stripe size and its size should be a multiple of the stripe size.
>
> Since we can only handle requests with a specific alignment and size (or the
> part of a request fitting whole stripes), we can't guarantee
> zero_after_discard even if zero_after_discard is true in the low-level
> drives.
>
> The block layer doesn't send down correctly aligned requests even when the
> correct discard alignment is set, so I must filter them out.
>
> For raid4/5/6 parity calculation, if the data is 0, the parity is 0. So if
> zero_after_discard is true for all disks, the data is consistent after a
> discard. Otherwise, data might be lost. Let's consider a scenario: discard a
> stripe, then write data to one disk and write the parity disk. The stripe
> could still be inconsistent at that point, depending on whether data from the
> other data disks or the parity disks is used to calculate the new parity. If
> the disk is broken, we can't restore it. So in this patch, we only enable
> discard support if all disks have zero_after_discard.
>
> If a discard fails on one disk, we face the same inconsistency issue as
> above. The patch makes discard follow the same path as a normal write
> request: if a discard fails, a resync is scheduled to make the data
> consistent. It isn't good to have extra writes, but data consistency is
> important.
>
> If a subsequent read/write request hits the raid5 cache of a discarded
> stripe, the discarded dev page should be zero-filled, so the data stays
> consistent. This patch always zeroes the dev page for a discarded stripe.
> This isn't optimal because a discard request doesn't need such a payload;
> the next patch avoids it.
>
> Signed-off-by: Shaohua Li <shli@fusionio.com>
> ---
> drivers/md/raid5.c | 135 +++++++++++++++++++++++++++++++++++++++++++++++++++--
> drivers/md/raid5.h | 1
> 2 files changed, 132 insertions(+), 4 deletions(-)
>
> Index: linux/drivers/md/raid5.c
> ===================================================================
> --- linux.orig/drivers/md/raid5.c 2012-08-13 10:25:22.325095834 +0800
> +++ linux/drivers/md/raid5.c 2012-08-13 12:01:28.628603083 +0800
> @@ -547,6 +547,8 @@ static void ops_run_io(struct stripe_hea
> rw = WRITE_FUA;
> else
> rw = WRITE;
> + if (test_and_clear_bit(R5_Discard, &sh->dev[i].flags))
> + rw |= REQ_DISCARD;
> } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
> rw = READ;
> else if (test_and_clear_bit(R5_WantReplace,
> @@ -1170,8 +1172,13 @@ ops_run_biodrain(struct stripe_head *sh,
> set_bit(R5_WantFUA, &dev->flags);
> if (wbi->bi_rw & REQ_SYNC)
> set_bit(R5_SyncIO, &dev->flags);
> - tx = async_copy_data(1, wbi, dev->page,
> - dev->sector, tx);
> + if (wbi->bi_rw & REQ_DISCARD) {
> + memset(page_address(dev->page), 0,
> + STRIPE_SECTORS << 9);
> + set_bit(R5_Discard, &dev->flags);
> + } else
> + tx = async_copy_data(1, wbi, dev->page,
> + dev->sector, tx);
> wbi = r5_next_bio(wbi, dev->sector);
> }
> }
> @@ -1237,6 +1244,20 @@ ops_run_reconstruct5(struct stripe_head
> pr_debug("%s: stripe %llu\n", __func__,
> (unsigned long long)sh->sector);
>
> + for (i = 0; i < sh->disks; i++) {
> + if (pd_idx == i)
> + continue;
> + if (!test_bit(R5_Discard, &sh->dev[i].flags))
> + break;
> + }
> + if (i >= sh->disks) {
> + atomic_inc(&sh->count);
> + memset(page_address(sh->dev[pd_idx].page), 0,
> + STRIPE_SECTORS << 9);
> + set_bit(R5_Discard, &sh->dev[pd_idx].flags);
> + ops_complete_reconstruct(sh);
> + return;
> + }
> /* check if prexor is active which means only process blocks
> * that are part of a read-modify-write (written)
> */
> @@ -1281,10 +1302,28 @@ ops_run_reconstruct6(struct stripe_head
> {
> struct async_submit_ctl submit;
> struct page **blocks = percpu->scribble;
> - int count;
> + int count, i;
>
> pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
>
> + for (i = 0; i < sh->disks; i++) {
> + if (sh->pd_idx == i || sh->qd_idx == i)
> + continue;
> + if (!test_bit(R5_Discard, &sh->dev[i].flags))
> + break;
> + }
> + if (i >= sh->disks) {
> + atomic_inc(&sh->count);
> + memset(page_address(sh->dev[sh->pd_idx].page), 0,
> + STRIPE_SECTORS << 9);
> + memset(page_address(sh->dev[sh->qd_idx].page), 0,
> + STRIPE_SECTORS << 9);
> + set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
> + set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
> + ops_complete_reconstruct(sh);
> + return;
> + }
> +
> count = set_syndrome_sources(blocks, sh);
>
> atomic_inc(&sh->count);
> @@ -2353,7 +2392,7 @@ schedule_reconstruction(struct stripe_he
> */
> static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
> {
> - struct bio **bip;
> + struct bio **bip, *orig_bi = bi;
> struct r5conf *conf = sh->raid_conf;
> int firstwrite=0;
>
> @@ -2370,6 +2409,23 @@ static int add_stripe_bio(struct stripe_
> * protect it.
> */
> spin_lock_irq(&sh->stripe_lock);
> +
> + if (bi->bi_rw & REQ_DISCARD) {
> + int i;
> + dd_idx = -1;
> + for (i = 0; i < sh->disks; i++) {
> + if (i == sh->pd_idx || i == sh->qd_idx)
> + continue;
> + if (dd_idx == -1)
> + dd_idx = i;
> + if (sh->dev[i].towrite) {
> + dd_idx = i;
> + goto overlap;
> + }
> + }
> + }
> +
> +again:
> if (forwrite) {
> bip = &sh->dev[dd_idx].towrite;
> if (*bip == NULL)
> @@ -2403,6 +2459,15 @@ static int add_stripe_bio(struct stripe_
> if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
> set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
> }
> +
> + bi = orig_bi;
> + if (bi->bi_rw & REQ_DISCARD) {
> + dd_idx++;
> + while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
> + dd_idx++;
> + if (dd_idx < sh->disks)
> + goto again;
> + }
I'm afraid there is something else here that I can't make myself happy
with.

You added a new "goto again" loop inside add_stripe_bio, and to compensate
you increase logical_sector in make_request so that it doesn't call
add_stripe_bio so many times.
This means that the control flow between make_request and add_stripe_bio is
very different depending on whether it is a discard or not. That makes the
code harder to understand and easier to break later.

I think it would be better to create a completely separate
"make_trim_request()" which handles the trim/discard case, and leave the
current code as it is.

If/when you do send a patch to do that, please also resend the other raid5
patch which comes after this one. I tend not to keep patches once I've
decided not to apply them immediately. It also reduces the chance of
confusion if you just send whatever you want me to apply.
Thanks,
NeilBrown
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 828 bytes --]
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [patch 6/7 v2] MD: raid5 trim support
2012-09-11 4:10 ` NeilBrown
@ 2012-09-12 4:09 ` Shaohua Li
2012-09-18 4:52 ` NeilBrown
0 siblings, 1 reply; 19+ messages in thread
From: Shaohua Li @ 2012-09-12 4:09 UTC (permalink / raw)
To: NeilBrown; +Cc: Shaohua Li, linux-raid, axboe
On Tue, Sep 11, 2012 at 02:10:08PM +1000, NeilBrown wrote:
> > + bi = orig_bi;
> > + if (bi->bi_rw & REQ_DISCARD) {
> > + dd_idx++;
> > + while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
> > + dd_idx++;
> > + if (dd_idx < sh->disks)
> > + goto again;
> > + }
>
> I'm afraid there is something else here that I can't make myself happy
> with.
>
> You added a new "goto again" loop inside add_stripe_bio, and to compensate
> you increase logical_sector in make_request so that it doesn't call
> add_stripe_bio so many times.
> This means that the control flow between make_request and add_stripe_bio is
> very different depending on whether it is a discard or not. That makes the
> code harder to understand and easier to break later.
>
> I think it would be better to create a completely separate
> "make_trim_request()" which handles the trim/discard case, and leave the
> current code as it is.
>
> If/when you do send a patch to do that, please also resend the other raid5
> patch which comes after this one. I tend not to keep patches once I've
> decided not to apply them immediately. It also reduces the chance of
> confusion if you just send whatever you want me to apply.
I'm afraid there will be a lot of duplicated code doing it this way. How
about the patch below? I made it clearer for discard. If you like it, I'll
send the other patches.
Thanks,
Shaohua
Subject: MD: raid5 trim support
Discard for raid4/5/6 has limitations. If the discard request size is small,
we do a discard for one disk, but we need to calculate parity and write the
parity disk. To correctly calculate parity, zero_after_discard must be
guaranteed. Even if it's true, we would discard one disk but write the other
disks, which makes the parity disks wear out fast. This doesn't make sense.
So an efficient discard for raid4/5/6 should discard all data disks and
parity disks, which requires the write pattern to be (A, A+chunk_size,
A+chunk_size*2...). If A's size is smaller than chunk_size, such a pattern
is almost impossible in practice. So in this patch, I only handle the case
where A's size equals chunk_size. That is, the discard request should be
aligned to the stripe size and its size should be a multiple of the stripe
size.
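
The alignment filtering described here amounts to rounding the start of the
request up, and the end down, to full-stripe boundaries, and ignoring
whatever is left over. A stand-alone sketch of that arithmetic with made-up
geometry (1024-sector chunks, 4 data disks; the numbers are illustrative
only):

#include <stdio.h>

int main(void)
{
	unsigned long long start = 1000, end = 10000;	/* request, in sectors */
	unsigned long long stripe = 1024ULL * 4;	/* chunk * data disks  */
	unsigned long long first, last;

	first = (start + stripe - 1) / stripe * stripe;	/* round start up */
	last = end / stripe * stripe;			/* round end down */

	if (first >= last)
		printf("too small or unaligned: discard is ignored\n");
	else
		printf("discard whole stripes in [%llu, %llu)\n", first, last);
	return 0;
}
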
Since we can only handle requests with a specific alignment and size (or the
part of a request fitting whole stripes), we can't guarantee
zero_after_discard even if zero_after_discard is true in the low-level
drives.

The block layer doesn't send down correctly aligned requests even when the
correct discard alignment is set, so I must filter them out.

For raid4/5/6 parity calculation, if the data is 0, the parity is 0. So if
zero_after_discard is true for all disks, the data is consistent after a
discard. Otherwise, data might be lost. Let's consider a scenario: discard a
stripe, then write data to one disk and write the parity disk. The stripe
could still be inconsistent at that point, depending on whether data from
the other data disks or the parity disks is used to calculate the new
parity. If the disk is broken, we can't restore it. So in this patch, we
only enable discard support if all disks have zero_after_discard.
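
The "if the data is 0, the parity is 0" observation is plain XOR algebra; a
tiny demonstration for the raid5 case (three data disks and an 8-byte block,
both sizes illustrative):

#include <stdio.h>

int main(void)
{
	unsigned char d0[8] = {0}, d1[8] = {0}, d2[8] = {0}, p[8];
	int i, nonzero = 0;

	/* parity over data blocks a discard has (logically) zeroed */
	for (i = 0; i < 8; i++) {
		p[i] = d0[i] ^ d1[i] ^ d2[i];
		nonzero |= p[i];
	}
	printf("parity of an all-zero stripe is %szero\n",
	       nonzero ? "NOT " : "");
	return 0;
}
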
If a discard fails on one disk, we face the same inconsistency issue as
above. The patch makes discard follow the same path as a normal write
request: if a discard fails, a resync is scheduled to make the data
consistent. It isn't good to have extra writes, but data consistency is
important.

If a subsequent read/write request hits the raid5 cache of a discarded
stripe, the discarded dev page should be zero-filled, so the data stays
consistent. This patch always zeroes the dev page for a discarded stripe.
This isn't optimal because a discard request doesn't need such a payload;
the next patch avoids it.
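
Why the cached page must be zeroed: a later read served from the stripe
cache copies out of dev->page, so a stale page would disagree with the
zeroes the drive now returns for the discarded LBAs. A minimal model of this
(illustrative only; it mirrors the memset the patch adds to
ops_run_biodrain):

#include <stdio.h>
#include <string.h>

#define PAGE_SZ 4096

struct dev_model {
	unsigned char page[PAGE_SZ];
};

static void discard_dev(struct dev_model *dev)
{
	/* what ops_run_biodrain does for a REQ_DISCARD bio */
	memset(dev->page, 0, PAGE_SZ);
}

int main(void)
{
	struct dev_model dev;

	memset(dev.page, 0xab, PAGE_SZ);	/* stale cached data */
	discard_dev(&dev);
	/* a read hitting the stripe cache now sees zeroes, matching a
	 * zero_after_discard drive */
	printf("first cached byte after discard: 0x%02x\n", dev.page[0]);
	return 0;
}
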
Signed-off-by: Shaohua Li <shli@fusionio.com>
---
drivers/md/raid5.c | 219 ++++++++++++++++++++++++++++++++++++++++++-----------
drivers/md/raid5.h | 1
2 files changed, 176 insertions(+), 44 deletions(-)
Index: linux/drivers/md/raid5.c
===================================================================
--- linux.orig/drivers/md/raid5.c 2012-09-11 17:36:30.619148566 +0800
+++ linux/drivers/md/raid5.c 2012-09-12 12:03:18.952735349 +0800
@@ -547,6 +547,8 @@ static void ops_run_io(struct stripe_hea
rw = WRITE_FUA;
else
rw = WRITE;
+ if (test_and_clear_bit(R5_Discard, &sh->dev[i].flags))
+ rw |= REQ_DISCARD;
} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
rw = READ;
else if (test_and_clear_bit(R5_WantReplace,
@@ -1170,8 +1172,13 @@ ops_run_biodrain(struct stripe_head *sh,
set_bit(R5_WantFUA, &dev->flags);
if (wbi->bi_rw & REQ_SYNC)
set_bit(R5_SyncIO, &dev->flags);
- tx = async_copy_data(1, wbi, dev->page,
- dev->sector, tx);
+ if (wbi->bi_rw & REQ_DISCARD) {
+ memset(page_address(dev->page), 0,
+ STRIPE_SECTORS << 9);
+ set_bit(R5_Discard, &dev->flags);
+ } else
+ tx = async_copy_data(1, wbi, dev->page,
+ dev->sector, tx);
wbi = r5_next_bio(wbi, dev->sector);
}
}
@@ -1237,6 +1244,20 @@ ops_run_reconstruct5(struct stripe_head
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
+ for (i = 0; i < sh->disks; i++) {
+ if (pd_idx == i)
+ continue;
+ if (!test_bit(R5_Discard, &sh->dev[i].flags))
+ break;
+ }
+ if (i >= sh->disks) {
+ atomic_inc(&sh->count);
+ memset(page_address(sh->dev[pd_idx].page), 0,
+ STRIPE_SECTORS << 9);
+ set_bit(R5_Discard, &sh->dev[pd_idx].flags);
+ ops_complete_reconstruct(sh);
+ return;
+ }
/* check if prexor is active which means only process blocks
* that are part of a read-modify-write (written)
*/
@@ -1281,10 +1302,28 @@ ops_run_reconstruct6(struct stripe_head
{
struct async_submit_ctl submit;
struct page **blocks = percpu->scribble;
- int count;
+ int count, i;
pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
+ for (i = 0; i < sh->disks; i++) {
+ if (sh->pd_idx == i || sh->qd_idx == i)
+ continue;
+ if (!test_bit(R5_Discard, &sh->dev[i].flags))
+ break;
+ }
+ if (i >= sh->disks) {
+ atomic_inc(&sh->count);
+ memset(page_address(sh->dev[sh->pd_idx].page), 0,
+ STRIPE_SECTORS << 9);
+ memset(page_address(sh->dev[sh->qd_idx].page), 0,
+ STRIPE_SECTORS << 9);
+ set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
+ set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
+ ops_complete_reconstruct(sh);
+ return;
+ }
+
count = set_syndrome_sources(blocks, sh);
atomic_inc(&sh->count);
@@ -2351,38 +2390,24 @@ schedule_reconstruction(struct stripe_he
* toread/towrite point to the first in a chain.
* The bi_next chain must be in order.
*/
-static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
+static int __add_stripe_bio(struct stripe_head *sh, struct bio *bi,
+ int dd_idx, int forwrite, int *firstwrite)
{
struct bio **bip;
- struct r5conf *conf = sh->raid_conf;
- int firstwrite=0;
-
- pr_debug("adding bi b#%llu to stripe s#%llu\n",
- (unsigned long long)bi->bi_sector,
- (unsigned long long)sh->sector);
- /*
- * If several bio share a stripe. The bio bi_phys_segments acts as a
- * reference count to avoid race. The reference count should already be
- * increased before this function is called (for example, in
- * make_request()), so other bio sharing this stripe will not free the
- * stripe. If a stripe is owned by one stripe, the stripe lock will
- * protect it.
- */
- spin_lock_irq(&sh->stripe_lock);
if (forwrite) {
bip = &sh->dev[dd_idx].towrite;
if (*bip == NULL)
- firstwrite = 1;
+ *firstwrite |= 1;
} else
bip = &sh->dev[dd_idx].toread;
while (*bip && (*bip)->bi_sector < bi->bi_sector) {
if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
- goto overlap;
+ return 0;
bip = & (*bip)->bi_next;
}
if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9))
- goto overlap;
+ return 0;
BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
if (*bip)
@@ -2403,10 +2428,48 @@ static int add_stripe_bio(struct stripe_
if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
}
+ return 1;
+}
+
+static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
+{
+ struct r5conf *conf = sh->raid_conf;
+ int firstwrite=0;
+
+ pr_debug("adding bi b#%llu to stripe s#%llu\n",
+ (unsigned long long)bi->bi_sector,
+ (unsigned long long)sh->sector);
+
+ /*
+ * If several bio share a stripe. The bio bi_phys_segments acts as a
+ * reference count to avoid race. The reference count should already be
+ * increased before this function is called (for example, in
+ * make_request()), so other bio sharing this stripe will not free the
+ * stripe. If a stripe is owned by one stripe, the stripe lock will
+ * protect it.
+ */
+ spin_lock_irq(&sh->stripe_lock);
+
+ if (!(bi->bi_rw & REQ_DISCARD)) {
+ if (!__add_stripe_bio(sh, bi, dd_idx, forwrite, &firstwrite))
+ goto overlap;
+ } else {
+ /* Discard always handles the whole stripe */
+ for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) {
+ if (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
+ continue;
+ if (sh->dev[dd_idx].towrite)
+ goto overlap;
+ if (!__add_stripe_bio(sh, bi, dd_idx, forwrite,
+ &firstwrite))
+ goto overlap;
+ }
+ }
+
spin_unlock_irq(&sh->stripe_lock);
pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
- (unsigned long long)(*bip)->bi_sector,
+ (unsigned long long)bi->bi_sector,
(unsigned long long)sh->sector, dd_idx);
if (conf->mddev->bitmap && firstwrite) {
@@ -4067,32 +4130,14 @@ static void release_stripe_plug(struct m
release_stripe(sh);
}
-static void make_request(struct mddev *mddev, struct bio * bi)
+static void make_request_range(struct mddev *mddev, struct bio *bi,
+ sector_t logical_sector, sector_t last_sector)
{
struct r5conf *conf = mddev->private;
int dd_idx;
sector_t new_sector;
- sector_t logical_sector, last_sector;
- struct stripe_head *sh;
const int rw = bio_data_dir(bi);
- int remaining;
-
- if (unlikely(bi->bi_rw & REQ_FLUSH)) {
- md_flush_request(mddev, bi);
- return;
- }
-
- md_write_start(mddev, bi);
-
- if (rw == READ &&
- mddev->reshape_position == MaxSector &&
- chunk_aligned_read(mddev,bi))
- return;
-
- logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
- last_sector = bi->bi_sector + (bi->bi_size>>9);
- bi->bi_next = NULL;
- bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
+ struct stripe_head *sh;
for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
DEFINE_WAIT(w);
@@ -4114,6 +4159,11 @@ static void make_request(struct mddev *m
if (mddev->reshape_backwards
? logical_sector < conf->reshape_progress
: logical_sector >= conf->reshape_progress) {
+ /* The stripe will be reshaped soon, ignore it */
+ if (bi->bi_rw & REQ_DISCARD) {
+ spin_unlock_irq(&conf->device_lock);
+ continue;
+ }
previous = 1;
} else {
if (mddev->reshape_backwards
@@ -4203,6 +4253,51 @@ static void make_request(struct mddev *m
break;
}
}
+}
+
+static void make_request(struct mddev *mddev, struct bio * bi)
+{
+ struct r5conf *conf = mddev->private;
+ sector_t logical_sector, last_sector;
+ const int rw = bio_data_dir(bi);
+ int remaining;
+
+ if (unlikely(bi->bi_rw & REQ_FLUSH)) {
+ md_flush_request(mddev, bi);
+ return;
+ }
+
+ md_write_start(mddev, bi);
+
+ if (rw == READ &&
+ mddev->reshape_position == MaxSector &&
+ chunk_aligned_read(mddev,bi))
+ return;
+
+ logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+ last_sector = bi->bi_sector + (bi->bi_size>>9);
+
+ bi->bi_next = NULL;
+ bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
+
+ if (likely(!(bi->bi_rw & REQ_DISCARD))) {
+ make_request_range(mddev, bi, logical_sector, last_sector);
+ } else {
+ int stripe_sectors = conf->chunk_sectors *
+ (conf->raid_disks - conf->max_degraded);
+ logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
+ stripe_sectors);
+ sector_div(last_sector, stripe_sectors);
+
+ logical_sector *= stripe_sectors;
+ last_sector *= stripe_sectors;
+
+ while (logical_sector < last_sector) {
+ make_request_range(mddev, bi, logical_sector,
+ logical_sector + conf->chunk_sectors);
+ logical_sector += stripe_sectors;
+ }
+ }
remaining = raid5_dec_bi_active_stripes(bi);
if (remaining == 0) {
@@ -5361,6 +5456,7 @@ static int run(struct mddev *mddev)
if (mddev->queue) {
int chunk_size;
+ bool discard_supported = true;
/* read-ahead size must cover two whole stripes, which
* is 2 * (datadisks) * chunksize where 'n' is the
* number of raid devices
@@ -5380,13 +5476,48 @@ static int run(struct mddev *mddev)
blk_queue_io_min(mddev->queue, chunk_size);
blk_queue_io_opt(mddev->queue, chunk_size *
(conf->raid_disks - conf->max_degraded));
+ /*
+ * We can only discard a whole stripe. It doesn't make sense to
+ * discard a data disk but write the parity disk
+ */
+ stripe = stripe * PAGE_SIZE;
+ mddev->queue->limits.discard_alignment = stripe;
+ mddev->queue->limits.discard_granularity = stripe;
+ /*
+ * the unaligned part of a discard request will be ignored, so we
+ * can't guarantee discard_zeroes_data
+ */
+ mddev->queue->limits.discard_zeroes_data = 0;
rdev_for_each(rdev, mddev) {
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->new_data_offset << 9);
+ /*
+ * discard_zeroes_data is required, otherwise data
+ * could be lost. Consider a scenario: discard a stripe
+ * (the stripe could be inconsistent if
+ * discard_zeroes_data is 0); write one disk of the
+ * stripe (the stripe could be inconsistent again
+ * depending on which disks are used to calculate
+ * parity); the disk is broken; the stripe data of this
+ * disk is lost.
+ */
+ if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) ||
+ !bdev_get_queue(rdev->bdev)->
+ limits.discard_zeroes_data)
+ discard_supported = false;
}
+
+ if (discard_supported &&
+ mddev->queue->limits.max_discard_sectors >= stripe &&
+ mddev->queue->limits.discard_granularity >= stripe)
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
+ mddev->queue);
+ else
+ queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
+ mddev->queue);
}
return 0;
Index: linux/drivers/md/raid5.h
===================================================================
--- linux.orig/drivers/md/raid5.h 2012-09-11 17:36:30.631148325 +0800
+++ linux/drivers/md/raid5.h 2012-09-12 12:01:10.654354592 +0800
@@ -298,6 +298,7 @@ enum r5dev_flags {
R5_WantReplace, /* We need to update the replacement, we have read
* data in, and now is a good time to write it out.
*/
+ R5_Discard, /* Discard the stripe */
};
/*
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [patch 6/7 v2] MD: raid5 trim support
2012-09-12 4:09 ` Shaohua Li
@ 2012-09-18 4:52 ` NeilBrown
0 siblings, 0 replies; 19+ messages in thread
From: NeilBrown @ 2012-09-18 4:52 UTC (permalink / raw)
To: Shaohua Li; +Cc: Shaohua Li, linux-raid, axboe
[-- Attachment #1: Type: text/plain, Size: 4973 bytes --]
On Wed, 12 Sep 2012 12:09:53 +0800 Shaohua Li <shli@kernel.org> wrote:
> On Tue, Sep 11, 2012 at 02:10:08PM +1000, NeilBrown wrote:
> > > + bi = orig_bi;
> > > + if (bi->bi_rw & REQ_DISCARD) {
> > > + dd_idx++;
> > > + while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
> > > + dd_idx++;
> > > + if (dd_idx < sh->disks)
> > > + goto again;
> > > + }
> >
> > I'm afraid there is something else here that I can't make myself happy
> > with.
> >
> > You added a new "goto again" loop inside add_stripe_bio, and to compensate
> > you increase logical_sector in make_request so that it doesn't call
> > add_stripe_bio so many times.
> > This means that the control flow between make_request and add_stripe_bio is
> > very different depending on whether it is a discard or not. That makes the
> > code harder to understand and easier to break later.
> >
> > I think it would be better to create a completely separate
> > "make_trim_request()" which handles the trim/discard case, and leave the
> > current code as it is.
> >
> > If/when you do send a patch to do that, please also resend the other raid5
> > patch which comes after this one. I tend not to keep patches once I've
> > decided not to apply them immediately. It also reduces the chance of
> > confusion if you just send whatever you want me to apply.
>
> I'm afraid there will be a lot of duplicated code doing it this way. How
> about the patch below? I made it clearer for discard. If you like it, I'll
> send the other patches.
I don't think there will be that much duplicate code. And even if there is
some, it is worth it to get a clearer control flow.
I tried writing something and realised that your code is (I think) racy.
You really need the whole stripe to be either all-discard, or no-discards.
So you need to hold the stripe_lock while adding the discard bios to every
device. Your code doesn't do that.
Here is what I came up with. It only addresses the 'make_request' part of
the patch and isn't even compile-tested, but it should show the general
shape of the solution, and show how little code is duplicated.
NeilBrown
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7031b86..ca80290 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4071,6 +4071,73 @@ static void release_stripe_plug(struct mddev *mddev,
release_stripe(sh);
}
+static void make_discard_request(struct mddev *mddev, struct bio *bi)
+{
+ sector_t logical_sector, last_sector;
+ int stripe_sectors;
+ int d;
+ struct stripe_head *sh;
+ struct r5conf *conf = mddev->private;
+ DEFINE_WAIT(w);
+
+ if (mddev->reshape_position != MaxSector)
+ /* Skip discard while reshape is happening */
+ return;
+
+ logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+ last_sector = bi->bi_sector + (bi->bi_size>>9);
+
+ bi->bi_next = NULL;
+ bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
+
+ stripe_sectors = conf->chunk_sectors *
+ (conf->raid_disks - conf->max_degraded);
+ logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
+ stripe_sectors);
+ sector_div(last_sector, stripe_sectors);
+ /* back from stripe numbers to stripe_head sector addresses:
+ * each full array stripe spans chunk_sectors of them */
+ logical_sector *= conf->chunk_sectors;
+ last_sector *= conf->chunk_sectors;
+
+ for (;logical_sector < last_sector;
+ logical_sector += STRIPE_SECTORS) {
+ sh = get_active_stripe(conf, logical_sector, 0, 0, 0);
+ again:
+ prepare_to_wait(&conf->wait_for_overlap, &w,
+ TASK_UNINTERRUPTIBLE);
+ spin_lock_irq(&sh->stripe_lock);
+ for (d = 0; d < conf->raid_disks; d++) {
+ if (sh->dev[d].towrite ||
+ sh->dev[d].toread) {
+ set_bit(R5_Overlap, &sh->dev[d].flags);
+ spin_unlock_irq(&sh->stripe_lock);
+ schedule();
+ goto again;
+ }
+ }
+ finish_wait(&conf->wait_for_overlap, &w);
+ for (d = 0; d < conf->raid_disks; d++)
+ if (d != sh->pd_idx &&
+ d != sh->qd_idx) {
+ sh->dev[d].towrite = bi;
+ set_bit(R5_OVERWRITE, &sh->dev[d].flags);
+ }
+ spin_unlock_irq(&sh->stripe_lock);
+ if (conf->mddev->bitmap) {
+ for (d = 0; d < conf->raid_disks - conf->max_degraded;
+ d++)
+ bitmap_startwrite(mddev->bitmap,
+ sh->sector,
+ STRIPE_SECTORS,
+ 0);
+ sh->bm_seq = conf->seq_flush + 1;
+ set_bit(STRIPE_BIT_DELAY, &sh->state);
+ }
+
+ set_bit(STRIPE_HANDLE, &sh->state);
+ clear_bit(STRIPE_DELAYED, &sh->state);
+ if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
+ &sh->state))
+ atomic_inc(&conf->preread_active_stripes);
+ release_stripe_plug(mddev, sh);
+ }
+}
+
static void make_request(struct mddev *mddev, struct bio * bi)
{
struct r5conf *conf = mddev->private;
@@ -4093,6 +4160,11 @@ static void make_request(struct mddev *mddev, struct bio * bi)
chunk_aligned_read(mddev,bi))
return;
+ if (unlikely(bi->bi_rw & REQ_DISCARD)) {
+ make_discard_request(mddev, bi);
+ return;
+ }
+
logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
last_sector = bi->bi_sector + (bi->bi_size>>9);
bi->bi_next = NULL;
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 828 bytes --]
^ permalink raw reply related [flat|nested] 19+ messages in thread
end of thread
Thread overview: 19+ messages
2012-08-10 2:51 [patch 0/7 v2] MD linear/0/1/10/5 TRIM support Shaohua Li
2012-08-10 2:51 ` [patch 1/7 v2] block: makes bio_split support bio without data Shaohua Li
2012-08-10 2:51 ` [patch 2/7 v2] md: linear supports TRIM Shaohua Li
2012-08-10 2:51 ` [patch 3/7 v2] md: raid 0 " Shaohua Li
2012-08-10 2:51 ` [patch 4/7 v2] md: raid 1 " Shaohua Li
2012-08-10 2:51 ` [patch 5/7 v2] md: raid 10 " Shaohua Li
2012-08-10 2:51 ` [patch 6/7 v2] MD: raid5 trim support Shaohua Li
2012-08-13 1:50 ` NeilBrown
2012-08-13 2:04 ` Shaohua Li
2012-08-13 3:58 ` NeilBrown
2012-08-13 5:43 ` Shaohua Li
2012-09-11 4:10 ` NeilBrown
2012-09-12 4:09 ` Shaohua Li
2012-09-18 4:52 ` NeilBrown
2012-08-10 2:51 ` [patch 7/7 v2] MD: raid5 avoid unnecessary zero page for trim Shaohua Li
2012-08-13 1:51 ` [patch 0/7 v2] MD linear/0/1/10/5 TRIM support NeilBrown
2012-08-29 18:58 ` Holger Kiehl
2012-08-29 20:19 ` Martin K. Petersen
2012-08-30 0:45 ` Shaohua Li