linux-raid.vger.kernel.org archive mirror
* [patch 1/2]MD: raid5 trim support
@ 2012-09-18  8:25 Shaohua Li
  2012-09-20  1:15 ` NeilBrown
  0 siblings, 1 reply; 10+ messages in thread
From: Shaohua Li @ 2012-09-18  8:25 UTC (permalink / raw)
  To: linux-raid; +Cc: neilb

Discard for raid4/5/6 has limitations. If a discard request is small, we
discard on one disk, but we must still calculate parity and write the parity
disk. To calculate parity correctly, zero_after_discard must be guaranteed.
Even when it is true, we do a discard on one disk but write the other disks,
which makes the parity disks wear out quickly. This doesn't make sense. So an
efficient discard for raid4/5/6 should discard all data disks and all parity
disks, which requires the write pattern to be (A, A+chunk_size,
A+chunk_size*2, ...). If A's size is smaller than chunk_size, such a pattern
is almost impossible in practice. So this patch only handles the case where
A's size equals chunk_size; that is, the discard request must be aligned to
the stripe size and its size must be a multiple of the stripe size.
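
As a rough sketch of that rule (not from the patch; the helper and the
userspace-style types are invented for illustration, while the real code does
the equivalent rounding with sector_div() in make_discard_request() below):

#include <stdbool.h>

typedef unsigned long long sector_t;	/* stand-in for the kernel type */

/* A discard can be sent to every member disk, data and parity alike, only
 * when it starts on a stripe boundary and covers whole stripes. */
static bool discard_fits_stripes(sector_t start, sector_t nr_sectors,
				 unsigned int chunk_sectors,
				 unsigned int data_disks)
{
	/* sectors covered by one full stripe across all data disks */
	sector_t stripe_sectors = (sector_t)chunk_sectors * data_disks;

	return (start % stripe_sectors == 0) &&
	       (nr_sectors % stripe_sectors == 0);
}

For example, with a 512KB chunk (chunk_sectors = 1024) and 4 data disks, only
the 2MB-aligned, 2MB-multiple part of a request can be discarded on all disks
at once; anything else is rounded inward to the stripes it fully covers.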

Since we can only handle requests with a specific alignment and size (or the
part of a request that fits whole stripes), we can't guarantee
zero_after_discard even when zero_after_discard is true in the low-level
drives.

The block layer doesn't send down correctly aligned requests even when the
correct discard alignment is set, so I must filter out the unaligned parts.

For raid4/5/6 parity calculation, if the data is 0, the parity is 0. So if
zero_after_discard is true for all disks, the data is consistent after a
discard. Otherwise, data might be lost. Consider a scenario: discard a
stripe, then write data to one disk and write the parity disk. Until then the
stripe could still be inconsistent, depending on whether data from the other
data disks or from the parity disk is used to calculate the new parity. If a
disk is then broken, we can't restore its data. So this patch only enables
discard support if all disks have zero_after_discard.
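
The "if data is 0, parity is 0" property is plain XOR arithmetic; a
standalone demonstration (not code from the patch, and RAID-6's Q syndrome
behaves the same way, since any Galois-field multiple of zero is zero):

#include <assert.h>
#include <string.h>

int main(void)
{
	unsigned char data[4][4096];	/* 4 data disks, one page each */
	unsigned char parity[4096] = {0};
	int d, i;

	/* what every member returns after a discard, provided the device
	 * guarantees zero_after_discard */
	memset(data, 0, sizeof(data));

	for (d = 0; d < 4; d++)
		for (i = 0; i < 4096; i++)
			parity[i] ^= data[d][i];

	for (i = 0; i < 4096; i++)
		assert(parity[i] == 0);	/* XOR parity of zeros is zero */
	return 0;
}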

If a discard fails on one disk, we face the same inconsistency issue
described above. The patch makes discard follow the same path as a normal
write request: if the discard fails, a resync is scheduled to make the data
consistent. The extra writes aren't great, but data consistency is more
important.

If a subsequent read/write request hits the raid5 cache of a discarded
stripe, the discarded dev pages should be zero-filled so that the data is
consistent. This patch always zeroes the dev pages of a discarded stripe.
This isn't optimal, because a discard request doesn't need such a payload;
the next patch avoids it.

Signed-off-by: Shaohua Li <shli@fusionio.com>
---
 drivers/md/raid5.c |  176 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 drivers/md/raid5.h |    1 
 2 files changed, 174 insertions(+), 3 deletions(-)

Index: linux/drivers/md/raid5.c
===================================================================
--- linux.orig/drivers/md/raid5.c	2012-09-18 16:15:51.219353357 +0800
+++ linux/drivers/md/raid5.c	2012-09-18 16:15:55.471299904 +0800
@@ -547,6 +547,8 @@ static void ops_run_io(struct stripe_hea
 				rw = WRITE_FUA;
 			else
 				rw = WRITE;
+			if (test_and_clear_bit(R5_Discard, &sh->dev[i].flags))
+				rw |= REQ_DISCARD;
 		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
 			rw = READ;
 		else if (test_and_clear_bit(R5_WantReplace,
@@ -1170,8 +1172,13 @@ ops_run_biodrain(struct stripe_head *sh,
 					set_bit(R5_WantFUA, &dev->flags);
 				if (wbi->bi_rw & REQ_SYNC)
 					set_bit(R5_SyncIO, &dev->flags);
-				tx = async_copy_data(1, wbi, dev->page,
-					dev->sector, tx);
+				if (wbi->bi_rw & REQ_DISCARD) {
+					memset(page_address(dev->page), 0,
+						STRIPE_SECTORS << 9);
+					set_bit(R5_Discard, &dev->flags);
+				} else
+					tx = async_copy_data(1, wbi, dev->page,
+						dev->sector, tx);
 				wbi = r5_next_bio(wbi, dev->sector);
 			}
 		}
@@ -1237,6 +1244,20 @@ ops_run_reconstruct5(struct stripe_head
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
+	for (i = 0; i < sh->disks; i++) {
+		if (pd_idx == i)
+			continue;
+		if (!test_bit(R5_Discard, &sh->dev[i].flags))
+			break;
+	}
+	if (i >= sh->disks) {
+		atomic_inc(&sh->count);
+		memset(page_address(sh->dev[pd_idx].page), 0,
+			STRIPE_SECTORS << 9);
+		set_bit(R5_Discard, &sh->dev[pd_idx].flags);
+		ops_complete_reconstruct(sh);
+		return;
+	}
 	/* check if prexor is active which means only process blocks
 	 * that are part of a read-modify-write (written)
 	 */
@@ -1281,10 +1302,28 @@ ops_run_reconstruct6(struct stripe_head
 {
 	struct async_submit_ctl submit;
 	struct page **blocks = percpu->scribble;
-	int count;
+	int count, i;
 
 	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
 
+	for (i = 0; i < sh->disks; i++) {
+		if (sh->pd_idx == i || sh->qd_idx == i)
+			continue;
+		if (!test_bit(R5_Discard, &sh->dev[i].flags))
+			break;
+	}
+	if (i >= sh->disks) {
+		atomic_inc(&sh->count);
+		memset(page_address(sh->dev[sh->pd_idx].page), 0,
+			STRIPE_SECTORS << 9);
+		memset(page_address(sh->dev[sh->qd_idx].page), 0,
+			STRIPE_SECTORS << 9);
+		set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
+		set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
+		ops_complete_reconstruct(sh);
+		return;
+	}
+
 	count = set_syndrome_sources(blocks, sh);
 
 	atomic_inc(&sh->count);
@@ -4067,6 +4106,96 @@ static void release_stripe_plug(struct m
 		release_stripe(sh);
 }
 
+static void make_discard_request(struct mddev *mddev, struct bio *bi)
+{
+	struct r5conf *conf = mddev->private;
+	sector_t logical_sector, last_sector;
+	struct stripe_head *sh;
+	int remaining;
+	int stripe_sectors;
+
+	if (mddev->reshape_position != MaxSector)
+		/* Skip discard while reshape is happening */
+		return;
+
+	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+	last_sector = bi->bi_sector + (bi->bi_size>>9);
+
+	bi->bi_next = NULL;
+	bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
+
+	stripe_sectors = conf->chunk_sectors *
+		(conf->raid_disks - conf->max_degraded);
+	logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
+			stripe_sectors);
+	sector_div(last_sector, stripe_sectors);
+
+	logical_sector *= stripe_sectors;
+	last_sector *= stripe_sectors;
+
+	for (;logical_sector < last_sector;
+					logical_sector += STRIPE_SECTORS) {
+		DEFINE_WAIT(w);
+		sector_t new_sector;
+		int d;
+
+		new_sector = raid5_compute_sector(conf, logical_sector,
+						  0, &d, NULL);
+again:
+		sh = get_active_stripe(conf, new_sector, 0, 0, 0);
+		prepare_to_wait(&conf->wait_for_overlap, &w,
+						TASK_UNINTERRUPTIBLE);
+		spin_lock_irq(&sh->stripe_lock);
+		for (d = 0; d < conf->raid_disks; d++) {
+			if (d == sh->pd_idx || d == sh->qd_idx)
+				continue;
+			if (sh->dev[d].towrite || sh->dev[d].toread) {
+				set_bit(R5_Overlap, &sh->dev[d].flags);
+				spin_unlock_irq(&sh->stripe_lock);
+				release_stripe(sh);
+				schedule();
+				goto again;
+			}
+		}
+		finish_wait(&conf->wait_for_overlap, &w);
+		for (d = 0; d < conf->raid_disks; d++) {
+			if (d == sh->pd_idx || d == sh->qd_idx)
+				continue;
+			sh->dev[d].towrite = bi;
+			set_bit(R5_OVERWRITE, &sh->dev[d].flags);
+			raid5_inc_bi_active_stripes(bi);
+		}
+		spin_unlock_irq(&sh->stripe_lock);
+		if (conf->mddev->bitmap) {
+			for (d = 0; d < conf->raid_disks - conf->max_degraded;
+									d++)
+				bitmap_startwrite(mddev->bitmap,
+						sh->sector,
+						STRIPE_SECTORS,
+						0);
+			sh->bm_seq = conf->seq_flush + 1;
+			set_bit(STRIPE_BIT_DELAY, &sh->state);
+		}
+
+		set_bit(STRIPE_HANDLE, &sh->state);
+		clear_bit(STRIPE_DELAYED, &sh->state);
+		if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+			atomic_inc(&conf->preread_active_stripes);
+		release_stripe_plug(mddev, sh);
+
+		new_sector = logical_sector + STRIPE_SECTORS;
+		if (!sector_div(new_sector, conf->chunk_sectors))
+			logical_sector += conf->chunk_sectors *
+				(conf->raid_disks - conf->max_degraded - 1);
+	}
+
+	remaining = raid5_dec_bi_active_stripes(bi);
+	if (remaining == 0) {
+		md_write_end(mddev);
+		bio_endio(bi, 0);
+	}
+}
+
 static void make_request(struct mddev *mddev, struct bio * bi)
 {
 	struct r5conf *conf = mddev->private;
@@ -4089,6 +4218,11 @@ static void make_request(struct mddev *m
 	     chunk_aligned_read(mddev,bi))
 		return;
 
+	if (unlikely(bi->bi_rw & REQ_DISCARD)) {
+		make_discard_request(mddev, bi);
+		return;
+	}
+
 	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
 	last_sector = bi->bi_sector + (bi->bi_size>>9);
 	bi->bi_next = NULL;
@@ -5361,6 +5495,7 @@ static int run(struct mddev *mddev)
 
 	if (mddev->queue) {
 		int chunk_size;
+		bool discard_supported = true;
 		/* read-ahead size must cover two whole stripes, which
 		 * is 2 * (datadisks) * chunksize where 'n' is the
 		 * number of raid devices
@@ -5380,13 +5515,48 @@ static int run(struct mddev *mddev)
 		blk_queue_io_min(mddev->queue, chunk_size);
 		blk_queue_io_opt(mddev->queue, chunk_size *
 				 (conf->raid_disks - conf->max_degraded));
+		/*
+		 * We can only discard a whole stripe. It doesn't make sense to
+		 * discard data disk but write parity disk
+		 */
+		stripe = stripe * PAGE_SIZE;
+		mddev->queue->limits.discard_alignment = stripe;
+		mddev->queue->limits.discard_granularity = stripe;
+		/*
+		 * unaligned part of discard request will be ignored, so can't
+		 * guarantee discard_zeroes_data
+		 */
+		mddev->queue->limits.discard_zeroes_data = 0;
 
 		rdev_for_each(rdev, mddev) {
 			disk_stack_limits(mddev->gendisk, rdev->bdev,
 					  rdev->data_offset << 9);
 			disk_stack_limits(mddev->gendisk, rdev->bdev,
 					  rdev->new_data_offset << 9);
+			/*
+			 * discard_zeroes_data is required, otherwise data
+			 * could be lost. Consider a scenario: discard a stripe
+			 * (the stripe could be inconsistent if
+			 * discard_zeroes_data is 0); write one disk of the
+			 * stripe (the stripe could be inconsistent again
+			 * depending on which disks are used to calculate
+			 * parity); the disk is broken; The stripe data of this
+			 * disk is lost.
+			 */
+			if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) ||
+			    !bdev_get_queue(rdev->bdev)->
+						limits.discard_zeroes_data)
+				discard_supported = false;
 		}
+
+		if (discard_supported &&
+		   mddev->queue->limits.max_discard_sectors >= stripe &&
+		   mddev->queue->limits.discard_granularity >= stripe)
+			queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
+						mddev->queue);
+		else
+			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
+						mddev->queue);
 	}
 
 	return 0;
Index: linux/drivers/md/raid5.h
===================================================================
--- linux.orig/drivers/md/raid5.h	2012-09-18 16:15:51.235353157 +0800
+++ linux/drivers/md/raid5.h	2012-09-18 16:15:55.471299904 +0800
@@ -298,6 +298,7 @@ enum r5dev_flags {
 	R5_WantReplace, /* We need to update the replacement, we have read
 			 * data in, and now is a good time to write it out.
 			 */
+	R5_Discard,	/* Discard the stripe */
 };
 
 /*


* Re: [patch 1/2]MD: raid5 trim support
  2012-09-18  8:25 [patch 1/2]MD: raid5 trim support Shaohua Li
@ 2012-09-20  1:15 ` NeilBrown
  2012-09-20  1:36   ` Shaohua Li
  0 siblings, 1 reply; 10+ messages in thread
From: NeilBrown @ 2012-09-20  1:15 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid


On Tue, 18 Sep 2012 16:25:11 +0800 Shaohua Li <shli@kernel.org> wrote:

> Discard for raid4/5/6 has limitations. If a discard request is small, we
> discard on one disk, but we must still calculate parity and write the parity
> disk. To calculate parity correctly, zero_after_discard must be guaranteed.
> Even when it is true, we do a discard on one disk but write the other disks,
> which makes the parity disks wear out quickly. This doesn't make sense. So
> an efficient discard for raid4/5/6 should discard all data disks and all
> parity disks, which requires the write pattern to be (A, A+chunk_size,
> A+chunk_size*2, ...). If A's size is smaller than chunk_size, such a pattern
> is almost impossible in practice. So this patch only handles the case where
> A's size equals chunk_size; that is, the discard request must be aligned to
> the stripe size and its size must be a multiple of the stripe size.
> 
> Since we can only handle requests with a specific alignment and size (or the
> part of a request that fits whole stripes), we can't guarantee
> zero_after_discard even when zero_after_discard is true in the low-level
> drives.
> 
> The block layer doesn't send down correctly aligned requests even when the
> correct discard alignment is set, so I must filter out the unaligned parts.
> 
> For raid4/5/6 parity calculation, if the data is 0, the parity is 0. So if
> zero_after_discard is true for all disks, the data is consistent after a
> discard. Otherwise, data might be lost. Consider a scenario: discard a
> stripe, then write data to one disk and write the parity disk. Until then
> the stripe could still be inconsistent, depending on whether data from the
> other data disks or from the parity disk is used to calculate the new
> parity. If a disk is then broken, we can't restore its data. So this patch
> only enables discard support if all disks have zero_after_discard.
> 
> If a discard fails on one disk, we face the same inconsistency issue
> described above. The patch makes discard follow the same path as a normal
> write request: if the discard fails, a resync is scheduled to make the data
> consistent. The extra writes aren't great, but data consistency is more
> important.
> 
> If a subsequent read/write request hits the raid5 cache of a discarded
> stripe, the discarded dev pages should be zero-filled so that the data is
> consistent. This patch always zeroes the dev pages of a discarded stripe.
> This isn't optimal, because a discard request doesn't need such a payload;
> the next patch avoids it.
> 
> Signed-off-by: Shaohua Li <shli@fusionio.com>
> ---
>  drivers/md/raid5.c |  176 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  drivers/md/raid5.h |    1 
>  2 files changed, 174 insertions(+), 3 deletions(-)
> 
> Index: linux/drivers/md/raid5.c
> ===================================================================
> --- linux.orig/drivers/md/raid5.c	2012-09-18 16:15:51.219353357 +0800
> +++ linux/drivers/md/raid5.c	2012-09-18 16:15:55.471299904 +0800
> @@ -547,6 +547,8 @@ static void ops_run_io(struct stripe_hea
>  				rw = WRITE_FUA;
>  			else
>  				rw = WRITE;
> +			if (test_and_clear_bit(R5_Discard, &sh->dev[i].flags))
> +				rw |= REQ_DISCARD;
>  		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
>  			rw = READ;
>  		else if (test_and_clear_bit(R5_WantReplace,
> @@ -1170,8 +1172,13 @@ ops_run_biodrain(struct stripe_head *sh,
>  					set_bit(R5_WantFUA, &dev->flags);
>  				if (wbi->bi_rw & REQ_SYNC)
>  					set_bit(R5_SyncIO, &dev->flags);
> -				tx = async_copy_data(1, wbi, dev->page,
> -					dev->sector, tx);
> +				if (wbi->bi_rw & REQ_DISCARD) {
> +					memset(page_address(dev->page), 0,
> +						STRIPE_SECTORS << 9);
> +					set_bit(R5_Discard, &dev->flags);
> +				} else
> +					tx = async_copy_data(1, wbi, dev->page,
> +						dev->sector, tx);
>  				wbi = r5_next_bio(wbi, dev->sector);
>  			}
>  		}
> @@ -1237,6 +1244,20 @@ ops_run_reconstruct5(struct stripe_head
>  	pr_debug("%s: stripe %llu\n", __func__,
>  		(unsigned long long)sh->sector);
>  
> +	for (i = 0; i < sh->disks; i++) {
> +		if (pd_idx == i)
> +			continue;
> +		if (!test_bit(R5_Discard, &sh->dev[i].flags))
> +			break;
> +	}
> +	if (i >= sh->disks) {
> +		atomic_inc(&sh->count);
> +		memset(page_address(sh->dev[pd_idx].page), 0,
> +			STRIPE_SECTORS << 9);
> +		set_bit(R5_Discard, &sh->dev[pd_idx].flags);
> +		ops_complete_reconstruct(sh);
> +		return;
> +	}
>  	/* check if prexor is active which means only process blocks
>  	 * that are part of a read-modify-write (written)
>  	 */
> @@ -1281,10 +1302,28 @@ ops_run_reconstruct6(struct stripe_head
>  {
>  	struct async_submit_ctl submit;
>  	struct page **blocks = percpu->scribble;
> -	int count;
> +	int count, i;
>  
>  	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
>  
> +	for (i = 0; i < sh->disks; i++) {
> +		if (sh->pd_idx == i || sh->qd_idx == i)
> +			continue;
> +		if (!test_bit(R5_Discard, &sh->dev[i].flags))
> +			break;
> +	}
> +	if (i >= sh->disks) {
> +		atomic_inc(&sh->count);
> +		memset(page_address(sh->dev[sh->pd_idx].page), 0,
> +			STRIPE_SECTORS << 9);
> +		memset(page_address(sh->dev[sh->qd_idx].page), 0,
> +			STRIPE_SECTORS << 9);
> +		set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
> +		set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
> +		ops_complete_reconstruct(sh);
> +		return;
> +	}
> +
>  	count = set_syndrome_sources(blocks, sh);
>  
>  	atomic_inc(&sh->count);
> @@ -4067,6 +4106,96 @@ static void release_stripe_plug(struct m
>  		release_stripe(sh);
>  }
>  
> +static void make_discard_request(struct mddev *mddev, struct bio *bi)
> +{
> +	struct r5conf *conf = mddev->private;
> +	sector_t logical_sector, last_sector;
> +	struct stripe_head *sh;
> +	int remaining;
> +	int stripe_sectors;
> +
> +	if (mddev->reshape_position != MaxSector)
> +		/* Skip discard while reshape is happening */
> +		return;
> +
> +	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
> +	last_sector = bi->bi_sector + (bi->bi_size>>9);
> +
> +	bi->bi_next = NULL;
> +	bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
> +
> +	stripe_sectors = conf->chunk_sectors *
> +		(conf->raid_disks - conf->max_degraded);
> +	logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
> +			stripe_sectors);
> +	sector_div(last_sector, stripe_sectors);
> +
> +	logical_sector *= stripe_sectors;
> +	last_sector *= stripe_sectors;
> +
> +	for (;logical_sector < last_sector;
> +					logical_sector += STRIPE_SECTORS) {
> +		DEFINE_WAIT(w);
> +		sector_t new_sector;
> +		int d;
> +
> +		new_sector = raid5_compute_sector(conf, logical_sector,
> +						  0, &d, NULL);

This is pointless.  Look at the patch I posted again.  You don't need to call
raid5_compute_sector().  It essentially just divides logical_sector by
stripe_sectors.  It is cleaner not to do the multiplication in the first place.
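
For reference, the suggested flow would look roughly like this (a sketch
under the assumption that the rounding is kept in chunk_sectors units; this
is not the actual posted patch):

	stripe_sectors = conf->chunk_sectors *
		(conf->raid_disks - conf->max_degraded);
	logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector, stripe_sectors);
	sector_div(last_sector, stripe_sectors);
	/* multiply by chunk_sectors, not stripe_sectors: the loop variable
	 * is then already the per-device stripe address */
	logical_sector *= conf->chunk_sectors;
	last_sector *= conf->chunk_sectors;

	for (; logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
		sh = get_active_stripe(conf, logical_sector, 0, 0, 0);
		/* ... attach the discard to every data disk as below ... */
	}

This also removes the need for the chunk-skipping arithmetic at the bottom of
the original loop.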

NeilBrown


> +again:
> +		sh = get_active_stripe(conf, new_sector, 0, 0, 0);
> +		prepare_to_wait(&conf->wait_for_overlap, &w,
> +						TASK_UNINTERRUPTIBLE);
> +		spin_lock_irq(&sh->stripe_lock);
> +		for (d = 0; d < conf->raid_disks; d++) {
> +			if (d == sh->pd_idx || d == sh->qd_idx)
> +				continue;
> +			if (sh->dev[d].towrite || sh->dev[d].toread) {
> +				set_bit(R5_Overlap, &sh->dev[d].flags);
> +				spin_unlock_irq(&sh->stripe_lock);
> +				release_stripe(sh);
> +				schedule();
> +				goto again;
> +			}
> +		}
> +		finish_wait(&conf->wait_for_overlap, &w);
> +		for (d = 0; d < conf->raid_disks; d++) {
> +			if (d == sh->pd_idx || d == sh->qd_idx)
> +				continue;
> +			sh->dev[d].towrite = bi;
> +			set_bit(R5_OVERWRITE, &sh->dev[d].flags);
> +			raid5_inc_bi_active_stripes(bi);
> +		}
> +		spin_unlock_irq(&sh->stripe_lock);
> +		if (conf->mddev->bitmap) {
> +			for (d = 0; d < conf->raid_disks - conf->max_degraded;
> +									d++)
> +				bitmap_startwrite(mddev->bitmap,
> +						sh->sector,
> +						STRIPE_SECTORS,
> +						0);
> +			sh->bm_seq = conf->seq_flush + 1;
> +			set_bit(STRIPE_BIT_DELAY, &sh->state);
> +		}
> +
> +		set_bit(STRIPE_HANDLE, &sh->state);
> +		clear_bit(STRIPE_DELAYED, &sh->state);
> +		if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
> +			atomic_inc(&conf->preread_active_stripes);
> +		release_stripe_plug(mddev, sh);
> +
> +		new_sector = logical_sector + STRIPE_SECTORS;
> +		if (!sector_div(new_sector, conf->chunk_sectors))
> +			logical_sector += conf->chunk_sectors *
> +				(conf->raid_disks - conf->max_degraded - 1);
> +	}
> +
> +	remaining = raid5_dec_bi_active_stripes(bi);
> +	if (remaining == 0) {
> +		md_write_end(mddev);
> +		bio_endio(bi, 0);
> +	}
> +}
> +
>  static void make_request(struct mddev *mddev, struct bio * bi)
>  {
>  	struct r5conf *conf = mddev->private;
> @@ -4089,6 +4218,11 @@ static void make_request(struct mddev *m
>  	     chunk_aligned_read(mddev,bi))
>  		return;
>  
> +	if (unlikely(bi->bi_rw & REQ_DISCARD)) {
> +		make_discard_request(mddev, bi);
> +		return;
> +	}
> +
>  	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
>  	last_sector = bi->bi_sector + (bi->bi_size>>9);
>  	bi->bi_next = NULL;
> @@ -5361,6 +5495,7 @@ static int run(struct mddev *mddev)
>  
>  	if (mddev->queue) {
>  		int chunk_size;
> +		bool discard_supported = true;
>  		/* read-ahead size must cover two whole stripes, which
>  		 * is 2 * (datadisks) * chunksize where 'n' is the
>  		 * number of raid devices
> @@ -5380,13 +5515,48 @@ static int run(struct mddev *mddev)
>  		blk_queue_io_min(mddev->queue, chunk_size);
>  		blk_queue_io_opt(mddev->queue, chunk_size *
>  				 (conf->raid_disks - conf->max_degraded));
> +		/*
> +		 * We can only discard a whole stripe. It doesn't make sense to
> +		 * discard data disk but write parity disk
> +		 */
> +		stripe = stripe * PAGE_SIZE;
> +		mddev->queue->limits.discard_alignment = stripe;
> +		mddev->queue->limits.discard_granularity = stripe;
> +		/*
> +		 * unaligned part of discard request will be ignored, so can't
> +		 * guarantee discard_zeroes_data
> +		 */
> +		mddev->queue->limits.discard_zeroes_data = 0;
>  
>  		rdev_for_each(rdev, mddev) {
>  			disk_stack_limits(mddev->gendisk, rdev->bdev,
>  					  rdev->data_offset << 9);
>  			disk_stack_limits(mddev->gendisk, rdev->bdev,
>  					  rdev->new_data_offset << 9);
> +			/*
> +			 * discard_zeroes_data is required, otherwise data
> +			 * could be lost. Consider a scenario: discard a stripe
> +			 * (the stripe could be inconsistent if
> +			 * discard_zeroes_data is 0); write one disk of the
> +			 * stripe (the stripe could be inconsistent again
> +			 * depending on which disks are used to calculate
> +			 * parity); the disk is broken; The stripe data of this
> +			 * disk is lost.
> +			 */
> +			if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) ||
> +			    !bdev_get_queue(rdev->bdev)->
> +						limits.discard_zeroes_data)
> +				discard_supported = false;
>  		}
> +
> +		if (discard_supported &&
> +		   mddev->queue->limits.max_discard_sectors >= stripe &&
> +		   mddev->queue->limits.discard_granularity >= stripe)
> +			queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
> +						mddev->queue);
> +		else
> +			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
> +						mddev->queue);
>  	}
>  
>  	return 0;
> Index: linux/drivers/md/raid5.h
> ===================================================================
> --- linux.orig/drivers/md/raid5.h	2012-09-18 16:15:51.235353157 +0800
> +++ linux/drivers/md/raid5.h	2012-09-18 16:15:55.471299904 +0800
> @@ -298,6 +298,7 @@ enum r5dev_flags {
>  	R5_WantReplace, /* We need to update the replacement, we have read
>  			 * data in, and now is a good time to write it out.
>  			 */
> +	R5_Discard,	/* Discard the stripe */
>  };
>  
>  /*
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html




* Re: [patch 1/2]MD: raid5 trim support
  2012-09-20  1:15 ` NeilBrown
@ 2012-09-20  1:36   ` Shaohua Li
  2012-09-20  1:47     ` NeilBrown
  0 siblings, 1 reply; 10+ messages in thread
From: Shaohua Li @ 2012-09-20  1:36 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid

On Thu, Sep 20, 2012 at 11:15:17AM +1000, NeilBrown wrote:
> On Tue, 18 Sep 2012 16:25:11 +0800 Shaohua Li <shli@kernel.org> wrote:
> 
> > Discard for raid4/5/6 has limitations. If a discard request is small, we
> > discard on one disk, but we must still calculate parity and write the
> > parity disk. To calculate parity correctly, zero_after_discard must be
> > guaranteed. Even when it is true, we do a discard on one disk but write
> > the other disks, which makes the parity disks wear out quickly. This
> > doesn't make sense. So an efficient discard for raid4/5/6 should discard
> > all data disks and all parity disks, which requires the write pattern to
> > be (A, A+chunk_size, A+chunk_size*2, ...). If A's size is smaller than
> > chunk_size, such a pattern is almost impossible in practice. So this patch
> > only handles the case where A's size equals chunk_size; that is, the
> > discard request must be aligned to the stripe size and its size must be a
> > multiple of the stripe size.
> > 
> > Since we can only handle requests with a specific alignment and size (or
> > the part of a request that fits whole stripes), we can't guarantee
> > zero_after_discard even when zero_after_discard is true in the low-level
> > drives.
> > 
> > The block layer doesn't send down correctly aligned requests even when the
> > correct discard alignment is set, so I must filter out the unaligned
> > parts.
> > 
> > For raid4/5/6 parity calculation, if the data is 0, the parity is 0. So if
> > zero_after_discard is true for all disks, the data is consistent after a
> > discard. Otherwise, data might be lost. Consider a scenario: discard a
> > stripe, then write data to one disk and write the parity disk. Until then
> > the stripe could still be inconsistent, depending on whether data from the
> > other data disks or from the parity disk is used to calculate the new
> > parity. If a disk is then broken, we can't restore its data. So this patch
> > only enables discard support if all disks have zero_after_discard.
> > 
> > If a discard fails on one disk, we face the same inconsistency issue
> > described above. The patch makes discard follow the same path as a normal
> > write request: if the discard fails, a resync is scheduled to make the
> > data consistent. The extra writes aren't great, but data consistency is
> > more important.
> > 
> > If a subsequent read/write request hits the raid5 cache of a discarded
> > stripe, the discarded dev pages should be zero-filled so that the data is
> > consistent. This patch always zeroes the dev pages of a discarded stripe.
> > This isn't optimal, because a discard request doesn't need such a payload;
> > the next patch avoids it.
> > 
> > Signed-off-by: Shaohua Li <shli@fusionio.com>
> > ---
> >  drivers/md/raid5.c |  176 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
> >  drivers/md/raid5.h |    1 
> >  2 files changed, 174 insertions(+), 3 deletions(-)
> > 
> > Index: linux/drivers/md/raid5.c
> > ===================================================================
> > --- linux.orig/drivers/md/raid5.c	2012-09-18 16:15:51.219353357 +0800
> > +++ linux/drivers/md/raid5.c	2012-09-18 16:15:55.471299904 +0800
> > @@ -547,6 +547,8 @@ static void ops_run_io(struct stripe_hea
> >  				rw = WRITE_FUA;
> >  			else
> >  				rw = WRITE;
> > +			if (test_and_clear_bit(R5_Discard, &sh->dev[i].flags))
> > +				rw |= REQ_DISCARD;
> >  		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
> >  			rw = READ;
> >  		else if (test_and_clear_bit(R5_WantReplace,
> > @@ -1170,8 +1172,13 @@ ops_run_biodrain(struct stripe_head *sh,
> >  					set_bit(R5_WantFUA, &dev->flags);
> >  				if (wbi->bi_rw & REQ_SYNC)
> >  					set_bit(R5_SyncIO, &dev->flags);
> > -				tx = async_copy_data(1, wbi, dev->page,
> > -					dev->sector, tx);
> > +				if (wbi->bi_rw & REQ_DISCARD) {
> > +					memset(page_address(dev->page), 0,
> > +						STRIPE_SECTORS << 9);
> > +					set_bit(R5_Discard, &dev->flags);
> > +				} else
> > +					tx = async_copy_data(1, wbi, dev->page,
> > +						dev->sector, tx);
> >  				wbi = r5_next_bio(wbi, dev->sector);
> >  			}
> >  		}
> > @@ -1237,6 +1244,20 @@ ops_run_reconstruct5(struct stripe_head
> >  	pr_debug("%s: stripe %llu\n", __func__,
> >  		(unsigned long long)sh->sector);
> >  
> > +	for (i = 0; i < sh->disks; i++) {
> > +		if (pd_idx == i)
> > +			continue;
> > +		if (!test_bit(R5_Discard, &sh->dev[i].flags))
> > +			break;
> > +	}
> > +	if (i >= sh->disks) {
> > +		atomic_inc(&sh->count);
> > +		memset(page_address(sh->dev[pd_idx].page), 0,
> > +			STRIPE_SECTORS << 9);
> > +		set_bit(R5_Discard, &sh->dev[pd_idx].flags);
> > +		ops_complete_reconstruct(sh);
> > +		return;
> > +	}
> >  	/* check if prexor is active which means only process blocks
> >  	 * that are part of a read-modify-write (written)
> >  	 */
> > @@ -1281,10 +1302,28 @@ ops_run_reconstruct6(struct stripe_head
> >  {
> >  	struct async_submit_ctl submit;
> >  	struct page **blocks = percpu->scribble;
> > -	int count;
> > +	int count, i;
> >  
> >  	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
> >  
> > +	for (i = 0; i < sh->disks; i++) {
> > +		if (sh->pd_idx == i || sh->qd_idx == i)
> > +			continue;
> > +		if (!test_bit(R5_Discard, &sh->dev[i].flags))
> > +			break;
> > +	}
> > +	if (i >= sh->disks) {
> > +		atomic_inc(&sh->count);
> > +		memset(page_address(sh->dev[sh->pd_idx].page), 0,
> > +			STRIPE_SECTORS << 9);
> > +		memset(page_address(sh->dev[sh->qd_idx].page), 0,
> > +			STRIPE_SECTORS << 9);
> > +		set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
> > +		set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
> > +		ops_complete_reconstruct(sh);
> > +		return;
> > +	}
> > +
> >  	count = set_syndrome_sources(blocks, sh);
> >  
> >  	atomic_inc(&sh->count);
> > @@ -4067,6 +4106,96 @@ static void release_stripe_plug(struct m
> >  		release_stripe(sh);
> >  }
> >  
> > +static void make_discard_request(struct mddev *mddev, struct bio *bi)
> > +{
> > +	struct r5conf *conf = mddev->private;
> > +	sector_t logical_sector, last_sector;
> > +	struct stripe_head *sh;
> > +	int remaining;
> > +	int stripe_sectors;
> > +
> > +	if (mddev->reshape_position != MaxSector)
> > +		/* Skip discard while reshape is happening */
> > +		return;
> > +
> > +	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
> > +	last_sector = bi->bi_sector + (bi->bi_size>>9);
> > +
> > +	bi->bi_next = NULL;
> > +	bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
> > +
> > +	stripe_sectors = conf->chunk_sectors *
> > +		(conf->raid_disks - conf->max_degraded);
> > +	logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
> > +			stripe_sectors);
> > +	sector_div(last_sector, stripe_sectors);
> > +
> > +	logical_sector *= stripe_sectors;
> > +	last_sector *= stripe_sectors;
> > +
> > +	for (;logical_sector < last_sector;
> > +					logical_sector += STRIPE_SECTORS) {
> > +		DEFINE_WAIT(w);
> > +		sector_t new_sector;
> > +		int d;
> > +
> > +		new_sector = raid5_compute_sector(conf, logical_sector,
> > +						  0, &d, NULL);
> 
> This is pointless.  Look at the patch I posted again.  You don't need to call
> raid5_compute_sector().  It essentially just divides logical_sector by
> stripe_sectors.  It is cleaner not to do the multiplication in the first place.

In my test, without it, the wrong sectors are trimmed.


* Re: [patch 1/2]MD: raid5 trim support
  2012-09-20  1:36   ` Shaohua Li
@ 2012-09-20  1:47     ` NeilBrown
  2012-09-20  2:06       ` Shaohua Li
  2012-09-20  2:27       ` Shaohua Li
  0 siblings, 2 replies; 10+ messages in thread
From: NeilBrown @ 2012-09-20  1:47 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid


On Thu, 20 Sep 2012 09:36:42 +0800 Shaohua Li <shli@kernel.org> wrote:

> On Thu, Sep 20, 2012 at 11:15:17AM +1000, NeilBrown wrote:
> > On Tue, 18 Sep 2012 16:25:11 +0800 Shaohua Li <shli@kernel.org> wrote:
> > 
> > > Discard for raid4/5/6 has limitations. If a discard request is small,
> > > we discard on one disk, but we must still calculate parity and write the
> > > parity disk. To calculate parity correctly, zero_after_discard must be
> > > guaranteed. Even when it is true, we do a discard on one disk but write
> > > the other disks, which makes the parity disks wear out quickly. This
> > > doesn't make sense. So an efficient discard for raid4/5/6 should discard
> > > all data disks and all parity disks, which requires the write pattern to
> > > be (A, A+chunk_size, A+chunk_size*2, ...). If A's size is smaller than
> > > chunk_size, such a pattern is almost impossible in practice. So this
> > > patch only handles the case where A's size equals chunk_size; that is,
> > > the discard request must be aligned to the stripe size and its size must
> > > be a multiple of the stripe size.
> > > 
> > > Since we can only handle requests with a specific alignment and size (or
> > > the part of a request that fits whole stripes), we can't guarantee
> > > zero_after_discard even when zero_after_discard is true in the low-level
> > > drives.
> > > 
> > > The block layer doesn't send down correctly aligned requests even when
> > > the correct discard alignment is set, so I must filter out the unaligned
> > > parts.
> > > 
> > > For raid4/5/6 parity calculation, if the data is 0, the parity is 0. So
> > > if zero_after_discard is true for all disks, the data is consistent
> > > after a discard. Otherwise, data might be lost. Consider a scenario:
> > > discard a stripe, then write data to one disk and write the parity disk.
> > > Until then the stripe could still be inconsistent, depending on whether
> > > data from the other data disks or from the parity disk is used to
> > > calculate the new parity. If a disk is then broken, we can't restore its
> > > data. So this patch only enables discard support if all disks have
> > > zero_after_discard.
> > > 
> > > If a discard fails on one disk, we face the same inconsistency issue
> > > described above. The patch makes discard follow the same path as a
> > > normal write request: if the discard fails, a resync is scheduled to
> > > make the data consistent. The extra writes aren't great, but data
> > > consistency is more important.
> > > 
> > > If a subsequent read/write request hits the raid5 cache of a discarded
> > > stripe, the discarded dev pages should be zero-filled so that the data
> > > is consistent. This patch always zeroes the dev pages of a discarded
> > > stripe. This isn't optimal, because a discard request doesn't need such
> > > a payload; the next patch avoids it.
> > > 
> > > Signed-off-by: Shaohua Li <shli@fusionio.com>
> > > ---
> > >  drivers/md/raid5.c |  176 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
> > >  drivers/md/raid5.h |    1 
> > >  2 files changed, 174 insertions(+), 3 deletions(-)
> > > 
> > > Index: linux/drivers/md/raid5.c
> > > ===================================================================
> > > --- linux.orig/drivers/md/raid5.c	2012-09-18 16:15:51.219353357 +0800
> > > +++ linux/drivers/md/raid5.c	2012-09-18 16:15:55.471299904 +0800
> > > @@ -547,6 +547,8 @@ static void ops_run_io(struct stripe_hea
> > >  				rw = WRITE_FUA;
> > >  			else
> > >  				rw = WRITE;
> > > +			if (test_and_clear_bit(R5_Discard, &sh->dev[i].flags))
> > > +				rw |= REQ_DISCARD;
> > >  		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
> > >  			rw = READ;
> > >  		else if (test_and_clear_bit(R5_WantReplace,
> > > @@ -1170,8 +1172,13 @@ ops_run_biodrain(struct stripe_head *sh,
> > >  					set_bit(R5_WantFUA, &dev->flags);
> > >  				if (wbi->bi_rw & REQ_SYNC)
> > >  					set_bit(R5_SyncIO, &dev->flags);
> > > -				tx = async_copy_data(1, wbi, dev->page,
> > > -					dev->sector, tx);
> > > +				if (wbi->bi_rw & REQ_DISCARD) {
> > > +					memset(page_address(dev->page), 0,
> > > +						STRIPE_SECTORS << 9);
> > > +					set_bit(R5_Discard, &dev->flags);
> > > +				} else
> > > +					tx = async_copy_data(1, wbi, dev->page,
> > > +						dev->sector, tx);
> > >  				wbi = r5_next_bio(wbi, dev->sector);
> > >  			}
> > >  		}
> > > @@ -1237,6 +1244,20 @@ ops_run_reconstruct5(struct stripe_head
> > >  	pr_debug("%s: stripe %llu\n", __func__,
> > >  		(unsigned long long)sh->sector);
> > >  
> > > +	for (i = 0; i < sh->disks; i++) {
> > > +		if (pd_idx == i)
> > > +			continue;
> > > +		if (!test_bit(R5_Discard, &sh->dev[i].flags))
> > > +			break;
> > > +	}
> > > +	if (i >= sh->disks) {
> > > +		atomic_inc(&sh->count);
> > > +		memset(page_address(sh->dev[pd_idx].page), 0,
> > > +			STRIPE_SECTORS << 9);
> > > +		set_bit(R5_Discard, &sh->dev[pd_idx].flags);
> > > +		ops_complete_reconstruct(sh);
> > > +		return;
> > > +	}
> > >  	/* check if prexor is active which means only process blocks
> > >  	 * that are part of a read-modify-write (written)
> > >  	 */
> > > @@ -1281,10 +1302,28 @@ ops_run_reconstruct6(struct stripe_head
> > >  {
> > >  	struct async_submit_ctl submit;
> > >  	struct page **blocks = percpu->scribble;
> > > -	int count;
> > > +	int count, i;
> > >  
> > >  	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
> > >  
> > > +	for (i = 0; i < sh->disks; i++) {
> > > +		if (sh->pd_idx == i || sh->qd_idx == i)
> > > +			continue;
> > > +		if (!test_bit(R5_Discard, &sh->dev[i].flags))
> > > +			break;
> > > +	}
> > > +	if (i >= sh->disks) {
> > > +		atomic_inc(&sh->count);
> > > +		memset(page_address(sh->dev[sh->pd_idx].page), 0,
> > > +			STRIPE_SECTORS << 9);
> > > +		memset(page_address(sh->dev[sh->qd_idx].page), 0,
> > > +			STRIPE_SECTORS << 9);
> > > +		set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
> > > +		set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
> > > +		ops_complete_reconstruct(sh);
> > > +		return;
> > > +	}
> > > +
> > >  	count = set_syndrome_sources(blocks, sh);
> > >  
> > >  	atomic_inc(&sh->count);
> > > @@ -4067,6 +4106,96 @@ static void release_stripe_plug(struct m
> > >  		release_stripe(sh);
> > >  }
> > >  
> > > +static void make_discard_request(struct mddev *mddev, struct bio *bi)
> > > +{
> > > +	struct r5conf *conf = mddev->private;
> > > +	sector_t logical_sector, last_sector;
> > > +	struct stripe_head *sh;
> > > +	int remaining;
> > > +	int stripe_sectors;
> > > +
> > > +	if (mddev->reshape_position != MaxSector)
> > > +		/* Skip discard while reshape is happening */
> > > +		return;
> > > +
> > > +	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
> > > +	last_sector = bi->bi_sector + (bi->bi_size>>9);
> > > +
> > > +	bi->bi_next = NULL;
> > > +	bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
> > > +
> > > +	stripe_sectors = conf->chunk_sectors *
> > > +		(conf->raid_disks - conf->max_degraded);
> > > +	logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
> > > +			stripe_sectors);
> > > +	sector_div(last_sector, stripe_sectors);
> > > +
> > > +	logical_sector *= stripe_sectors;
> > > +	last_sector *= stripe_sectors;
> > > +
> > > +	for (;logical_sector < last_sector;
> > > +					logical_sector += STRIPE_SECTORS) {
> > > +		DEFINE_WAIT(w);
> > > +		sector_t new_sector;
> > > +		int d;
> > > +
> > > +		new_sector = raid5_compute_sector(conf, logical_sector,
> > > +						  0, &d, NULL);
> > 
> > This is pointless.  Look at the patch I posted again.  You don't need to call
> > raid5_compute_sector().  It essentially just divides logical_sector by
> > stripe_sectors.  It is cleaner not to do the multiplication in the first place.
> 
> > In my test, without it, the wrong sectors are trimmed.

Which tells me that the code you tested was wrong.
However, the code you posted was wrong too.

Maybe if you post the code you tested, which looked more like mine, and
explain which wrong sectors were trimmed....

NeilBrown



* Re: [patch 1/2]MD: raid5 trim support
  2012-09-20  1:47     ` NeilBrown
@ 2012-09-20  2:06       ` Shaohua Li
  2012-09-20  2:27       ` Shaohua Li
  1 sibling, 0 replies; 10+ messages in thread
From: Shaohua Li @ 2012-09-20  2:06 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid

On Thu, Sep 20, 2012 at 11:47:40AM +1000, NeilBrown wrote:
> On Thu, 20 Sep 2012 09:36:42 +0800 Shaohua Li <shli@kernel.org> wrote:
> 
> > On Thu, Sep 20, 2012 at 11:15:17AM +1000, NeilBrown wrote:
> > > On Tue, 18 Sep 2012 16:25:11 +0800 Shaohua Li <shli@kernel.org> wrote:
> > > 
> > > > Discard for raid4/5/6 has limitations. If a discard request is small,
> > > > we discard on one disk, but we must still calculate parity and write
> > > > the parity disk. To calculate parity correctly, zero_after_discard
> > > > must be guaranteed. Even when it is true, we do a discard on one disk
> > > > but write the other disks, which makes the parity disks wear out
> > > > quickly. This doesn't make sense. So an efficient discard for
> > > > raid4/5/6 should discard all data disks and all parity disks, which
> > > > requires the write pattern to be (A, A+chunk_size, A+chunk_size*2,
> > > > ...). If A's size is smaller than chunk_size, such a pattern is almost
> > > > impossible in practice. So this patch only handles the case where A's
> > > > size equals chunk_size; that is, the discard request must be aligned
> > > > to the stripe size and its size must be a multiple of the stripe size.
> > > > 
> > > > Since we can only handle requests with a specific alignment and size
> > > > (or the part of a request that fits whole stripes), we can't guarantee
> > > > zero_after_discard even when zero_after_discard is true in the
> > > > low-level drives.
> > > > 
> > > > The block layer doesn't send down correctly aligned requests even when
> > > > the correct discard alignment is set, so I must filter out the
> > > > unaligned parts.
> > > > 
> > > > For raid4/5/6 parity calculation, if the data is 0, the parity is 0.
> > > > So if zero_after_discard is true for all disks, the data is consistent
> > > > after a discard. Otherwise, data might be lost. Consider a scenario:
> > > > discard a stripe, then write data to one disk and write the parity
> > > > disk. Until then the stripe could still be inconsistent, depending on
> > > > whether data from the other data disks or from the parity disk is used
> > > > to calculate the new parity. If a disk is then broken, we can't
> > > > restore its data. So this patch only enables discard support if all
> > > > disks have zero_after_discard.
> > > > 
> > > > If a discard fails on one disk, we face the same inconsistency issue
> > > > described above. The patch makes discard follow the same path as a
> > > > normal write request: if the discard fails, a resync is scheduled to
> > > > make the data consistent. The extra writes aren't great, but data
> > > > consistency is more important.
> > > > 
> > > > If a subsequent read/write request hits the raid5 cache of a discarded
> > > > stripe, the discarded dev pages should be zero-filled so that the data
> > > > is consistent. This patch always zeroes the dev pages of a discarded
> > > > stripe. This isn't optimal, because a discard request doesn't need
> > > > such a payload; the next patch avoids it.
> > > > 
> > > > Signed-off-by: Shaohua Li <shli@fusionio.com>
> > > > ---
> > > >  drivers/md/raid5.c |  176 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
> > > >  drivers/md/raid5.h |    1 
> > > >  2 files changed, 174 insertions(+), 3 deletions(-)
> > > > 
> > > > Index: linux/drivers/md/raid5.c
> > > > ===================================================================
> > > > --- linux.orig/drivers/md/raid5.c	2012-09-18 16:15:51.219353357 +0800
> > > > +++ linux/drivers/md/raid5.c	2012-09-18 16:15:55.471299904 +0800
> > > > @@ -547,6 +547,8 @@ static void ops_run_io(struct stripe_hea
> > > >  				rw = WRITE_FUA;
> > > >  			else
> > > >  				rw = WRITE;
> > > > +			if (test_and_clear_bit(R5_Discard, &sh->dev[i].flags))
> > > > +				rw |= REQ_DISCARD;
> > > >  		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
> > > >  			rw = READ;
> > > >  		else if (test_and_clear_bit(R5_WantReplace,
> > > > @@ -1170,8 +1172,13 @@ ops_run_biodrain(struct stripe_head *sh,
> > > >  					set_bit(R5_WantFUA, &dev->flags);
> > > >  				if (wbi->bi_rw & REQ_SYNC)
> > > >  					set_bit(R5_SyncIO, &dev->flags);
> > > > -				tx = async_copy_data(1, wbi, dev->page,
> > > > -					dev->sector, tx);
> > > > +				if (wbi->bi_rw & REQ_DISCARD) {
> > > > +					memset(page_address(dev->page), 0,
> > > > +						STRIPE_SECTORS << 9);
> > > > +					set_bit(R5_Discard, &dev->flags);
> > > > +				} else
> > > > +					tx = async_copy_data(1, wbi, dev->page,
> > > > +						dev->sector, tx);
> > > >  				wbi = r5_next_bio(wbi, dev->sector);
> > > >  			}
> > > >  		}
> > > > @@ -1237,6 +1244,20 @@ ops_run_reconstruct5(struct stripe_head
> > > >  	pr_debug("%s: stripe %llu\n", __func__,
> > > >  		(unsigned long long)sh->sector);
> > > >  
> > > > +	for (i = 0; i < sh->disks; i++) {
> > > > +		if (pd_idx == i)
> > > > +			continue;
> > > > +		if (!test_bit(R5_Discard, &sh->dev[i].flags))
> > > > +			break;
> > > > +	}
> > > > +	if (i >= sh->disks) {
> > > > +		atomic_inc(&sh->count);
> > > > +		memset(page_address(sh->dev[pd_idx].page), 0,
> > > > +			STRIPE_SECTORS << 9);
> > > > +		set_bit(R5_Discard, &sh->dev[pd_idx].flags);
> > > > +		ops_complete_reconstruct(sh);
> > > > +		return;
> > > > +	}
> > > >  	/* check if prexor is active which means only process blocks
> > > >  	 * that are part of a read-modify-write (written)
> > > >  	 */
> > > > @@ -1281,10 +1302,28 @@ ops_run_reconstruct6(struct stripe_head
> > > >  {
> > > >  	struct async_submit_ctl submit;
> > > >  	struct page **blocks = percpu->scribble;
> > > > -	int count;
> > > > +	int count, i;
> > > >  
> > > >  	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
> > > >  
> > > > +	for (i = 0; i < sh->disks; i++) {
> > > > +		if (sh->pd_idx == i || sh->qd_idx == i)
> > > > +			continue;
> > > > +		if (!test_bit(R5_Discard, &sh->dev[i].flags))
> > > > +			break;
> > > > +	}
> > > > +	if (i >= sh->disks) {
> > > > +		atomic_inc(&sh->count);
> > > > +		memset(page_address(sh->dev[sh->pd_idx].page), 0,
> > > > +			STRIPE_SECTORS << 9);
> > > > +		memset(page_address(sh->dev[sh->qd_idx].page), 0,
> > > > +			STRIPE_SECTORS << 9);
> > > > +		set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
> > > > +		set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
> > > > +		ops_complete_reconstruct(sh);
> > > > +		return;
> > > > +	}
> > > > +
> > > >  	count = set_syndrome_sources(blocks, sh);
> > > >  
> > > >  	atomic_inc(&sh->count);
> > > > @@ -4067,6 +4106,96 @@ static void release_stripe_plug(struct m
> > > >  		release_stripe(sh);
> > > >  }
> > > >  
> > > > +static void make_discard_request(struct mddev *mddev, struct bio *bi)
> > > > +{
> > > > +	struct r5conf *conf = mddev->private;
> > > > +	sector_t logical_sector, last_sector;
> > > > +	struct stripe_head *sh;
> > > > +	int remaining;
> > > > +	int stripe_sectors;
> > > > +
> > > > +	if (mddev->reshape_position != MaxSector)
> > > > +		/* Skip discard while reshape is happening */
> > > > +		return;
> > > > +
> > > > +	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
> > > > +	last_sector = bi->bi_sector + (bi->bi_size>>9);
> > > > +
> > > > +	bi->bi_next = NULL;
> > > > +	bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
> > > > +
> > > > +	stripe_sectors = conf->chunk_sectors *
> > > > +		(conf->raid_disks - conf->max_degraded);
> > > > +	logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
> > > > +			stripe_sectors);
> > > > +	sector_div(last_sector, stripe_sectors);
> > > > +
> > > > +	logical_sector *= stripe_sectors;
> > > > +	last_sector *= stripe_sectors;
> > > > +
> > > > +	for (;logical_sector < last_sector;
> > > > +					logical_sector += STRIPE_SECTORS) {
> > > > +		DEFINE_WAIT(w);
> > > > +		sector_t new_sector;
> > > > +		int d;
> > > > +
> > > > +		new_sector = raid5_compute_sector(conf, logical_sector,
> > > > +						  0, &d, NULL);
> > > 
> > > This is pointless.  Look at the patch I posted again.  You don't need to call
> > > raid5_compute_sector().  It essentially just divides logical_sector by
> > > stripe_sectors.  It is cleaner not to do the multiplication in the first place.
> > 
> > In my test, without it, the wrong sectors are trimmed.
> 
> Which tells me that the code you tested was wrong.
> However, the code you posted was wrong too.
> 
> Maybe if you post the code you tested, which looked more like mine, and
> explain which wrong sectors were trimmed....

Ah, I missed 'It is cleaner not to do the multiplication in the first place.'
Let me check again.


* Re: [patch 1/2]MD: raid5 trim support
  2012-09-20  1:47     ` NeilBrown
  2012-09-20  2:06       ` Shaohua Li
@ 2012-09-20  2:27       ` Shaohua Li
  2012-09-20  3:59         ` NeilBrown
  1 sibling, 1 reply; 10+ messages in thread
From: Shaohua Li @ 2012-09-20  2:27 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid

On Thu, Sep 20, 2012 at 11:47:40AM +1000, NeilBrown wrote:
> On Thu, 20 Sep 2012 09:36:42 +0800 Shaohua Li <shli@kernel.org> wrote:
> 
> > On Thu, Sep 20, 2012 at 11:15:17AM +1000, NeilBrown wrote:
> > > On Tue, 18 Sep 2012 16:25:11 +0800 Shaohua Li <shli@kernel.org> wrote:
> > > 
> > > > Discard for raid4/5/6 has limitations. If a discard request is small,
> > > > we discard on one disk, but we must still calculate parity and write
> > > > the parity disk. To calculate parity correctly, zero_after_discard
> > > > must be guaranteed. Even when it is true, we do a discard on one disk
> > > > but write the other disks, which makes the parity disks wear out
> > > > quickly. This doesn't make sense. So an efficient discard for
> > > > raid4/5/6 should discard all data disks and all parity disks, which
> > > > requires the write pattern to be (A, A+chunk_size, A+chunk_size*2,
> > > > ...). If A's size is smaller than chunk_size, such a pattern is almost
> > > > impossible in practice. So this patch only handles the case where A's
> > > > size equals chunk_size; that is, the discard request must be aligned
> > > > to the stripe size and its size must be a multiple of the stripe size.
> > > > 
> > > > Since we can only handle requests with a specific alignment and size
> > > > (or the part of a request that fits whole stripes), we can't guarantee
> > > > zero_after_discard even when zero_after_discard is true in the
> > > > low-level drives.
> > > > 
> > > > The block layer doesn't send down correctly aligned requests even when
> > > > the correct discard alignment is set, so I must filter out the
> > > > unaligned parts.
> > > > 
> > > > For raid4/5/6 parity calculation, if the data is 0, the parity is 0.
> > > > So if zero_after_discard is true for all disks, the data is consistent
> > > > after a discard. Otherwise, data might be lost. Consider a scenario:
> > > > discard a stripe, then write data to one disk and write the parity
> > > > disk. Until then the stripe could still be inconsistent, depending on
> > > > whether data from the other data disks or from the parity disk is used
> > > > to calculate the new parity. If a disk is then broken, we can't
> > > > restore its data. So this patch only enables discard support if all
> > > > disks have zero_after_discard.
> > > > 
> > > > If a discard fails on one disk, we face the same inconsistency issue
> > > > described above. The patch makes discard follow the same path as a
> > > > normal write request: if the discard fails, a resync is scheduled to
> > > > make the data consistent. The extra writes aren't great, but data
> > > > consistency is more important.
> > > > 
> > > > If a subsequent read/write request hits the raid5 cache of a discarded
> > > > stripe, the discarded dev pages should be zero-filled so that the data
> > > > is consistent. This patch always zeroes the dev pages of a discarded
> > > > stripe. This isn't optimal, because a discard request doesn't need
> > > > such a payload; the next patch avoids it.
> > > > 
> > > > Signed-off-by: Shaohua Li <shli@fusionio.com>
> > > > ---
> > > >  drivers/md/raid5.c |  176 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
> > > >  drivers/md/raid5.h |    1 
> > > >  2 files changed, 174 insertions(+), 3 deletions(-)
> > > > 
> > > > Index: linux/drivers/md/raid5.c
> > > > ===================================================================
> > > > --- linux.orig/drivers/md/raid5.c	2012-09-18 16:15:51.219353357 +0800
> > > > +++ linux/drivers/md/raid5.c	2012-09-18 16:15:55.471299904 +0800
> > > > @@ -547,6 +547,8 @@ static void ops_run_io(struct stripe_hea
> > > >  				rw = WRITE_FUA;
> > > >  			else
> > > >  				rw = WRITE;
> > > > +			if (test_and_clear_bit(R5_Discard, &sh->dev[i].flags))
> > > > +				rw |= REQ_DISCARD;
> > > >  		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
> > > >  			rw = READ;
> > > >  		else if (test_and_clear_bit(R5_WantReplace,
> > > > @@ -1170,8 +1172,13 @@ ops_run_biodrain(struct stripe_head *sh,
> > > >  					set_bit(R5_WantFUA, &dev->flags);
> > > >  				if (wbi->bi_rw & REQ_SYNC)
> > > >  					set_bit(R5_SyncIO, &dev->flags);
> > > > -				tx = async_copy_data(1, wbi, dev->page,
> > > > -					dev->sector, tx);
> > > > +				if (wbi->bi_rw & REQ_DISCARD) {
> > > > +					memset(page_address(dev->page), 0,
> > > > +						STRIPE_SECTORS << 9);
> > > > +					set_bit(R5_Discard, &dev->flags);
> > > > +				} else
> > > > +					tx = async_copy_data(1, wbi, dev->page,
> > > > +						dev->sector, tx);
> > > >  				wbi = r5_next_bio(wbi, dev->sector);
> > > >  			}
> > > >  		}
> > > > @@ -1237,6 +1244,20 @@ ops_run_reconstruct5(struct stripe_head
> > > >  	pr_debug("%s: stripe %llu\n", __func__,
> > > >  		(unsigned long long)sh->sector);
> > > >  
> > > > +	for (i = 0; i < sh->disks; i++) {
> > > > +		if (pd_idx == i)
> > > > +			continue;
> > > > +		if (!test_bit(R5_Discard, &sh->dev[i].flags))
> > > > +			break;
> > > > +	}
> > > > +	if (i >= sh->disks) {
> > > > +		atomic_inc(&sh->count);
> > > > +		memset(page_address(sh->dev[pd_idx].page), 0,
> > > > +			STRIPE_SECTORS << 9);
> > > > +		set_bit(R5_Discard, &sh->dev[pd_idx].flags);
> > > > +		ops_complete_reconstruct(sh);
> > > > +		return;
> > > > +	}
> > > >  	/* check if prexor is active which means only process blocks
> > > >  	 * that are part of a read-modify-write (written)
> > > >  	 */
> > > > @@ -1281,10 +1302,28 @@ ops_run_reconstruct6(struct stripe_head
> > > >  {
> > > >  	struct async_submit_ctl submit;
> > > >  	struct page **blocks = percpu->scribble;
> > > > -	int count;
> > > > +	int count, i;
> > > >  
> > > >  	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
> > > >  
> > > > +	for (i = 0; i < sh->disks; i++) {
> > > > +		if (sh->pd_idx == i || sh->qd_idx == i)
> > > > +			continue;
> > > > +		if (!test_bit(R5_Discard, &sh->dev[i].flags))
> > > > +			break;
> > > > +	}
> > > > +	if (i >= sh->disks) {
> > > > +		atomic_inc(&sh->count);
> > > > +		memset(page_address(sh->dev[sh->pd_idx].page), 0,
> > > > +			STRIPE_SECTORS << 9);
> > > > +		memset(page_address(sh->dev[sh->qd_idx].page), 0,
> > > > +			STRIPE_SECTORS << 9);
> > > > +		set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
> > > > +		set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
> > > > +		ops_complete_reconstruct(sh);
> > > > +		return;
> > > > +	}
> > > > +
> > > >  	count = set_syndrome_sources(blocks, sh);
> > > >  
> > > >  	atomic_inc(&sh->count);
> > > > @@ -4067,6 +4106,96 @@ static void release_stripe_plug(struct m
> > > >  		release_stripe(sh);
> > > >  }
> > > >  
> > > > +static void make_discard_request(struct mddev *mddev, struct bio *bi)
> > > > +{
> > > > +	struct r5conf *conf = mddev->private;
> > > > +	sector_t logical_sector, last_sector;
> > > > +	struct stripe_head *sh;
> > > > +	int remaining;
> > > > +	int stripe_sectors;
> > > > +
> > > > +	if (mddev->reshape_position != MaxSector)
> > > > +		/* Skip discard while reshape is happening */
> > > > +		return;
> > > > +
> > > > +	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
> > > > +	last_sector = bi->bi_sector + (bi->bi_size>>9);
> > > > +
> > > > +	bi->bi_next = NULL;
> > > > +	bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
> > > > +
> > > > +	stripe_sectors = conf->chunk_sectors *
> > > > +		(conf->raid_disks - conf->max_degraded);
> > > > +	logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
> > > > +			stripe_sectors);
> > > > +	sector_div(last_sector, stripe_sectors);
> > > > +
> > > > +	logical_sector *= stripe_sectors;
> > > > +	last_sector *= stripe_sectors;
> > > > +
> > > > +	for (;logical_sector < last_sector;
> > > > +					logical_sector += STRIPE_SECTORS) {
> > > > +		DEFINE_WAIT(w);
> > > > +		sector_t new_sector;
> > > > +		int d;
> > > > +
> > > > +		new_sector = raid5_compute_sector(conf, logical_sector,
> > > > +						  0, &d, NULL);
> > > 
> > > This is pointless.  Look at the patch I posted again.  You don't need to call
> > > raid5_compute_sector().  It essentially just divides logical_sector by
> > > stripe_sectors.  It is cleaner not to do the multiplication in the first place.
> > 
> > In my test, without it, the wrong sectors are trimmed.
> 
> Which tells me that the code you tested was wrong.
> However, the code you posted was wrong too.
> 
> Maybe if you post the code you tested, which looked more like mine, and
> explain which wrong sectors were trimmed....

OK, just confirmed: deleting raid5_compute_sector() is fine if I adjust the
logical_sector calculation. Here is the new patch.
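
For concreteness, a worked example of why the unadjusted version trims the
wrong sectors (assumed geometry: raid5 over 3 disks, so 2 data disks, with
chunk_sectors = 8 and hence stripe_sectors = 16): array sector 16 lives in
stripe #1, whose stripe head sits at per-device sector 8 (stripe number times
chunk_sectors). get_active_stripe() expects that per-device sector, so
passing the array-wide value 16 instead grabs the stripe head at device
sector 16, the head of array stripe #2, and the wrong stripe is discarded.
Keeping the rounded stripe numbers in per-device units (multiplying by
chunk_sectors rather than stripe_sectors) is what makes the
raid5_compute_sector() call unnecessary.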


Subject: MD: raid5 trim support

Discard for raid4/5/6 has a limitation. If the discard request size is small,
we do a discard for one disk, but we still need to calculate parity and write
the parity disk. To calculate parity correctly, zero_after_discard must be
guaranteed. Even if it is true, we would do a discard for one disk but write
the other disks, which makes the parity disks wear out fast. This doesn't
make sense. So an efficient discard for raid4/5/6 should discard all data
disks and parity disks, which requires the write pattern to be (A,
A+chunk_size, A+chunk_size*2...). If A's size is smaller than chunk_size,
such a pattern is almost impossible in practice. So in this patch, I only
handle the case where A's size equals chunk_size. That is, the discard
request should be aligned to the stripe size and its size should be a
multiple of the stripe size.
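
A rough userspace sketch of that alignment rule, using made-up geometry and
request values; the patch below does the equivalent rounding with
DIV_ROUND_UP_SECTOR_T and sector_div:

#include <stdio.h>

int main(void)
{
	/* Example geometry: raid5 on 4 disks with 64KB chunks. */
	unsigned long long chunk_sectors = 128;
	unsigned long long data_disks = 3;
	unsigned long long stripe_sectors = chunk_sectors * data_disks;

	unsigned long long start = 500;		/* example request start */
	unsigned long long end = 2500;		/* example request end */

	/* clip inward: round start up and end down to stripe boundaries,
	 * so only whole stripes are discarded */
	unsigned long long first = (start + stripe_sectors - 1) /
				   stripe_sectors * stripe_sectors;
	unsigned long long last = end / stripe_sectors * stripe_sectors;

	if (first < last)	/* prints: discard sectors [768, 2304) */
		printf("discard sectors [%llu, %llu)\n", first, last);
	else
		printf("nothing stripe-aligned to discard\n");
	return 0;
}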

Since we can only handle requests with a specific alignment and size (or the
part of a request that fits whole stripes), we can't guarantee
zero_after_discard even if zero_after_discard is true in the low-level
drives.

The block layer doesn't send down correctly aligned requests even when the
correct discard alignment is set, so I must filter out the misaligned parts.

For raid4/5/6 parity calculation, if the data is 0, the parity is 0. So if
zero_after_discard is true for all disks, the data is consistent after a
discard. Otherwise, data might be lost. Consider a scenario: discard a
stripe, then write data to one disk and write the parity disk. The stripe
could still be inconsistent at that point, depending on whether data from the
other data disks or from the parity disk is used to calculate the new parity.
If the disk is broken, we can't restore it. So in this patch, we only enable
discard support if all disks have zero_after_discard.
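
The zero-parity property is easy to check for the raid5 case, since the
parity block is just the XOR of the data blocks. A trivial standalone
demonstration, independent of the patch:

#include <stdio.h>

int main(void)
{
	/* Three all-zero data blocks, as left by a discard with
	 * zero_after_discard. */
	unsigned char d0[16] = {0}, d1[16] = {0}, d2[16] = {0};
	int i, nonzero = 0;

	for (i = 0; i < 16; i++)
		nonzero |= d0[i] ^ d1[i] ^ d2[i];	/* raid5 parity byte */

	/* prints: parity of all-zero data is zero */
	printf("parity of all-zero data is %szero\n", nonzero ? "non-" : "");
	return 0;
}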

If a discard fails on one disk, we face the same inconsistency issue as
above. The patch makes a discard follow the same path as a normal write
request. If a discard fails, a resync will be scheduled to make the data
consistent. The extra writes aren't ideal, but data consistency is important.

If a subsequent read/write request hits the raid5 cache of a discarded
stripe, the discarded dev page should be zero filled, so the data is
consistent. This patch always zeroes the dev page for a discarded stripe.
This isn't optimal because a discard request doesn't need such a payload. The
next patch will avoid it.

Signed-off-by: Shaohua Li <shli@fusionio.com>
---
 drivers/md/raid5.c |  168 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 drivers/md/raid5.h |    1 
 2 files changed, 166 insertions(+), 3 deletions(-)

Index: linux/drivers/md/raid5.c
===================================================================
--- linux.orig/drivers/md/raid5.c	2012-09-18 16:15:51.219353357 +0800
+++ linux/drivers/md/raid5.c	2012-09-20 10:22:35.551887189 +0800
@@ -547,6 +547,8 @@ static void ops_run_io(struct stripe_hea
 				rw = WRITE_FUA;
 			else
 				rw = WRITE;
+			if (test_and_clear_bit(R5_Discard, &sh->dev[i].flags))
+				rw |= REQ_DISCARD;
 		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
 			rw = READ;
 		else if (test_and_clear_bit(R5_WantReplace,
@@ -1170,8 +1172,13 @@ ops_run_biodrain(struct stripe_head *sh,
 					set_bit(R5_WantFUA, &dev->flags);
 				if (wbi->bi_rw & REQ_SYNC)
 					set_bit(R5_SyncIO, &dev->flags);
-				tx = async_copy_data(1, wbi, dev->page,
-					dev->sector, tx);
+				if (wbi->bi_rw & REQ_DISCARD) {
+					memset(page_address(dev->page), 0,
+						STRIPE_SECTORS << 9);
+					set_bit(R5_Discard, &dev->flags);
+				} else
+					tx = async_copy_data(1, wbi, dev->page,
+						dev->sector, tx);
 				wbi = r5_next_bio(wbi, dev->sector);
 			}
 		}
@@ -1237,6 +1244,20 @@ ops_run_reconstruct5(struct stripe_head
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
+	for (i = 0; i < sh->disks; i++) {
+		if (pd_idx == i)
+			continue;
+		if (!test_bit(R5_Discard, &sh->dev[i].flags))
+			break;
+	}
+	if (i >= sh->disks) {
+		atomic_inc(&sh->count);
+		memset(page_address(sh->dev[pd_idx].page), 0,
+			STRIPE_SECTORS << 9);
+		set_bit(R5_Discard, &sh->dev[pd_idx].flags);
+		ops_complete_reconstruct(sh);
+		return;
+	}
 	/* check if prexor is active which means only process blocks
 	 * that are part of a read-modify-write (written)
 	 */
@@ -1281,10 +1302,28 @@ ops_run_reconstruct6(struct stripe_head
 {
 	struct async_submit_ctl submit;
 	struct page **blocks = percpu->scribble;
-	int count;
+	int count, i;
 
 	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
 
+	for (i = 0; i < sh->disks; i++) {
+		if (sh->pd_idx == i || sh->qd_idx == i)
+			continue;
+		if (!test_bit(R5_Discard, &sh->dev[i].flags))
+			break;
+	}
+	if (i >= sh->disks) {
+		atomic_inc(&sh->count);
+		memset(page_address(sh->dev[sh->pd_idx].page), 0,
+			STRIPE_SECTORS << 9);
+		memset(page_address(sh->dev[sh->qd_idx].page), 0,
+			STRIPE_SECTORS << 9);
+		set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
+		set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
+		ops_complete_reconstruct(sh);
+		return;
+	}
+
 	count = set_syndrome_sources(blocks, sh);
 
 	atomic_inc(&sh->count);
@@ -4067,6 +4106,88 @@ static void release_stripe_plug(struct m
 		release_stripe(sh);
 }
 
+static void make_discard_request(struct mddev *mddev, struct bio *bi)
+{
+	struct r5conf *conf = mddev->private;
+	sector_t logical_sector, last_sector;
+	struct stripe_head *sh;
+	int remaining;
+	int stripe_sectors;
+
+	if (mddev->reshape_position != MaxSector)
+		/* Skip discard while reshape is happening */
+		return;
+
+	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+	last_sector = bi->bi_sector + (bi->bi_size>>9);
+
+	bi->bi_next = NULL;
+	bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
+
+	stripe_sectors = conf->chunk_sectors *
+		(conf->raid_disks - conf->max_degraded);
+	logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
+			stripe_sectors);
+	sector_div(last_sector, stripe_sectors);
+
+	logical_sector *= conf->chunk_sectors;
+	last_sector *= conf->chunk_sectors;
+
+	for (;logical_sector < last_sector;
+					logical_sector += STRIPE_SECTORS) {
+		DEFINE_WAIT(w);
+		int d;
+
+again:
+		sh = get_active_stripe(conf, logical_sector, 0, 0, 0);
+		prepare_to_wait(&conf->wait_for_overlap, &w,
+						TASK_UNINTERRUPTIBLE);
+		spin_lock_irq(&sh->stripe_lock);
+		for (d = 0; d < conf->raid_disks; d++) {
+			if (d == sh->pd_idx || d == sh->qd_idx)
+				continue;
+			if (sh->dev[d].towrite || sh->dev[d].toread) {
+				set_bit(R5_Overlap, &sh->dev[d].flags);
+				spin_unlock_irq(&sh->stripe_lock);
+				release_stripe(sh);
+				schedule();
+				goto again;
+			}
+		}
+		finish_wait(&conf->wait_for_overlap, &w);
+		for (d = 0; d < conf->raid_disks; d++) {
+			if (d == sh->pd_idx || d == sh->qd_idx)
+				continue;
+			sh->dev[d].towrite = bi;
+			set_bit(R5_OVERWRITE, &sh->dev[d].flags);
+			raid5_inc_bi_active_stripes(bi);
+		}
+		spin_unlock_irq(&sh->stripe_lock);
+		if (conf->mddev->bitmap) {
+			for (d = 0; d < conf->raid_disks - conf->max_degraded;
+									d++)
+				bitmap_startwrite(mddev->bitmap,
+						sh->sector,
+						STRIPE_SECTORS,
+						0);
+			sh->bm_seq = conf->seq_flush + 1;
+			set_bit(STRIPE_BIT_DELAY, &sh->state);
+		}
+
+		set_bit(STRIPE_HANDLE, &sh->state);
+		clear_bit(STRIPE_DELAYED, &sh->state);
+		if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+			atomic_inc(&conf->preread_active_stripes);
+		release_stripe_plug(mddev, sh);
+	}
+
+	remaining = raid5_dec_bi_active_stripes(bi);
+	if (remaining == 0) {
+		md_write_end(mddev);
+		bio_endio(bi, 0);
+	}
+}
+
 static void make_request(struct mddev *mddev, struct bio * bi)
 {
 	struct r5conf *conf = mddev->private;
@@ -4089,6 +4210,11 @@ static void make_request(struct mddev *m
 	     chunk_aligned_read(mddev,bi))
 		return;
 
+	if (unlikely(bi->bi_rw & REQ_DISCARD)) {
+		make_discard_request(mddev, bi);
+		return;
+	}
+
 	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
 	last_sector = bi->bi_sector + (bi->bi_size>>9);
 	bi->bi_next = NULL;
@@ -5361,6 +5487,7 @@ static int run(struct mddev *mddev)
 
 	if (mddev->queue) {
 		int chunk_size;
+		bool discard_supported = true;
 		/* read-ahead size must cover two whole stripes, which
 		 * is 2 * (datadisks) * chunksize where 'n' is the
 		 * number of raid devices
@@ -5380,13 +5507,48 @@ static int run(struct mddev *mddev)
 		blk_queue_io_min(mddev->queue, chunk_size);
 		blk_queue_io_opt(mddev->queue, chunk_size *
 				 (conf->raid_disks - conf->max_degraded));
+		/*
+		 * We can only discard a whole stripe. It doesn't make sense to
+		 * discard a data disk but write the parity disk
+		 */
+		stripe = stripe * PAGE_SIZE;
+		mddev->queue->limits.discard_alignment = stripe;
+		mddev->queue->limits.discard_granularity = stripe;
+		/*
+		 * the unaligned part of a discard request will be ignored,
+		 * so we can't guarantee discard_zeroes_data
+		 */
+		mddev->queue->limits.discard_zeroes_data = 0;
 
 		rdev_for_each(rdev, mddev) {
 			disk_stack_limits(mddev->gendisk, rdev->bdev,
 					  rdev->data_offset << 9);
 			disk_stack_limits(mddev->gendisk, rdev->bdev,
 					  rdev->new_data_offset << 9);
+			/*
+			 * discard_zeroes_data is required, otherwise data
+			 * could be lost. Consider a scenario: discard a stripe
+			 * (the stripe could be inconsistent if
+			 * discard_zeroes_data is 0); write one disk of the
+			 * stripe (the stripe could be inconsistent again
+			 * depending on which disks are used to calculate
+			 * parity); the disk is broken; the stripe data of this
+			 * disk is lost.
+			 */
+			if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) ||
+			    !bdev_get_queue(rdev->bdev)->
+						limits.discard_zeroes_data)
+				discard_supported = false;
 		}
+
+		if (discard_supported &&
+		   mddev->queue->limits.max_discard_sectors >= stripe &&
+		   mddev->queue->limits.discard_granularity >= stripe)
+			queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
+						mddev->queue);
+		else
+			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
+						mddev->queue);
 	}
 
 	return 0;
Index: linux/drivers/md/raid5.h
===================================================================
--- linux.orig/drivers/md/raid5.h	2012-09-18 16:15:51.235353157 +0800
+++ linux/drivers/md/raid5.h	2012-09-20 10:21:24.072786455 +0800
@@ -298,6 +298,7 @@ enum r5dev_flags {
 	R5_WantReplace, /* We need to update the replacement, we have read
 			 * data in, and now is a good time to write it out.
 			 */
+	R5_Discard,	/* Discard the stripe */
 };
 
 /*
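
As a quick way to observe the limits set up in run() above, they can be read
back through sysfs. A minimal sketch; the array name md0 is an assumption,
and missing attributes are reported as 0:

#include <stdio.h>

/* Read one numeric block-queue attribute of md0, or 0 on any error. */
static unsigned long long read_limit(const char *attr)
{
	char path[128];
	unsigned long long v = 0;
	FILE *f;

	snprintf(path, sizeof(path), "/sys/block/md0/queue/%s", attr);
	f = fopen(path, "r");
	if (!f)
		return 0;
	if (fscanf(f, "%llu", &v) != 1)
		v = 0;
	fclose(f);
	return v;
}

int main(void)
{
	printf("discard_granularity: %llu\n", read_limit("discard_granularity"));
	printf("discard_max_bytes:   %llu\n", read_limit("discard_max_bytes"));
	printf("discard_zeroes_data: %llu\n", read_limit("discard_zeroes_data"));
	return 0;
}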


* Re: [patch 1/2]MD: raid5 trim support
  2012-09-20  2:27       ` Shaohua Li
@ 2012-09-20  3:59         ` NeilBrown
  2012-09-20  4:25           ` Shaohua Li
  0 siblings, 1 reply; 10+ messages in thread
From: NeilBrown @ 2012-09-20  3:59 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid

On Thu, 20 Sep 2012 10:27:17 +0800 Shaohua Li <shli@kernel.org> wrote:

> > ... explain which wrong sectors were trimmed....
> 
> Ok, just confirmed: deleting raid5_compute_sector is fine if I adjust
> the logical_sector calculation. Here is the new patch.
> 

Thanks.  That looks better.  I've applied it with some minor formatting
changes.

I then went to look at the follow-up patch and .....
I count 11 separate places where you test the new flag and possibly memset a
page to zero.  This doesn't seem like an improvement to me.

Why don't we just mark the page as not up-to-date when we discard it?  That
would avoid storing inconsistent data, and would avoid needing to zero pages.

NeilBrown


* Re: [patch 1/2]MD: raid5 trim support
  2012-09-20  3:59         ` NeilBrown
@ 2012-09-20  4:25           ` Shaohua Li
  2012-09-20 10:31             ` Shaohua Li
  0 siblings, 1 reply; 10+ messages in thread
From: Shaohua Li @ 2012-09-20  4:25 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid

On Thu, Sep 20, 2012 at 01:59:14PM +1000, NeilBrown wrote:
> On Thu, 20 Sep 2012 10:27:17 +0800 Shaohua Li <shli@kernel.org> wrote:
> 
> > > ... explain which wrong sectors were trimmed....
> > 
> > Ok, just confirmed: deleting raid5_compute_sector is fine if I adjust
> > the logical_sector calculation. Here is the new patch.
> > 
> 
> Thanks.  That looks better.  I've applied it with some minor formatting
> changes.
> 
> I then went to look at the follow-up patch and .....
> I count 11 separate places where you test the new flag and possibly memset a
> page to zero.  This doesn't seem like an improvement to me.

We only zero the page right before the stripe is hit in the cache, which is a
rare case.
 
> Why don't we just mark the page as not up-to-date when we discard it?  That
> would avoid storing inconsistent data, and would avoid needing to zero pages.

We'd need to re-read the stripe if it's hit in the cache, but that's a rare
case, so we don't care. But when would we clear the up-to-date flag? I see a
lot of places checking the up-to-date flag in the write path; it needs a
close look to check whether there is a race.

Thanks,
Shaohua


* Re: [patch 1/2]MD: raid5 trim support
  2012-09-20  4:25           ` Shaohua Li
@ 2012-09-20 10:31             ` Shaohua Li
  2012-09-25  7:00               ` NeilBrown
  0 siblings, 1 reply; 10+ messages in thread
From: Shaohua Li @ 2012-09-20 10:31 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid

On Thu, Sep 20, 2012 at 12:25:41PM +0800, Shaohua Li wrote:
> On Thu, Sep 20, 2012 at 01:59:14PM +1000, NeilBrown wrote:
> > On Thu, 20 Sep 2012 10:27:17 +0800 Shaohua Li <shli@kernel.org> wrote:
> > 
> > > > ... explain which wrong sectors were trimmed....
> > > 
> > > Ok, just confirmed: deleting raid5_compute_sector is fine if I adjust
> > > the logical_sector calculation. Here is the new patch.
> > > 
> > 
> > Thanks.  That looks better.  I've applied it with some minor formatting
> > changes.
> > 
> > I then went to look at the follow-up patch and .....
> > I count 11 separate places where you test the new flag and possibly memset a
> > page to zero.  This doesn't seem like an improvement to me.
> 
> We only zero the page right before the stripe is hit in the cache, which is
> a rare case.
>  
> > Why don't we just mark the page as not up-to-date when we discard it?  That
> > would avoid storing inconsistent data, and would avoid needing to zero pages.
> 
> We'd need to re-read the stripe if it's hit in the cache, but that's a rare
> case, so we don't care. But when would we clear the up-to-date flag? I see
> a lot of places checking the up-to-date flag in the write path; it needs a
> close look to check whether there is a race.

Alright, it appears OK to use the uptodate flag. Here is the new patch.


Subject: MD: raid5 avoid unnecessary zero page for trim

We want to avoid zeroing the discarded dev page, because the zeroing is
useless for a discard. But if we don't zero it, another read/write that hits
such a page in the cache will get inconsistent data.

To avoid zeroing the page, we don't set the R5_UPTODATE flag after
reconstruction is done. In this way, the discard write request is still
issued and finished, but a read will not hit the page. If the stripe is
accessed soon after, we need to reread the stripe, but since the chance of
that is low, the reread isn't a big deal.

Signed-off-by: Shaohua Li <shli@fusionio.com>
---
 drivers/md/raid5.c |   35 +++++++++++++++++------------------
 1 file changed, 17 insertions(+), 18 deletions(-)

Index: linux/drivers/md/raid5.c
===================================================================
--- linux.orig/drivers/md/raid5.c	2012-09-20 18:10:38.546836309 +0800
+++ linux/drivers/md/raid5.c	2012-09-20 18:17:50.849399945 +0800
@@ -547,7 +547,7 @@ static void ops_run_io(struct stripe_hea
 				rw = WRITE_FUA;
 			else
 				rw = WRITE;
-			if (test_and_clear_bit(R5_Discard, &sh->dev[i].flags))
+			if (test_bit(R5_Discard, &sh->dev[i].flags))
 				rw |= REQ_DISCARD;
 		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
 			rw = READ;
@@ -1172,11 +1172,9 @@ ops_run_biodrain(struct stripe_head *sh,
 					set_bit(R5_WantFUA, &dev->flags);
 				if (wbi->bi_rw & REQ_SYNC)
 					set_bit(R5_SyncIO, &dev->flags);
-				if (wbi->bi_rw & REQ_DISCARD) {
-					memset(page_address(dev->page), 0,
-						STRIPE_SECTORS << 9);
+				if (wbi->bi_rw & REQ_DISCARD)
 					set_bit(R5_Discard, &dev->flags);
-				} else
+				else
 					tx = async_copy_data(1, wbi, dev->page,
 						dev->sector, tx);
 				wbi = r5_next_bio(wbi, dev->sector);
@@ -1194,7 +1192,7 @@ static void ops_complete_reconstruct(voi
 	int pd_idx = sh->pd_idx;
 	int qd_idx = sh->qd_idx;
 	int i;
-	bool fua = false, sync = false;
+	bool fua = false, sync = false, discard = false;
 
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
@@ -1202,13 +1200,15 @@ static void ops_complete_reconstruct(voi
 	for (i = disks; i--; ) {
 		fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
 		sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
+		discard |= test_bit(R5_Discard, &sh->dev[i].flags);
 	}
 
 	for (i = disks; i--; ) {
 		struct r5dev *dev = &sh->dev[i];
 
 		if (dev->written || i == pd_idx || i == qd_idx) {
-			set_bit(R5_UPTODATE, &dev->flags);
+			if (!discard)
+				set_bit(R5_UPTODATE, &dev->flags);
 			if (fua)
 				set_bit(R5_WantFUA, &dev->flags);
 			if (sync)
@@ -1252,8 +1252,6 @@ ops_run_reconstruct5(struct stripe_head
 	}
 	if (i >= sh->disks) {
 		atomic_inc(&sh->count);
-		memset(page_address(sh->dev[pd_idx].page), 0,
-			STRIPE_SECTORS << 9);
 		set_bit(R5_Discard, &sh->dev[pd_idx].flags);
 		ops_complete_reconstruct(sh);
 		return;
@@ -1314,10 +1312,6 @@ ops_run_reconstruct6(struct stripe_head
 	}
 	if (i >= sh->disks) {
 		atomic_inc(&sh->count);
-		memset(page_address(sh->dev[sh->pd_idx].page), 0,
-			STRIPE_SECTORS << 9);
-		memset(page_address(sh->dev[sh->qd_idx].page), 0,
-			STRIPE_SECTORS << 9);
 		set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
 		set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
 		ops_complete_reconstruct(sh);
@@ -2775,7 +2769,8 @@ static void handle_stripe_clean_event(st
 		if (sh->dev[i].written) {
 			dev = &sh->dev[i];
 			if (!test_bit(R5_LOCKED, &dev->flags) &&
-				test_bit(R5_UPTODATE, &dev->flags)) {
+			    (test_bit(R5_UPTODATE, &dev->flags) ||
+			     test_and_clear_bit(R5_Discard, &dev->flags))) {
 				/* We can return any write requests */
 				struct bio *wbi, *wbi2;
 				pr_debug("Return write for disc %d\n", i);
@@ -3493,10 +3488,12 @@ static void handle_stripe(struct stripe_
 	if (s.written &&
 	    (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
 			     && !test_bit(R5_LOCKED, &pdev->flags)
-			     && test_bit(R5_UPTODATE, &pdev->flags)))) &&
+			     && (test_bit(R5_UPTODATE, &pdev->flags) ||
+				 test_bit(R5_Discard, &pdev->flags))))) &&
 	    (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
 			     && !test_bit(R5_LOCKED, &qdev->flags)
-			     && test_bit(R5_UPTODATE, &qdev->flags)))))
+			     && (test_bit(R5_UPTODATE, &qdev->flags) ||
+				 test_bit(R5_Discard, &qdev->flags))))))
 		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
 
 	/* Now we might consider reading some blocks, either to check/generate
@@ -3523,9 +3520,11 @@ static void handle_stripe(struct stripe_
 		/* All the 'written' buffers and the parity block are ready to
 		 * be written back to disk
 		 */
-		BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
+		BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) &&
+		       !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags));
 		BUG_ON(sh->qd_idx >= 0 &&
-		       !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags));
+		       !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
+		       !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
 			if (test_bit(R5_LOCKED, &dev->flags) &&
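
To summarize the effect of this patch: a discarded block ends up with
R5_Discard set but R5_UPTODATE clear, so the write can still complete while
cache reads are forced back to disk. A condensed model with simplified types
standing in for the real stripe bookkeeping, not kernel code:

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-ins for the R5_* bits used above. */
struct dev_model {
	bool uptodate;	/* R5_UPTODATE: cached page holds valid data */
	bool discard;	/* R5_Discard: the block was discarded */
};

/* A read may be served from the cached page only if it is up to date. */
static bool read_hits_cache(const struct dev_model *d)
{
	return d->uptodate;
}

/* The written block can be returned if either bit is set, mirroring the
 * handle_stripe_clean_event() test in the patch. */
static bool write_can_complete(const struct dev_model *d)
{
	return d->uptodate || d->discard;
}

int main(void)
{
	struct dev_model discarded = { .uptodate = false, .discard = true };

	printf("read hits cache:    %d\n", read_hits_cache(&discarded));	/* 0 */
	printf("write can complete: %d\n", write_can_complete(&discarded));	/* 1 */
	return 0;
}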


* Re: [patch 1/2]MD: raid5 trim support
  2012-09-20 10:31             ` Shaohua Li
@ 2012-09-25  7:00               ` NeilBrown
  0 siblings, 0 replies; 10+ messages in thread
From: NeilBrown @ 2012-09-25  7:00 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid

On Thu, 20 Sep 2012 18:31:38 +0800 Shaohua Li <shli@kernel.org> wrote:

> Alright, it appears OK to use the uptodate flag. Here is the new patch.
> [...]

Thanks.  It didn't turn out quite as clean as I hoped, but I suspect it is
the best we will get.

applied, thanks.
NeilBrown


end of thread

Thread overview: 10+ messages
2012-09-18  8:25 [patch 1/2]MD: raid5 trim support Shaohua Li
2012-09-20  1:15 ` NeilBrown
2012-09-20  1:36   ` Shaohua Li
2012-09-20  1:47     ` NeilBrown
2012-09-20  2:06       ` Shaohua Li
2012-09-20  2:27       ` Shaohua Li
2012-09-20  3:59         ` NeilBrown
2012-09-20  4:25           ` Shaohua Li
2012-09-20 10:31             ` Shaohua Li
2012-09-25  7:00               ` NeilBrown
