[md PATCH 14/23] md/raid1: handle merge_bvec_fn in member devices.

linux-raid.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

From: NeilBrown <neilb@suse.de>
To: linux-raid@vger.kernel.org
Subject: [md PATCH 14/23] md/raid1: handle merge_bvec_fn in member devices.
Date: Wed, 14 Mar 2012 15:40:40 +1100	[thread overview]
Message-ID: <20120314044040.7978.27509.stgit@notabene.brown> (raw)
In-Reply-To: <20120314043555.7978.75486.stgit@notabene.brown>

Currently we don't honour merge_bvec_fn in member devices so if there
is one, we force all requests to be single-page at most.
This is not ideal.

So create a raid1 merge_bvec_fn to check that function in children
as well.

This introduces a small problem.  There is no locking around calls
to ->merge_bvec_fn and subsequent calls to ->make_request.  So a
device added between these could end up getting a request which
violates its merge_bvec_fn.

Currently the best we can do is synchronize_sched().  This will work
providing no preemption happens.  If there is preemption, we just
have to hope that new devices are largely consistent with old devices.

Signed-off-by: NeilBrown <neilb@suse.de>
---

 drivers/md/raid1.c |   77 ++++++++++++++++++++++++++++++++++++++--------------
 1 files changed, 56 insertions(+), 21 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index c0d3ffb..fa4d840 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -523,6 +523,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 		rdev = rcu_dereference(conf->mirrors[disk].rdev);
 		if (r1_bio->bios[disk] == IO_BLOCKED
 		    || rdev == NULL
+		    || test_bit(Unmerged, &rdev->flags)
 		    || test_bit(Faulty, &rdev->flags))
 			continue;
 		if (!test_bit(In_sync, &rdev->flags) &&
@@ -614,6 +615,39 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 	return best_disk;
 }
 
+static int raid1_mergeable_bvec(struct request_queue *q,
+				struct bvec_merge_data *bvm,
+				struct bio_vec *biovec)
+{
+	struct mddev *mddev = q->queuedata;
+	struct r1conf *conf = mddev->private;
+	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
+	int max = biovec->bv_len;
+
+	if (mddev->merge_check_needed) {
+		int disk;
+		rcu_read_lock();
+		for (disk = 0; disk < conf->raid_disks * 2; disk++) {
+			struct md_rdev *rdev = rcu_dereference(
+				conf->mirrors[disk].rdev);
+			if (rdev && !test_bit(Faulty, &rdev->flags)) {
+				struct request_queue *q =
+					bdev_get_queue(rdev->bdev);
+				if (q->merge_bvec_fn) {
+					bvm->bi_sector = sector +
+						rdev->data_offset;
+					bvm->bi_bdev = rdev->bdev;
+					max = min(max, q->merge_bvec_fn(
+							  q, bvm, biovec));
+				}
+			}
+		}
+		rcu_read_unlock();
+	}
+	return max;
+
+}
+
 int md_raid1_congested(struct mddev *mddev, int bits)
 {
 	struct r1conf *conf = mddev->private;
@@ -1015,7 +1049,8 @@ read_again:
 			break;
 		}
 		r1_bio->bios[i] = NULL;
-		if (!rdev || test_bit(Faulty, &rdev->flags)) {
+		if (!rdev || test_bit(Faulty, &rdev->flags)
+		    || test_bit(Unmerged, &rdev->flags)) {
 			if (i < conf->raid_disks)
 				set_bit(R1BIO_Degraded, &r1_bio->state);
 			continue;
@@ -1336,6 +1371,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	struct mirror_info *p;
 	int first = 0;
 	int last = conf->raid_disks - 1;
+	struct request_queue *q = bdev_get_queue(rdev->bdev);
 
 	if (mddev->recovery_disabled == conf->recovery_disabled)
 		return -EBUSY;
@@ -1343,23 +1379,17 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	if (rdev->raid_disk >= 0)
 		first = last = rdev->raid_disk;
 
+	if (q->merge_bvec_fn) {
+		set_bit(Unmerged, &rdev->flags);
+		mddev->merge_check_needed = 1;
+	}
+
 	for (mirror = first; mirror <= last; mirror++) {
 		p = conf->mirrors+mirror;
 		if (!p->rdev) {
 
 			disk_stack_limits(mddev->gendisk, rdev->bdev,
 					  rdev->data_offset << 9);
-			/* as we don't honour merge_bvec_fn, we must
-			 * never risk violating it, so limit
-			 * ->max_segments to one lying with a single
-			 * page, as a one page request is never in
-			 * violation.
-			 */
-			if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
-				blk_queue_max_segments(mddev->queue, 1);
-				blk_queue_segment_boundary(mddev->queue,
-							   PAGE_CACHE_SIZE - 1);
-			}
 
 			p->head_position = 0;
 			rdev->raid_disk = mirror;
@@ -1384,6 +1414,19 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 			break;
 		}
 	}
+	if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
+		/* Some requests might not have seen this new
+		 * merge_bvec_fn.  We must wait for them to complete
+		 * before merging the device fully.
+		 * First we make sure any code which has tested
+		 * our function has submitted the request, then
+		 * we wait for all outstanding requests to complete.
+		 */
+		synchronize_sched();
+		raise_barrier(conf);
+		lower_barrier(conf);
+		clear_bit(Unmerged, &rdev->flags);
+	}
 	md_integrity_add_rdev(rdev, mddev);
 	print_conf(conf);
 	return err;
@@ -2628,15 +2671,6 @@ static int run(struct mddev *mddev)
 			continue;
 		disk_stack_limits(mddev->gendisk, rdev->bdev,
 				  rdev->data_offset << 9);
-		/* as we don't honour merge_bvec_fn, we must never risk
-		 * violating it, so limit ->max_segments to 1 lying within
-		 * a single page, as a one page request is never in violation.
-		 */
-		if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
-			blk_queue_max_segments(mddev->queue, 1);
-			blk_queue_segment_boundary(mddev->queue,
-						   PAGE_CACHE_SIZE - 1);
-		}
 	}
 
 	mddev->degraded = 0;
@@ -2670,6 +2704,7 @@ static int run(struct mddev *mddev)
 	if (mddev->queue) {
 		mddev->queue->backing_dev_info.congested_fn = raid1_congested;
 		mddev->queue->backing_dev_info.congested_data = mddev;
+		blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec);
 	}
 	return md_integrity_register(mddev);
 }

next prev parent reply	other threads:[~2012-03-14  4:40 UTC|newest]

Thread overview: 42+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-03-14  4:40 [md PATCH 00/23] md patches heading for 3.4 NeilBrown
2012-03-14  4:40 ` [md PATCH 05/23] md/raid5: use atomic_dec_return() instead of atomic_dec() and atomic_read() NeilBrown
2012-03-14  4:40 ` [md PATCH 02/23] md/raid10: remove unnecessary smp_mb() from end_sync_write NeilBrown
2012-03-14  4:40 ` [md PATCH 04/23] md: Use existed macros instead of numbers NeilBrown
2012-03-14  4:40 ` [md PATCH 03/23] md/raid5: removed unused 'added_devices' variable NeilBrown
2012-03-14  4:40 ` [md PATCH 06/23] md: allow last device to be forcibly removed from RAID1/RAID10 NeilBrown
2012-03-14  4:40 ` [md PATCH 01/23] md/raid5: make sure reshape_position is cleared on error path NeilBrown
2012-03-14  4:40 ` [md PATCH 10/23] md/raid1, raid10: avoid deadlock during resync/recovery NeilBrown
2012-03-14  4:40 ` [md PATCH 11/23] md: tidy up rdev_for_each usage NeilBrown
2012-03-14  4:40 ` [md PATCH 13/23] md/raid10: handle merge_bvec_fn in member devices NeilBrown
2012-03-14  4:40 ` [md PATCH 07/23] md: allow re-add to failed arrays NeilBrown
2012-03-14  4:40 ` [md PATCH 12/23] md: add proper merge_bvec handling to RAID0 and Linear NeilBrown
2012-03-14  4:40 ` [md PATCH 09/23] md/bitmap: ensure to load bitmap when creating via sysfs NeilBrown
2012-03-14  4:40 ` NeilBrown [this message]
2012-03-14  4:40 ` [md PATCH 08/23] md: don't set md arrays to readonly on shutdown NeilBrown
2012-04-18 15:37   ` Alexander Lyakas
2012-04-18 17:44     ` Paweł Brodacki
2012-04-18 20:53       ` Alexander Lyakas
2012-04-18 22:48     ` NeilBrown
2012-04-19  9:11       ` Alexander Lyakas
2012-04-19  9:57         ` NeilBrown
2012-04-20 11:30           ` Paweł Brodacki
2012-04-20 12:01             ` NeilBrown
2012-04-21 15:18               ` Paweł Brodacki
2012-04-21 20:42                 ` NeilBrown
2012-04-30 10:32                   ` Paweł Brodacki
2012-04-20 16:26           ` John Robinson
2012-03-14  4:40 ` [md PATCH 22/23] md: fix clearing of the 'changed' flags for the bad blocks list NeilBrown
2012-03-14  4:40 ` [md PATCH 15/23] md/raid10 - support resizing some RAID10 arrays NeilBrown
2012-03-14  6:17   ` keld
2012-03-14  6:27     ` NeilBrown
2012-03-14  7:51       ` David Brown
2012-03-14  8:32         ` NeilBrown
2012-03-14 10:20           ` David Brown
2012-03-14 12:37             ` keld
2012-03-14  4:40 ` [md PATCH 20/23] md/bitmap: remove unnecessary indirection when allocating NeilBrown
2012-03-14  4:40 ` [md PATCH 16/23] md/bitmap: remove some unused noise from bitmap.h NeilBrown
2012-03-14  4:40 ` [md PATCH 19/23] md/bitmap: remove some pointless locking NeilBrown
2012-03-14  4:40 ` [md PATCH 17/23] md/bitmap: move printing of bitmap status to bitmap.c NeilBrown
2012-03-14  4:40 ` [md PATCH 21/23] md/bitmap: discard CHUNK_BLOCK_SHIFT macro NeilBrown
2012-03-14  4:40 ` [md PATCH 18/23] md/bitmap: change a 'goto' to a normal 'if' construct NeilBrown
2012-03-14  4:40 ` [md PATCH 23/23] md: Add judgement bb->unacked_exist in function md_ack_all_badblocks() NeilBrown

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:c0d3ffb dfblob:fa4d840 )
 OR (
bs:"[md PATCH 14/23] md/raid1: handle merge_bvec_fn in member devices." )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20120314044040.7978.27509.stgit@notabene.brown \
    --to=neilb@suse.de \
    --cc=linux-raid@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).