Linux RAID subsystem development

Linux RAID subsystem development
 help / color / mirror / Atom feed

* [PATCH v1 10/14] md: raid1: use bio_segments_all()
From: Ming Lei @ 2017-02-24 15:42 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig
  Cc: Ming Lei
In-Reply-To: <1487950971-1131-1-git-send-email-tom.leiming@gmail.com>

Use this helper, instead of direct access to .bi_vcnt.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid1.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 7363bf56f3b4..391da975e092 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1091,7 +1091,8 @@ static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio)
 {
 	int i;
 	struct bio_vec *bvec;
-	struct bio_vec *bvecs = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec),
+	unsigned vcnt = bio_segments_all(bio);
+	struct bio_vec *bvecs = kzalloc(vcnt * sizeof(struct bio_vec),
 					GFP_NOIO);
 	if (unlikely(!bvecs))
 		return;
@@ -1107,12 +1108,12 @@ static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio)
 		kunmap(bvec->bv_page);
 	}
 	r1_bio->behind_bvecs = bvecs;
-	r1_bio->behind_page_count = bio->bi_vcnt;
+	r1_bio->behind_page_count = vcnt;
 	set_bit(R1BIO_BehindIO, &r1_bio->state);
 	return;
 
 do_sync_io:
-	for (i = 0; i < bio->bi_vcnt; i++)
+	for (i = 0; i < vcnt; i++)
 		if (bvecs[i].bv_page)
 			put_page(bvecs[i].bv_page);
 	kfree(bvecs);
-- 
2.7.4

^ permalink raw reply related

* [PATCH v1 11/14] md: raid10: refactor code of read reshape's .bi_end_io
From: Ming Lei @ 2017-02-24 15:42 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig
  Cc: Ming Lei
In-Reply-To: <1487950971-1131-1-git-send-email-tom.leiming@gmail.com>

reshape read request is a bit special and requires one extra
bio which isn't allocated from r10buf_pool.

Refactor the .bi_end_io for read reshape, so that we can use
raid10's resync page management approach easily in the following
patches.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid10.c | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 227dd6ad7716..c76e08ea4b92 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1907,17 +1907,9 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 	return err;
 }
 
-static void end_sync_read(struct bio *bio)
+static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d)
 {
-	struct r10bio *r10_bio = bio->bi_private;
 	struct r10conf *conf = r10_bio->mddev->private;
-	int d;
-
-	if (bio == r10_bio->master_bio) {
-		/* this is a reshape read */
-		d = r10_bio->read_slot; /* really the read dev */
-	} else
-		d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
 
 	if (!bio->bi_error)
 		set_bit(R10BIO_Uptodate, &r10_bio->state);
@@ -1941,6 +1933,22 @@ static void end_sync_read(struct bio *bio)
 	}
 }
 
+static void end_sync_read(struct bio *bio)
+{
+	struct r10bio *r10_bio = bio->bi_private;
+	struct r10conf *conf = r10_bio->mddev->private;
+	int d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
+
+	__end_sync_read(r10_bio, bio, d);
+}
+
+static void end_reshape_read(struct bio *bio)
+{
+	struct r10bio *r10_bio = bio->bi_private;
+
+	__end_sync_read(r10_bio, bio, r10_bio->read_slot);
+}
+
 static void end_sync_request(struct r10bio *r10_bio)
 {
 	struct mddev *mddev = r10_bio->mddev;
@@ -4474,7 +4482,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 	read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
 			       + rdev->data_offset);
 	read_bio->bi_private = r10_bio;
-	read_bio->bi_end_io = end_sync_read;
+	read_bio->bi_end_io = end_reshape_read;
 	bio_set_op_attrs(read_bio, REQ_OP_READ, 0);
 	read_bio->bi_flags &= (~0UL << BIO_RESET_BITS);
 	read_bio->bi_error = 0;
-- 
2.7.4

^ permalink raw reply related

* [PATCH v1 12/14] md: raid10: don't use bio's vec table to manage resync pages
From: Ming Lei @ 2017-02-24 15:42 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig
  Cc: Ming Lei
In-Reply-To: <1487950971-1131-1-git-send-email-tom.leiming@gmail.com>

Now we allocate one page array for managing resync pages, instead
of using bio's vec table to do that, and the old way is very hacky
and won't work any more if multipage bvec is enabled.

The introduced cost is that we need to allocate (128 + 16) * copies
bytes per r10_bio, which is fine because there shouldn't be many
in-flight r10_bios for resync, as pointed out by Shaohua.

Also, the bio_reset() calls in raid10_sync_request() and
reshape_request() are removed because all bios are freshly allocated
in these functions now, so resetting them is no longer necessary.

This patch can be thought of as a cleanup too.

Suggested-by: Shaohua Li <shli@kernel.org>
Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid10.c | 127 ++++++++++++++++++++++++++++++----------------------
 1 file changed, 74 insertions(+), 53 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index c76e08ea4b92..931f5d80608b 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -110,6 +110,16 @@ static void end_reshape(struct r10conf *conf);
 #define raid10_log(md, fmt, args...)				\
 	do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0)
 
+static inline struct resync_pages *get_resync_pages(struct bio *bio)
+{
+	return bio->bi_private;
+}
+
+static inline struct r10bio *get_resync_r10bio(struct bio *bio)
+{
+	return get_resync_pages(bio)->raid_bio;
+}
+
 static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
 {
 	struct r10conf *conf = data;
@@ -140,11 +150,11 @@ static void r10bio_pool_free(void *r10_bio, void *data)
 static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 {
 	struct r10conf *conf = data;
-	struct page *page;
 	struct r10bio *r10_bio;
 	struct bio *bio;
-	int i, j;
-	int nalloc;
+	int j;
+	int nalloc, nalloc_rp;
+	struct resync_pages *rps;
 
 	r10_bio = r10bio_pool_alloc(gfp_flags, conf);
 	if (!r10_bio)
@@ -156,6 +166,15 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 	else
 		nalloc = 2; /* recovery */
 
+	/* allocate once for all bios */
+	if (!conf->have_replacement)
+		nalloc_rp = nalloc;
+	else
+		nalloc_rp = nalloc * 2;
+	rps = kmalloc(sizeof(struct resync_pages) * nalloc_rp, gfp_flags);
+	if (!rps)
+		goto out_free_r10bio;
+
 	/*
 	 * Allocate bios.
 	 */
@@ -175,36 +194,40 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 	 * Allocate RESYNC_PAGES data pages and attach them
 	 * where needed.
 	 */
-	for (j = 0 ; j < nalloc; j++) {
+	for (j = 0; j < nalloc; j++) {
 		struct bio *rbio = r10_bio->devs[j].repl_bio;
+		struct resync_pages *rp, *rp_repl;
+
+		rp = &rps[j];
+		if (rbio)
+			rp_repl = &rps[nalloc + j];
+
 		bio = r10_bio->devs[j].bio;
-		for (i = 0; i < RESYNC_PAGES; i++) {
-			if (j > 0 && !test_bit(MD_RECOVERY_SYNC,
-					       &conf->mddev->recovery)) {
-				/* we can share bv_page's during recovery
-				 * and reshape */
-				struct bio *rbio = r10_bio->devs[0].bio;
-				page = rbio->bi_io_vec[i].bv_page;
-				get_page(page);
-			} else
-				page = alloc_page(gfp_flags);
-			if (unlikely(!page))
+
+		if (!j || test_bit(MD_RECOVERY_SYNC,
+				   &conf->mddev->recovery)) {
+			if (resync_alloc_pages(rp, gfp_flags))
 				goto out_free_pages;
+		} else {
+			memcpy(rp, &rps[0], sizeof(*rp));
+			resync_get_all_pages(rp);
+		}
 
-			bio->bi_io_vec[i].bv_page = page;
-			if (rbio)
-				rbio->bi_io_vec[i].bv_page = page;
+		rp->idx = 0;
+		rp->raid_bio = r10_bio;
+		bio->bi_private = rp;
+		if (rbio) {
+			memcpy(rp_repl, rp, sizeof(*rp));
+			rbio->bi_private = rp_repl;
 		}
 	}
 
 	return r10_bio;
 
 out_free_pages:
-	for ( ; i > 0 ; i--)
-		safe_put_page(bio->bi_io_vec[i-1].bv_page);
-	while (j--)
-		for (i = 0; i < RESYNC_PAGES ; i++)
-			safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
+	while (--j >= 0)
+		resync_free_pages(&rps[j * 2]);
+
 	j = 0;
 out_free_bio:
 	for ( ; j < nalloc; j++) {
@@ -213,30 +236,34 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 		if (r10_bio->devs[j].repl_bio)
 			bio_put(r10_bio->devs[j].repl_bio);
 	}
+	kfree(rps);
+out_free_r10bio:
 	r10bio_pool_free(r10_bio, conf);
 	return NULL;
 }
 
 static void r10buf_pool_free(void *__r10_bio, void *data)
 {
-	int i;
 	struct r10conf *conf = data;
 	struct r10bio *r10bio = __r10_bio;
 	int j;
+	struct resync_pages *rp = NULL;
 
-	for (j=0; j < conf->copies; j++) {
+	for (j = conf->copies; j--; ) {
 		struct bio *bio = r10bio->devs[j].bio;
-		if (bio) {
-			for (i = 0; i < RESYNC_PAGES; i++) {
-				safe_put_page(bio->bi_io_vec[i].bv_page);
-				bio->bi_io_vec[i].bv_page = NULL;
-			}
-			bio_put(bio);
-		}
+
+		rp = get_resync_pages(bio);
+		resync_free_pages(rp);
+		bio_put(bio);
+
 		bio = r10bio->devs[j].repl_bio;
 		if (bio)
 			bio_put(bio);
 	}
+
+	/* resync pages array stored in the 1st bio's .bi_private */
+	kfree(rp);
+
 	r10bio_pool_free(r10bio, conf);
 }
 
@@ -1935,7 +1962,7 @@ static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d)
 
 static void end_sync_read(struct bio *bio)
 {
-	struct r10bio *r10_bio = bio->bi_private;
+	struct r10bio *r10_bio = get_resync_r10bio(bio);
 	struct r10conf *conf = r10_bio->mddev->private;
 	int d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
 
@@ -1944,6 +1971,7 @@ static void end_sync_read(struct bio *bio)
 
 static void end_reshape_read(struct bio *bio)
 {
+	/* reshape read bio isn't allocated from r10buf_pool */
 	struct r10bio *r10_bio = bio->bi_private;
 
 	__end_sync_read(r10_bio, bio, r10_bio->read_slot);
@@ -1978,7 +2006,7 @@ static void end_sync_request(struct r10bio *r10_bio)
 
 static void end_sync_write(struct bio *bio)
 {
-	struct r10bio *r10_bio = bio->bi_private;
+	struct r10bio *r10_bio = get_resync_r10bio(bio);
 	struct mddev *mddev = r10_bio->mddev;
 	struct r10conf *conf = mddev->private;
 	int d;
@@ -2058,6 +2086,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 	for (i=0 ; i < conf->copies ; i++) {
 		int  j, d;
 		struct md_rdev *rdev;
+		struct resync_pages *rp;
 
 		tbio = r10_bio->devs[i].bio;
 
@@ -2099,11 +2128,13 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 		 * First we need to fixup bv_offset, bv_len and
 		 * bi_vecs, as the read request might have corrupted these
 		 */
+		rp = get_resync_pages(tbio);
 		bio_reset(tbio);
 
 		tbio->bi_vcnt = vcnt;
 		tbio->bi_iter.bi_size = fbio->bi_iter.bi_size;
-		tbio->bi_private = r10_bio;
+		rp->raid_bio = r10_bio;
+		tbio->bi_private = rp;
 		tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
 		tbio->bi_end_io = end_sync_write;
 		bio_set_op_attrs(tbio, REQ_OP_WRITE, 0);
@@ -3171,10 +3202,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 					}
 				}
 				bio = r10_bio->devs[0].bio;
-				bio_reset(bio);
 				bio->bi_next = biolist;
 				biolist = bio;
-				bio->bi_private = r10_bio;
 				bio->bi_end_io = end_sync_read;
 				bio_set_op_attrs(bio, REQ_OP_READ, 0);
 				if (test_bit(FailFast, &rdev->flags))
@@ -3198,10 +3227,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 
 				if (!test_bit(In_sync, &mrdev->flags)) {
 					bio = r10_bio->devs[1].bio;
-					bio_reset(bio);
 					bio->bi_next = biolist;
 					biolist = bio;
-					bio->bi_private = r10_bio;
 					bio->bi_end_io = end_sync_write;
 					bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
 					bio->bi_iter.bi_sector = to_addr
@@ -3226,10 +3253,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 				if (mreplace == NULL || bio == NULL ||
 				    test_bit(Faulty, &mreplace->flags))
 					break;
-				bio_reset(bio);
 				bio->bi_next = biolist;
 				biolist = bio;
-				bio->bi_private = r10_bio;
 				bio->bi_end_io = end_sync_write;
 				bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
 				bio->bi_iter.bi_sector = to_addr +
@@ -3351,7 +3376,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 				r10_bio->devs[i].repl_bio->bi_end_io = NULL;
 
 			bio = r10_bio->devs[i].bio;
-			bio_reset(bio);
 			bio->bi_error = -EIO;
 			rcu_read_lock();
 			rdev = rcu_dereference(conf->mirrors[d].rdev);
@@ -3376,7 +3400,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 			atomic_inc(&r10_bio->remaining);
 			bio->bi_next = biolist;
 			biolist = bio;
-			bio->bi_private = r10_bio;
 			bio->bi_end_io = end_sync_read;
 			bio_set_op_attrs(bio, REQ_OP_READ, 0);
 			if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
@@ -3395,13 +3418,11 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 
 			/* Need to set up for writing to the replacement */
 			bio = r10_bio->devs[i].repl_bio;
-			bio_reset(bio);
 			bio->bi_error = -EIO;
 
 			sector = r10_bio->devs[i].addr;
 			bio->bi_next = biolist;
 			biolist = bio;
-			bio->bi_private = r10_bio;
 			bio->bi_end_io = end_sync_write;
 			bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
 			if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
@@ -3441,12 +3462,12 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 			break;
 		for (bio= biolist ; bio ; bio=bio->bi_next) {
 			struct bio *bio2;
-			page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
+			page = resync_fetch_page(get_resync_pages(bio));
 			if (bio_add_page(bio, page, len, 0))
 				continue;
 
 			/* stop here */
-			bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
+			resync_store_page(get_resync_pages(bio), page);
 			for (bio2 = biolist;
 			     bio2 && bio2 != bio;
 			     bio2 = bio2->bi_next) {
@@ -3458,7 +3479,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 		}
 		nr_sectors += len>>9;
 		sector_nr += len>>9;
-	} while (biolist->bi_vcnt < RESYNC_PAGES);
+	} while (resync_page_available(get_resync_pages(biolist)));
  bio_full:
 	r10_bio->sectors = nr_sectors;
 
@@ -3467,7 +3488,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 		biolist = biolist->bi_next;
 
 		bio->bi_next = NULL;
-		r10_bio = bio->bi_private;
+		r10_bio = get_resync_r10bio(bio);
 		r10_bio->sectors = nr_sectors;
 
 		if (bio->bi_end_io == end_sync_read) {
@@ -4362,6 +4383,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 	struct bio *blist;
 	struct bio *bio, *read_bio;
 	int sectors_done = 0;
+	struct page **pages;
 
 	if (sector_nr == 0) {
 		/* If restarting in the middle, skip the initial sectors */
@@ -4512,11 +4534,9 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 		if (!rdev2 || test_bit(Faulty, &rdev2->flags))
 			continue;
 
-		bio_reset(b);
 		b->bi_bdev = rdev2->bdev;
 		b->bi_iter.bi_sector = r10_bio->devs[s/2].addr +
 			rdev2->new_data_offset;
-		b->bi_private = r10_bio;
 		b->bi_end_io = end_reshape_write;
 		bio_set_op_attrs(b, REQ_OP_WRITE, 0);
 		b->bi_next = blist;
@@ -4526,8 +4546,9 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 	/* Now add as many pages as possible to all of these bios. */
 
 	nr_sectors = 0;
+	pages = get_resync_pages(r10_bio->devs[0].bio)->pages;
 	for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
-		struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page;
+		struct page *page = pages[s / (PAGE_SIZE >> 9)];
 		int len = (max_sectors - s) << 9;
 		if (len > PAGE_SIZE)
 			len = PAGE_SIZE;
@@ -4720,7 +4741,7 @@ static int handle_reshape_read_error(struct mddev *mddev,
 
 static void end_reshape_write(struct bio *bio)
 {
-	struct r10bio *r10_bio = bio->bi_private;
+	struct r10bio *r10_bio = get_resync_r10bio(bio);
 	struct mddev *mddev = r10_bio->mddev;
 	struct r10conf *conf = mddev->private;
 	int d;
-- 
2.7.4

^ permalink raw reply related

* [PATCH v1 13/14] md: raid10: retrieve page from preallocated resync page array
From: Ming Lei @ 2017-02-24 15:42 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig
  Cc: Ming Lei
In-Reply-To: <1487950971-1131-1-git-send-email-tom.leiming@gmail.com>

Now one page array is allocated for each resync bio, and we can
retrieve page from this table directly.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid10.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 931f5d80608b..ae162d542bf4 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2065,6 +2065,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 	int i, first;
 	struct bio *tbio, *fbio;
 	int vcnt;
+	struct page **tpages, **fpages;
 
 	atomic_set(&r10_bio->remaining, 1);
 
@@ -2080,6 +2081,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 	fbio = r10_bio->devs[i].bio;
 	fbio->bi_iter.bi_size = r10_bio->sectors << 9;
 	fbio->bi_iter.bi_idx = 0;
+	fpages = get_resync_pages(fbio)->pages;
 
 	vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
 	/* now find blocks with errors */
@@ -2094,6 +2096,8 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 			continue;
 		if (i == first)
 			continue;
+
+		tpages = get_resync_pages(tbio)->pages;
 		d = r10_bio->devs[i].devnum;
 		rdev = conf->mirrors[d].rdev;
 		if (!r10_bio->devs[i].bio->bi_error) {
@@ -2106,8 +2110,8 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 				int len = PAGE_SIZE;
 				if (sectors < (len / 512))
 					len = sectors * 512;
-				if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
-					   page_address(tbio->bi_io_vec[j].bv_page),
+				if (memcmp(page_address(fpages[j]),
+					   page_address(tpages[j]),
 					   len))
 					break;
 				sectors -= len/512;
@@ -2205,6 +2209,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
 	int idx = 0;
 	int dr = r10_bio->devs[0].devnum;
 	int dw = r10_bio->devs[1].devnum;
+	struct page **pages = get_resync_pages(bio)->pages;
 
 	while (sectors) {
 		int s = sectors;
@@ -2220,7 +2225,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
 		ok = sync_page_io(rdev,
 				  addr,
 				  s << 9,
-				  bio->bi_io_vec[idx].bv_page,
+				  pages[idx],
 				  REQ_OP_READ, 0, false);
 		if (ok) {
 			rdev = conf->mirrors[dw].rdev;
@@ -2228,7 +2233,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
 			ok = sync_page_io(rdev,
 					  addr,
 					  s << 9,
-					  bio->bi_io_vec[idx].bv_page,
+					  pages[idx],
 					  REQ_OP_WRITE, 0, false);
 			if (!ok) {
 				set_bit(WriteErrorSeen, &rdev->flags);
-- 
2.7.4

^ permalink raw reply related

* [PATCH v1 14/14] md: raid10: avoid direct access to bvec table in handle_reshape_read_error
From: Ming Lei @ 2017-02-24 15:42 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig
  Cc: Ming Lei
In-Reply-To: <1487950971-1131-1-git-send-email-tom.leiming@gmail.com>

The cost is 128 bytes (8 * 16) of stack space in kernel thread context,
and the pages are simply retrieved from the bio with the bio helper.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid10.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index ae162d542bf4..705cb9af03ef 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -4689,7 +4689,15 @@ static int handle_reshape_read_error(struct mddev *mddev,
 	struct r10bio *r10b = &on_stack.r10_bio;
 	int slot = 0;
 	int idx = 0;
-	struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec;
+	struct bio_vec *bvl;
+	struct page *pages[RESYNC_PAGES];
+
+	/*
+	 * This bio is allocated in reshape_request(), and size
+	 * is still RESYNC_PAGES
+	 */
+	bio_for_each_segment_all(bvl, r10_bio->master_bio, idx)
+		pages[idx] = bvl->bv_page;
 
 	r10b->sector = r10_bio->sector;
 	__raid10_find_phys(&conf->prev, r10b);
@@ -4718,7 +4726,7 @@ static int handle_reshape_read_error(struct mddev *mddev,
 			success = sync_page_io(rdev,
 					       addr,
 					       s << 9,
-					       bvec[idx].bv_page,
+					       pages[idx],
 					       REQ_OP_READ, 0, false);
 			rdev_dec_pending(rdev, mddev);
 			rcu_read_lock();
-- 
2.7.4


^ permalink raw reply related

* [GIT PULL] MD update for 4.11
From: Shaohua Li @ 2017-02-24 15:52 UTC (permalink / raw)
  To: torvalds; +Cc: linux-kernel, linux-raid, neilb

Hi,

please pull MD update for 4.11, this pull mainly fixes bugs and improves
performance:
- Improve scalability for raid1 from Coly
- Improve raid5-cache read performance, disk efficiency and IO pattern from
  Song and me
- Fix a race condition of disk hotplug for linear from Coly
- A few cleanup patches from Ming and Byungchul
- Fix a memory leak from Neil
- Fix WRITE SAME IO failure from me
- Add doc for raid5-cache from me

Thanks,
Shaohua

The following changes since commit 7089db84e356562f8ba737c29e472cc42d530dbc:

  Linux 4.10-rc8 (2017-02-12 13:03:20 -0800)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/shli/md.git for-next

for you to fetch changes up to 1ec492232ed659acde8cc00b9ecc7529778e03e1:

  md/raid1: fix write behind issues introduced by bio_clone_bioset_partial (2017-02-23 11:59:44 -0800)

----------------------------------------------------------------
Byungchul Park (1):
      md/raid5: Don't reinvent the wheel but use existing llist API

Ming Lei (5):
      block: introduce bio_clone_bioset_partial()
      md: fail if mddev->bio_set can't be created
      md/raid1: use bio_clone_bioset_partial() in case of write behind
      md: remove unnecessary check on mddev
      md: fast clone bio in bio_clone_mddev()

NeilBrown (1):
      md: ensure md devices are freed before module is unloaded.

Shaohua Li (10):
      raid5: only dispatch IO from raid5d for harddisk raid
      Documentation: move MD related doc into a separate dir
      MD: add doc for raid5-cache
      md/raid5-cache: stripe reclaim only counts valid stripes
      md/raid5-cache: exclude reclaiming stripes in reclaim check
      md: disable WRITE SAME if it fails in underlayer disks
      md/raid1: fix a use-after-free bug
      md/linear: shutup lockdep warnning
      md/raid1: handle flush request correctly
      md/raid1: fix write behind issues introduced by bio_clone_bioset_partial

Song Liu (3):
      EXPORT_SYMBOL radix_tree_replace_slot
      md/r5cache: enable chunk_aligned_read with write back cache
      md/r5cache: improve journal device efficiency

colyli@suse.de (3):
      md linear: fix a race between linear_add() and linear_congested()
      RAID1: a new I/O barrier implementation to remove resync window
      RAID1: avoid unnecessary spin locks in I/O barrier code

 Documentation/00-INDEX                |   4 +-
 Documentation/admin-guide/md.rst      |   5 +
 Documentation/{ => md}/md-cluster.txt |   0
 Documentation/md/raid5-cache.txt      | 109 +++++++
 block/bio.c                           |  61 +++-
 drivers/md/faulty.c                   |   2 +-
 drivers/md/linear.c                   |  41 ++-
 drivers/md/linear.h                   |   1 +
 drivers/md/md.c                       |  22 +-
 drivers/md/md.h                       |   9 +-
 drivers/md/multipath.c                |   1 +
 drivers/md/raid0.c                    |   1 +
 drivers/md/raid1.c                    | 598 +++++++++++++++++++++-------------
 drivers/md/raid1.h                    |  58 ++--
 drivers/md/raid10.c                   |  11 +-
 drivers/md/raid5-cache.c              | 225 +++++++++++--
 drivers/md/raid5.c                    | 129 ++++++--
 drivers/md/raid5.h                    |   7 +
 include/linux/bio.h                   |  11 +-
 lib/radix-tree.c                      |   1 +
 20 files changed, 943 insertions(+), 353 deletions(-)
 rename Documentation/{ => md}/md-cluster.txt (100%)
 create mode 100644 Documentation/md/raid5-cache.txt

^ permalink raw reply

* Re: [BUG] non-metadata arrays cannot use more than 27 component devices
From: ian_bruce @ 2017-02-24 16:40 UTC (permalink / raw)
  To: linux-raid
In-Reply-To: <41ea334c-ae1c-dac6-e1a1-480d3700a588@turmel.org>

On Fri, 24 Feb 2017 10:20:52 -0500
Phil Turmel <philip@turmel.org> wrote:

> Considering the existence of --build is strictly to support arrays
> that predate MD raid, it seems a bit of a stretch to claim this as a
> bug instead of a feature request.

quoting from the mdadm manual page:

    *Build*

    Build an array that doesn't have per-device metadata (superblocks).
    For these sorts of arrays, mdadm cannot differentiate between
    initial creation and subsequent assembly of an array. It also cannot
    perform any checks that appropriate components have been requested.
    Because of this, the Build mode should only be used together with a
    complete understanding of what you are doing.

No mention of "arrays that predate MD RAID" there. Nor any mention of a
27-component limit, either. Nor does the eventual error message mention
any such thing (although "mdadm --create --metadata=0 --raid-devices=28"
does). I'd call that a bug.

Since there's no pre-existing superblock, and the kernel has to create
one, it could just as easily use the v1.2 format as the v0.90 format, as
it does with "mdadm --create". Why shouldn't the v1.2 format be the
default for "mdadm --build" as well? That would be more consistent --
why should these two options behave differently in this regard, in the
absence of any material reason to do so?

--create : initialize v1.2 kernel superblock and write to disk

--build  : initialize v1.2 kernel superblock but don't write to disk

It seems like it would actually be simpler to treat the two cases the
same, rather than differently.

-- Ian Bruce

^ permalink raw reply

* Re: [PATCH V3 1/2] RAID1: a new I/O barrier implementation to remove resync window
From: Coly Li @ 2017-02-24 17:02 UTC (permalink / raw)
  To: Shaohua Li
  Cc: NeilBrown, linux-raid, Shaohua Li, Johannes Thumshirn,
	Guoqing Jiang
In-Reply-To: <20170223195811.sio23y5m4zxeqnmt@kernel.org>

On 2017/2/24 上午3:58, Shaohua Li wrote:
> On Fri, Feb 24, 2017 at 03:31:16AM +0800, Coly Li wrote:
>> On 2017/2/24 上午1:34, Shaohua Li wrote:
>>> On Thu, Feb 23, 2017 at 01:54:47PM +0800, Coly Li wrote:
>> [snip]
>>>>>>>> As r1bio_pool preallocates 256 entries, this is unlikely  but not 
>>>>>>>> impossible.  If 256 threads all attempt a write (or read) that
>>>>>>>> crosses a boundary, then they will consume all 256 preallocated
>>>>>>>> entries, and want more. If there is no free memory, they will block
>>>>>>>> indefinitely.
>>>>>>>>
>>>>>>>
>>>>>>> If raid1_make_request() is modified into this way,
>>>>>>> +	if (bio_data_dir(split) == READ)
>>>>>>> +		raid1_read_request(mddev, split);
>>>>>>> +	else
>>>>>>> +		raid1_write_request(mddev, split);
>>>>>>> +	if (split != bio)
>>>>>>> +		generic_make_request(bio);
>>>>>>>
>>>>>>> Then the original bio will be added into the bio_list_on_stack of top
>>>>>>> level generic_make_request(), current->bio_list is initialized, when
>>>>>>> generic_make_request() is called nested in raid1_make_request(), the
>>>>>>> split bio will be added into current->bio_list and nothing else happens.
>>>>>>>
>>>>>>> After the nested generic_make_request() returns, the code back to next
>>>>>>> code of generic_make_request(),
>>>>>>> 2022                         ret = q->make_request_fn(q, bio);
>>>>>>> 2023
>>>>>>> 2024                         blk_queue_exit(q);
>>>>>>> 2025
>>>>>>> 2026                         bio = bio_list_pop(current->bio_list);
>>>>>>>
>>>>>>> bio_list_pop() will return the second half of the split bio, and it is
>>>>>>
>>>>>> So in the above sequence, current->bio_list will have bios in the below sequence:
>>>>>> bios to underlaying disks, second half of original bio
>>>>>>
>>>>>> bio_list_pop will pop bios to underlaying disks first, handle them, then the
>>>>>> second half of original bio.
>>>>>>
>>>>>> That said, this doesn't work for array stacked 3 layers. Because in 3-layer
>>>>>> array, handling the middle layer bio will make the 3rd layer bio hold to
>>>>>> bio_list again.
>>>>>>
>>>>>
>>>>> Could you please give me more hint,
>>>>> - What is the meaning of "hold" from " make the 3rd layer bio hold to
>>>>> bio_list again" ?
>>>>> - Why deadlock happens if the 3rd layer bio hold to bio_list again ?
>>>>
>>>> I tried to set up a 4 layer stacked md raid1, and reduce I/O barrier
>>>> bucket size to 8MB, running for 10 hours, there is no deadlock observed,
>>>>
>>>> Here is how the 4 layer stacked raid1 setup,
>>>> - There are 4 NVMe SSDs, on each SSD I create four 500GB partition,
>>>>   /dev/nvme0n1:  nvme0n1p1, nvme0n1p2, nvme0n1p3, nvme0n1p4
>>>>   /dev/nvme1n1:  nvme1n1p1, nvme1n1p2, nvme1n1p3, nvme1n1p4
>>>>   /dev/nvme2n1:  nvme2n1p1, nvme2n1p2, nvme2n1p3, nvme2n1p4
>>>>   /dev/nvme3n1:  nvme3n1p1, nvme3n1p2, nvme3n1p3, nvme3n1p4
>>>> - Here is how the 4 layer stacked raid1 assembled, level 1 means the top
>>>> level, level 4 means the bottom level in the stacked devices,
>>>>   - level 1:
>>>> 	/dev/md40: /dev/md30  /dev/md31
>>>>   - level 2:
>>>> 	/dev/md30: /dev/md20  /dev/md21
>>>> 	/dev/md31: /dev/md22  /dev/md23
>>>>   - level 3:
>>>> 	/dev/md20: /dev/md10  /dev/md11
>>>> 	/dev/md21: /dev/md12  /dev/md13
>>>> 	/dev/md22: /dev/md14  /dev/md15
>>>> 	/dev/md23: /dev/md16  /dev/md17
>>>>   - level 4:
>>>> 	/dev/md10: /dev/nvme0n1p1  /dev/nvme1n1p1
>>>> 	/dev/md11: /dev/nvme2n1p1  /dev/nvme3n1p1
>>>> 	/dev/md12: /dev/nvme0n1p2  /dev/nvme1n1p2
>>>> 	/dev/md13: /dev/nvme2n1p2  /dev/nvme3n1p2
>>>> 	/dev/md14: /dev/nvme0n1p3  /dev/nvme1n1p3
>>>> 	/dev/md15: /dev/nvme2n1p3  /dev/nvme3n1p3
>>>> 	/dev/md16: /dev/nvme0n1p4  /dev/nvme1n1p4
>>>> 	/dev/md17: /dev/nvme2n1p4  /dev/nvme3n1p4
>>>>
>>>> Here is the fio job file,
>>>> [global]
>>>> direct=1
>>>> thread=1
>>>> ioengine=libaio
>>>>
>>>> [job]
>>>> filename=/dev/md40
>>>> readwrite=write
>>>> numjobs=10
>>>> blocksize=33M
>>>> iodepth=128
>>>> time_based=1
>>>> runtime=10h
>>>>
>>>> I planned to learn how the deadlock comes about by analyzing a
>>>> deadlock condition. Maybe it was because the 8MB bucket unit size is
>>>> small enough; now I will try to run with a 512K bucket unit size, and
>>>> see whether I can encounter a deadlock.
>>>
>>> Don't think raid1 could easily trigger the deadlock. Maybe you should try
>>> raid10. The resync case is hard to trigger for raid1. The memory pressure case
>>> is hard to trigger for both raid1/10. But it's possible to trigger.
>>>
>>> The 3-layer case is something like this:
>>
>> Hi Shaohua,
>>
>> I try to catch up with you, let me try to follow your mind by the
>> split-in-while-loop condition (this is my new I/O barrier patch). I
>> assume the original BIO is a write bio, and original bio is split and
>> handled in a while loop in raid1_make_request().
>>
>>> 1. in level1, set current->bio_list, split bio to bio1 and bio2
>>
>> This is done in level1 raid1_make_request().
>>
>>> 2. remap bio1 to level2 disk, and queue bio1-level2 in current->bio_list
>>
>> Remap is done by raid1_write_request(), and bio1_level2 may be added into
>> one of the two lists:
>> - plug->pending:
>>   bios in plug->pending may be handled in raid1_unplug(), or in
>> flush_pending_writes() of raid1d().
>>   If current task is about to be scheduled, raid1_unplug() will merge
>> plug->pending's bios to conf->pending_bio_list. And
>> conf->pending_bio_list will be handled in raid1d.
>>   If raid1_unplug() is triggered by blk_finish_plug(), it is also
>> handled in raid1d.
>>
>> - conf->pending_bio_list:
>>   bios in this list is handled in raid1d by calling flush_pending_writes().
>>
>>
>> So generic_make_request() to handle bio1_level2 can only be called in
>> context of raid1d thread, bio1_level2 is added into raid1d's
>> bio_list_on_stack, not caller of level1 generic_make_request().
>>
>>> 3. queue bio2 in current->bio_list
>>
>> Same, bio2_level2 is in level1 raid1d's bio_list_on_stack.
>> Then back to level1 generic_make_request()
>>
>>> 4. generic_make_request then pops bio1-level2
>>
>> At this moment, bio1_level2 and bio2_level2 are in either plug->pending
>> or conf->pending_bio_list, bio_list_pop() returns NULL, and level1
>> generic_make_request() returns to its caller.
>>
>> If before bio_list_pop() called, kernel thread raid1d wakes up and
>> iterates conf->pending_bio_list in flush_pending_writes() or iterate
>> plug->pending in raid1_unplug() by blk_finish_plug(), that happens in
>> level1 raid1d's stack, bios will not show up in level1
>> generic_make_request(), bio_list_pop() still returns NULL.
>>
>>> 5. remap bio1-level2 to level3 disk, and queue bio1-level2-level3 in current->bio_list
>>
>> bio2_level2 is at head of conf->pending_bio_list or plug->pending, so
>> bio2_level2 is handled firstly.
>>
>> level1 raid1 calls level2 generic_make_request(), then level2
>> raid1_make_request() is called, then level2 raid1_write_request().
>> bio2_level2 is remapped to bio2_level3, added into plug->pending (level1
>> raid1d's context) or conf->pending_bio_list (level2 raid1's conf), it
>> will be handled by level2 raid1d, when level2 raid1d wakes up.
>> Then returns back to level1 raid1, bio1_level2
>> is handled by level2 generic_make_request() and added into level2
>> plug->pending or conf->pending_bio_list. In this case neither
>> bio2_level2 nor bio1_level is added into any bio_list_on_stack.
>>
>> Then level1 raid1d handles all bios in level1 conf->pending_bio_list,
>> and sleeps.
>>
>> Then level2 raid1d wakes up, and handle bio2_level3 and bio1_level3, by
>> iterate level2 plug->pending or conf->pending_bio_list, and calling
>> level3 generic_make_request().
>>
>> In level3 generic_make_request(), because it is level2 raid1d context,
>> not level1 raid1d context, bio2_level3 is send into
>> q->make_request_fn(), and finally added into level3 plug->pending or
>> conf->pending_bio_list, then back to level3 generic_make_request().
>>
>> Now level2 raid1d's current->bio_list is empty, so level3
>> generic_make_request() returns to level2 raid1d and continue to iterate
>> and send bio1_level3 into level3 generic_make_request().
>>
>> After all bios are added into level3 plug->pending or
>> conf->pending_bio_list, level2 raid1d sleeps.
>>
>> Now level3 raid1d wakes up, continue to iterate level3 plug->pending or
>> conf->pending_bio_list by calling generic_make_request() to underlying
>> devices (which might be a real device).
>>
>> Along the whole path above, each lower level generic_make_request() is
>> called in context of the lower level raid1d. No recursive call happens
>> in normal code path.
>>
>> In raid1 code, recursive call of generic_make_request() only happens for
>> READ bio, but if array is not frozen, no barrier is required, it doesn't
>> hurt.
>>
>>
>>> 6. generic_make_request then pops bio2, but bio1 hasn't finished yet, deadlock
>>
>> As I understand the code, it won't happen either.
>>
>>>
>>> The problem is because we add new bio to current->bio_list tail.
>>
>> New bios are added into other context's current->bio_list, which are
>> different lists. If what I understand is correct, a dead lock won't
>> happen in this way.
>>
>> If my understanding is correct, suddenly I come to realize why raid1
>> bios are handled indirectly in another kernel thread.
>>
>> (Just for your information, when I write to this location, another run
>> of testing finished, no deadlock. This time I reduce I/O barrier bucket
>> unit size to 512KB, and set blocksize to 33MB in fio job file. It is
>> really slow (130MB/s), but no deadlock observed)
>>
>>
>> The stacked raid1 devices are really really confused, if I am wrong, any
>> hint is warmly welcome.
> 
> Aha, you are correct. I missed we never directly dispatch bio in a schedule based
> blk-plug flush. I'll drop the patch. Thanks for the insistence, good discussion!

Thank you for the encouragement :-)
After carefully thinking about your patch again, I suggest letting it go ahead and
keep your fix in -next. The reasons are,
1) For 32bit bi_iter.bi_size, it is safe and no hash conflict. But if
someone (maybe I) changes bi_iter.bi_size from 32bit to 64bit, for a
DISCARD bio, it is very easy to be split into more than 512 pieces, then
there will be a lot of hash conflicts. If a resync is triggered, it
will very easily go into a deadlock.
  If this dead lock happens in future, it will be quite hard to find out
the root cause. If we have this fix now to avoid future possible hash
conflict, life will be easier at that time.
2) If we decide to use nested generic_make_request() fix in your patch,
then the while-loop does not exist, more CPU cycles will be consumed, it
does not make sense to save a branch by a function pointer, and pay the
cost of code readability. So remove the function pointer is better once
we take reason 1).

Please keep this patch in -next, it helps to avoid future bug.

Thanks.

Coly


^ permalink raw reply

* Re: [PATCH V3 1/2] RAID1: a new I/O barrier implementation to remove resync window
From: Coly Li @ 2017-02-24 17:06 UTC (permalink / raw)
  To: NeilBrown, Shaohua Li
  Cc: linux-raid, Shaohua Li, Johannes Thumshirn, Guoqing Jiang
In-Reply-To: <87lgswqz3w.fsf@notabene.neil.brown.name>

On 2017/2/24 上午7:14, NeilBrown wrote:
> On Thu, Feb 23 2017, Coly Li wrote:
> 
>> 
>> I tried to set up a 4 layer stacked md raid1, and reduce I/O
>> barrier bucket size to 8MB, running for 10 hours, there is no
>> deadlock observed,
> 
> Try setting BARRIER_BUCKETS_NR to '1' and BARRIER_UNIT_SECTOR_BITS
> to 3 and make sure the write requests are larger than 1 page (and
> have resync happen at the same time as writes).

Hi Neil,

Yes, the above method triggers deadlock easily. After come to
understand how bios are handled in stacked raid1 and the relationship
between current->bio_list, plug->pending and conf->pending_bio_list, I
think I come to understand what you worried and the meaning of your fix.

I totally agree and understand there will be hash conflict sooner or
later now. Yes we need this fix.

Thanks to you and Shaohua, explaining the details to me, and help me
to catch up your mind :-)

Coly

^ permalink raw reply

* Re: [PATCH V3 1/2] RAID1: a new I/O barrier implementation to remove resync window
From: Shaohua Li @ 2017-02-24 17:17 UTC (permalink / raw)
  To: Coly Li
  Cc: NeilBrown, linux-raid, Shaohua Li, Johannes Thumshirn,
	Guoqing Jiang
In-Reply-To: <ef194546-a825-7eab-f8cb-e5ee147d065b@suse.de>

On Sat, Feb 25, 2017 at 01:06:22AM +0800, Coly Li wrote:
> On 2017/2/24 上午7:14, NeilBrown wrote:
> > On Thu, Feb 23 2017, Coly Li wrote:
> > 
> >> 
> >> I tried to set up a 4 layer stacked md raid1, and reduce I/O
> >> barrier bucket size to 8MB, running for 10 hours, there is no
> >> deadlock observed,
> > 
> > Try setting BARRIER_BUCKETS_NR to '1' and BARRIER_UNIT_SECTOR_BITS
> > to 3 and make sure the write requests are larger than 1 page (and
> > have resync happen at the same time as writes).
> 
> Hi Neil,
> 
> Yes, the above method triggers deadlock easily. After come to
> understand how bios are handled in stacked raid1 and the relationship
> between current->bio_list, plug->pending and conf->pending_bio_list, I
> think I come to understand what you worried and the meaning of your fix.
> 
> I totally agree and understand there will be hash conflict sooner or
> later now. Yes we need this fix.
> 
> Thanks to you and Shaohua, explaining the details to me, and help me
> to catch up your mind :-)

I'm confused. So the deadlock is real? How is it triggered?

Thanks,
Shaohua

^ permalink raw reply

* Re: [GIT PULL] MD update for 4.11
From: Shaohua Li @ 2017-02-24 18:36 UTC (permalink / raw)
  To: torvalds; +Cc: linux-kernel, linux-raid, neilb
In-Reply-To: <20170224155206.tu7ub3rp7wv4faoq@kernel.org>

On Fri, Feb 24, 2017 at 07:52:06AM -0800, Shaohua Li wrote:
> Hi,
> 
> please pull MD update for 4.11, this pull mainly fixes bugs and improves
> performance:
> - Improve scalability for raid1 from Coly
> - Improve raid5-cache read performance, disk efficiency and IO pattern from
>   Song and me
> - Fix a race condition of disk hotplug for linear from Coly
> - A few cleanup patches from Ming and Byungchul
> - Fix a memory leak from Neil
> - Fix WRITE SAME IO failure from me
> - Add doc for raid5-cache from me

I forgot to mention there is a merge conflict because of a last minute fix.
Below is the fix. Sorry about this.

Thanks,
Shaohua


diff --cc drivers/md/raid1.c
index 830ff2b,d4e8796..0000000
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@@ -1385,8 -1515,7 +1512,8 @@@ static void raid1_write_request(struct 
  				   conf->mirrors[i].rdev->data_offset);
  		mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
  		mbio->bi_end_io	= raid1_end_write_request;
 -		bio_set_op_attrs(mbio, op, do_fua | do_sync);
 +		mbio->bi_opf = bio_op(bio) |
- 			(bio->bi_opf & (REQ_SYNC | REQ_PREFLUSH | REQ_FUA));
++			(bio->bi_opf & (REQ_SYNC | REQ_FUA));
  		if (test_bit(FailFast, &conf->mirrors[i].rdev->flags) &&
  		    !test_bit(WriteMostly, &conf->mirrors[i].rdev->flags) &&
  		    conf->raid_disks - mddev->degraded > 1)

> 
> Thanks,
> Shaohua
> 
> The following changes since commit 7089db84e356562f8ba737c29e472cc42d530dbc:
> 
>   Linux 4.10-rc8 (2017-02-12 13:03:20 -0800)
> 
> are available in the git repository at:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/shli/md.git for-next
> 
> for you to fetch changes up to 1ec492232ed659acde8cc00b9ecc7529778e03e1:
> 
>   md/raid1: fix write behind issues introduced by bio_clone_bioset_partial (2017-02-23 11:59:44 -0800)
> 
> ----------------------------------------------------------------
> Byungchul Park (1):
>       md/raid5: Don't reinvent the wheel but use existing llist API
> 
> Ming Lei (5):
>       block: introduce bio_clone_bioset_partial()
>       md: fail if mddev->bio_set can't be created
>       md/raid1: use bio_clone_bioset_partial() in case of write behind
>       md: remove unnecessary check on mddev
>       md: fast clone bio in bio_clone_mddev()
> 
> NeilBrown (1):
>       md: ensure md devices are freed before module is unloaded.
> 
> Shaohua Li (10):
>       raid5: only dispatch IO from raid5d for harddisk raid
>       Documentation: move MD related doc into a separate dir
>       MD: add doc for raid5-cache
>       md/raid5-cache: stripe reclaim only counts valid stripes
>       md/raid5-cache: exclude reclaiming stripes in reclaim check
>       md: disable WRITE SAME if it fails in underlayer disks
>       md/raid1: fix a use-after-free bug
>       md/linear: shutup lockdep warnning
>       md/raid1: handle flush request correctly
>       md/raid1: fix write behind issues introduced by bio_clone_bioset_partial
> 
> Song Liu (3):
>       EXPORT_SYMBOL radix_tree_replace_slot
>       md/r5cache: enable chunk_aligned_read with write back cache
>       md/r5cache: improve journal device efficiency
> 
> colyli@suse.de (3):
>       md linear: fix a race between linear_add() and linear_congested()
>       RAID1: a new I/O barrier implementation to remove resync window
>       RAID1: avoid unnecessary spin locks in I/O barrier code
> 
>  Documentation/00-INDEX                |   4 +-
>  Documentation/admin-guide/md.rst      |   5 +
>  Documentation/{ => md}/md-cluster.txt |   0
>  Documentation/md/raid5-cache.txt      | 109 +++++++
>  block/bio.c                           |  61 +++-
>  drivers/md/faulty.c                   |   2 +-
>  drivers/md/linear.c                   |  41 ++-
>  drivers/md/linear.h                   |   1 +
>  drivers/md/md.c                       |  22 +-
>  drivers/md/md.h                       |   9 +-
>  drivers/md/multipath.c                |   1 +
>  drivers/md/raid0.c                    |   1 +
>  drivers/md/raid1.c                    | 598 +++++++++++++++++++++-------------
>  drivers/md/raid1.h                    |  58 ++--
>  drivers/md/raid10.c                   |  11 +-
>  drivers/md/raid5-cache.c              | 225 +++++++++++--
>  drivers/md/raid5.c                    | 129 ++++++--
>  drivers/md/raid5.h                    |   7 +
>  include/linux/bio.h                   |  11 +-
>  lib/radix-tree.c                      |   1 +
>  20 files changed, 943 insertions(+), 353 deletions(-)
>  rename Documentation/{ => md}/md-cluster.txt (100%)
>  create mode 100644 Documentation/md/raid5-cache.txt
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH V3 1/2] RAID1: a new I/O barrier implementation to remove resync window
From: Coly Li @ 2017-02-24 18:57 UTC (permalink / raw)
  To: Shaohua Li
  Cc: NeilBrown, linux-raid, Shaohua Li, Johannes Thumshirn,
	Guoqing Jiang
In-Reply-To: <20170224171724.4go2ahruap2nqhlq@kernel.org>

On 2017/2/25 上午1:17, Shaohua Li wrote:
> On Sat, Feb 25, 2017 at 01:06:22AM +0800, Coly Li wrote:
>> On 2017/2/24 上午7:14, NeilBrown wrote:
>>> On Thu, Feb 23 2017, Coly Li wrote:
>>>
>>>>
>>>> I tried to set up a 4 layer stacked md raid1, and reduce I/O
>>>> barrier bucket size to 8MB, running for 10 hours, there is no
>>>> deadlock observed,
>>>
>>> Try setting BARRIER_BUCKETS_NR to '1' and BARRIER_UNIT_SECTOR_BITS
>>> to 3 and make sure the write requests are larger than 1 page (and
>>> have resync happen at the same time as writes).
>>
>> Hi Neil,
>>
>> Yes, the above method triggers deadlock easily. After come to
>> understand how bios are handled in stacked raid1 and the relationship
>> between current->bio_list, plug->pending and conf->pending_bio_list, I
>> think I come to understand what you worried and the meaning of your fix.
>>
>> I totally agree and understand there will be hash conflict sooner or
>> later now. Yes we need this fix.
>>
>> Thanks to you and Shaohua, explaining the details to me, and help me
>> to catch up your mind :-)
> 
> I'm confused. So the deadlock is real? How is it triggered?

Let me explain,

There is no deadlock now, because,
1) If there is hash conflict existing, a deadlock is possible.
2) In current Linux kernel, hash conflict won't happen in real life
   2.1) regular bio maximum size is 2MB, it can only be split into 2
bios in raid1_make_request() of my new I/O barrier patch
   2.2) DISCARD bio maximum size is 4GB, it can be split into 65 bios in
raid1_make_request() of my new I/O barrier patch.
   2.3) I verified that, for any consecutive  512 integers in [0,
1<<63], there is no hash conflict by calling sector_to_idx().
   2.4) Currently there is almost no device provides LBA range exceeds
(1<<63) bytes. So in current Linux kernel with my new I/O barrier patch,
no dead lock will happen. The patch in current Linux kernel is deadlock
clean from all conditions we discussed before.

The reason why I suggest to still have your patch is,
1) If one day bi_iter.bi_size is extended from 32bit (unsigned int) to
64bit (unsigned long), a DISCARD bio will be split to more than 1024
smaller bios.
2) If a DISCARD bio is split into more than 1024 smaller bios, that
means sector_to_idx() is called with 1024+ consecutive integers. It is
100% possible to have hash conflict.
3) If hash conflict exists, the deadlock described by Neil will be possible.

What I mean is, currently there is no deadlock, because bi_iter.bi_size
is 32 bit; if in future bi_iter.bi_size extended to 64 bit, we will have
deadlock. Your fix almost does not hurt performance, improves code
readability and will avoid a potential deadlock in future (when
bi_iter.bi_size extended to 64 bit), why not have it ?

Coly

^ permalink raw reply

* Re: [PATCH V3 1/2] RAID1: a new I/O barrier implementation to remove resync window
From: Shaohua Li @ 2017-02-24 19:02 UTC (permalink / raw)
  To: Coly Li
  Cc: NeilBrown, linux-raid, Shaohua Li, Johannes Thumshirn,
	Guoqing Jiang
In-Reply-To: <77888b1a-9b5c-6171-f830-ff2d62193067@suse.de>

On Sat, Feb 25, 2017 at 02:57:26AM +0800, Coly Li wrote:
> On 2017/2/25 上午1:17, Shaohua Li wrote:
> > On Sat, Feb 25, 2017 at 01:06:22AM +0800, Coly Li wrote:
> >> On 2017/2/24 上午7:14, NeilBrown wrote:
> >>> On Thu, Feb 23 2017, Coly Li wrote:
> >>>
> >>>>
> >>>> I tried to set up a 4 layer stacked md raid1, and reduce I/O
> >>>> barrier bucket size to 8MB, running for 10 hours, there is no
> >>>> deadlock observed,
> >>>
> >>> Try setting BARRIER_BUCKETS_NR to '1' and BARRIER_UNIT_SECTOR_BITS
> >>> to 3 and make sure the write requests are larger than 1 page (and
> >>> have resync happen at the same time as writes).
> >>
> >> Hi Neil,
> >>
> >> Yes, the above method triggers deadlock easily. After come to
> >> understand how bios are handled in stacked raid1 and the relationship
> >> between current->bio_list, plug->pending and conf->pending_bio_list, I
> >> think I come to understand what you worried and the meaning of your fix.
> >>
> >> I totally agree and understand there will be hash conflict sooner or
> >> later now. Yes we need this fix.
> >>
> >> Thanks to you and Shaohua, explaining the details to me, and help me
> >> to catch up your mind :-)
> > 
> > I'm confused. So the deadlock is real? How is it triggered?
> 
> Let me explain,
> 
> There is no deadlock now, because,
> 1) If there is hash conflict existing, a deadlock is possible.
> 2) In current Linux kernel, hash conflict won't happen in real life
>    2.1) regular bio maximum size is 2MB, it can only be split into 2
> bios in raid1_make_request() of my new I/O barrier patch
>    2.2) DISCARD bio maximum size is 4GB, it can be split into 65 bios in
> raid1_make_request() of my new I/O barrier patch.
>    2.3) I verified that, for any consecutive  512 integers in [0,
> 1<<63], there is no hash conflict by calling sector_to_idx().
>    2.4) Currently there is almost no device provides LBA range exceeds
> (1<<63) bytes. So in current Linux kernel with my new I/O barrier patch,
> no dead lock will happen. The patch in current Linux kernel is deadlock
> clean from all conditions we discussed before.
> 
> The reason why I suggest to still have your patch is,
> 1) If one day bi_iter.bi_size is extended from 32bit (unsigned int) to
> 64bit (unsigned long), a DISCARD bio will be split to more than 1024
> smaller bios.
> 2) If a DISCARD bio is split into more than 1024 smaller bios, that
> means sector_to_idx() is called with 1024+ consecutive integers. It is
> 100% possible to have hash conflict.
> 3) If hash conflict exists, the deadlock described by Neil will be possible.
> 
> 
> What I mean is, currently there is no deadlock, because bi_iter.bi_size
> is 32 bit; if in future bi_iter.bi_size extended to 64 bit, we will have
> deadlock. Your fix almost does not hurt performance, improves code
> readability and will avoid a potential deadlock in future (when
> bi_iter.bi_size extended to 64 bit), why not have it ?

Let's assume there is hash conflict. We have raid10 anyway, which doesn't have
the fancy barrier. When can the deadlock be triggered? My understanding is
there isn't because we are handling bios in raid1/10d. Any thing I missed?

Thanks,
Shaohua

^ permalink raw reply

* Re: [PATCH V3 1/2] RAID1: a new I/O barrier implementation to remove resync window
From: Coly Li @ 2017-02-24 19:19 UTC (permalink / raw)
  To: Shaohua Li
  Cc: NeilBrown, linux-raid, Shaohua Li, Johannes Thumshirn,
	Guoqing Jiang
In-Reply-To: <20170224190215.uzv2cl65ry7siip5@kernel.org>

On 2017/2/25 上午3:02, Shaohua Li wrote:
> On Sat, Feb 25, 2017 at 02:57:26AM +0800, Coly Li wrote:
>> On 2017/2/25 上午1:17, Shaohua Li wrote:
>>> On Sat, Feb 25, 2017 at 01:06:22AM +0800, Coly Li wrote:
>>>> On 2017/2/24 上午7:14, NeilBrown wrote:
>>>>> On Thu, Feb 23 2017, Coly Li wrote:
>>>>>
>>>>>>
>>>>>> I tried to set up a 4 layer stacked md raid1, and reduce I/O
>>>>>> barrier bucket size to 8MB, running for 10 hours, there is no
>>>>>> deadlock observed,
>>>>>
>>>>> Try setting BARRIER_BUCKETS_NR to '1' and BARRIER_UNIT_SECTOR_BITS
>>>>> to 3 and make sure the write requests are larger than 1 page (and
>>>>> have resync happen at the same time as writes).
>>>>
>>>> Hi Neil,
>>>>
>>>> Yes, the above method triggers deadlock easily. After come to
>>>> understand how bios are handled in stacked raid1 and the relationship
>>>> between current->bio_list, plug->pending and conf->pending_bio_list, I
>>>> think I come to understand what you worried and the meaning of your fix.
>>>>
>>>> I totally agree and understand there will be hash conflict sooner or
>>>> later now. Yes we need this fix.
>>>>
>>>> Thanks to you and Shaohua, explaining the details to me, and help me
>>>> to catch up your mind :-)
>>>
>>> I'm confused. So the deadlock is real? How is it triggered?
>>
>> Let me explain,
>>
>> There is no deadlock now, because,
>> 1) If there is hash conflict existing, a deadlock is possible.
>> 2) In current Linux kernel, hash conflict won't happen in real life
>>    2.1) regular bio maximum size is 2MB, it can only be split into 2
>> bios in raid1_make_request() of my new I/O barrier patch
>>    2.2) DISCARD bio maximum size is 4GB, it can be split into 65 bios in
>> raid1_make_request() of my new I/O barrier patch.
>>    2.3) I verified that, for any consecutive  512 integers in [0,
>> 1<<63], there is no hash conflict by calling sector_to_idx().
>>    2.4) Currently there is almost no device provides LBA range exceeds
>> (1<<63) bytes. So in current Linux kernel with my new I/O barrier patch,
>> no dead lock will happen. The patch in current Linux kernel is deadlock
>> clean from all conditions we discussed before.
>>
>> The reason why I suggest to still have your patch is,
>> 1) If one day bi_iter.bi_size is extended from 32bit (unsigned int) to
>> 64bit (unsigned long), a DISCARD bio will be split to more than 1024
>> smaller bios.
>> 2) If a DISCARD bio is split into more than 1024 smaller bios, that
>> means sector_to_idx() is called with 1024+ consecutive integers. It is
>> 100% possible to have hash conflict.
>> 3) If hash conflict exists, the deadlock described by Neil will be possible.
>>
>>
>> What I mean is, currently there is no deadlock, because bi_iter.bi_size
>> is 32 bit; if in future bi_iter.bi_size extended to 64 bit, we will have
>> deadlock. Your fix almost does not hurt performance, improves code
>> readability and will avoid a potential deadlock in future (when
>> bi_iter.bi_size extended to 64 bit), why not have it ?
> 
> Let's assume there is hash conflict. We have raid10 anyway, which doesn't have
> the fancy barrier. When can the deadlock be triggered? My understanding is
> there isn't because we are handling bios in raid1/10d. Any thing I missed?
> 

Oho, yeah, when we discuss indirect bio handling by raid1d, no barrier
bucket idx mentioned at all. That means even without the new I/O barrier
code, it does not lock up at all.

Yes you are right, ignore my noise please :-)

Coly




^ permalink raw reply

* Re: [BUG] non-metadata arrays cannot use more than 27 component devices
From: Phil Turmel @ 2017-02-24 20:46 UTC (permalink / raw)
  To: ian_bruce, linux-raid
In-Reply-To: <20170224084024.4dfe83a2.ian_bruce@mail.ru>

On 02/24/2017 11:40 AM, ian_bruce@mail.ru wrote:
> On Fri, 24 Feb 2017 10:20:52 -0500 Phil Turmel <philip@turmel.org> 
> wrote:
> 
>> Considering the existence of --build is strictly to support arrays 
>> that predate MD raid, it seems a bit of a stretch to claim this as
>> a bug instead of a feature request.
> 
> quoting from the mdadm manual page:

Quote all you like, it doesn't change the history. Note that build mode
doesn't support a bunch of other MD raid features either, like all of
the parity raid levels.  That it doesn't support v1+ metadata isn't a
surprise, and isn't the only legacy feature that only uses legacy
metadata (built-in kernel auto-assembly gets the most whining, actually).

Anyways, though I can't speak for the maintainers, it seems that build
mode is there to keep the MD maintainers from being yelled at by Linus
for breaking legacy setups.  Nothing more.

If you think its trivial to implement --build with v1.x metadata, go
right ahead.  Post your patches for review.

Phil

^ permalink raw reply

* raid5 - adding journal to an existing device?
From: Tomasz Chmielewski @ 2017-02-25 14:12 UTC (permalink / raw)
  To: linux-raid

mdadm manual specifies the following for create, build and grow modes:

        --write-journal
               Specify journal device for the RAID-4/5/6 array.

However:

# mdadm --grow /dev/md2 --write-journal /dev/sdc1
mdadm: :option --write-journal not valid in grow mode

Is it not yet supported, will never be supported, or am I simply trying 
to use it in a  wrong way?

Tried the following versions:

# ./mdadm -V
mdadm - v3.4-115-ge22fe3a - 29th November 2016

# mdadm -V
mdadm - v3.4 - 28th January 2016

Tomasz Chmielewski
https://lxadm.com

^ permalink raw reply

* Re: [PATCH v1 01/14] block: introduce bio_segments_all()
From: Christoph Hellwig @ 2017-02-25 18:22 UTC (permalink / raw)
  To: Ming Lei
  Cc: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig
In-Reply-To: <1487950971-1131-2-git-send-email-tom.leiming@gmail.com>

> +static inline unsigned bio_segments_all(struct bio *bio)
> +{
> +	WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
> +
> +	return bio->bi_vcnt;
> +}

I don't think this helper really adds any benefit.

^ permalink raw reply

* Re: [PATCH v1 02/14] block: introduce bio_remove_last_page()
From: Christoph Hellwig @ 2017-02-25 18:23 UTC (permalink / raw)
  To: Ming Lei
  Cc: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig
In-Reply-To: <1487950971-1131-3-git-send-email-tom.leiming@gmail.com>

On Fri, Feb 24, 2017 at 11:42:39PM +0800, Ming Lei wrote:
> MD need this helper to remove the last added page, so introduce
> it.

If MD really has a valid use case for this it should open code the
operation.  The semantics look deeply fishy to me.

^ permalink raw reply

* Re: [BUG] non-metadata arrays cannot use more than 27 component devices
From: Anthony Youngman @ 2017-02-25 20:05 UTC (permalink / raw)
  To: Phil Turmel, ian_bruce, linux-raid
In-Reply-To: <1e40da0d-b175-9ff5-d2e5-cf1f25aacc26@turmel.org>

On 24/02/17 20:46, Phil Turmel wrote:
> On 02/24/2017 11:40 AM, ian_bruce@mail.ru wrote:
>> On Fri, 24 Feb 2017 10:20:52 -0500 Phil Turmel <philip@turmel.org>
>> wrote:
>>
>>> Considering the existence of --build is strictly to support arrays
>>> that predate MD raid, it seems a bit of a stretch to claim this as
>>> a bug instead of a feature request.
>> quoting from the mdadm manual page:
> Quote all you like, it doesn't change the history. Note that build mode
> doesn't support a bunch of other MD raid features either, like all of
> the parity raid levels.  That it doesn't support v1+ metadata isn't a
> surprise, and isn't the only legacy feature that only uses legacy
> metadata (built-in kernel auto-assembly gets the most whining, actually).
>
> Anyways, though I can't speak for the maintainers, it seems that build
> mode is there to keep the MD maintainers from being yelled at by Linus
> for breaking legacy setups.  Nothing more.

Although I would have thought build mode was superb for doing backups 
without needing to stop using the system ... I haven't seen any 
documentation about things like breaking raid to do backups and all that 
sort of thing.

I need to investigate it, but I'd like to know how to suspend a mirror, 
back it up, and then resume. The databases I work with have an option 
that suspends all new writes, but flushes all current transactions to 
disk so the disk is consistent for backing up. So if you do that and 
back up the database you know your backup is consistent.

This is all a rather important usage of raid, actually, imho. It seems 
so obvious - create a temporary mirror, wait for the sync to complete, 
suspend i/o to get the disk consistent, then you can break the mirror 
and carry on. Terabytes :-) of data safely backed up in the space of 
seconds.

Cheers,
Wol

^ permalink raw reply

* Re: [BUG] non-metadata arrays cannot use more than 27 component devices
From: Phil Turmel @ 2017-02-25 22:00 UTC (permalink / raw)
  To: Anthony Youngman, ian_bruce, linux-raid
In-Reply-To: <c23b2aca-b6e6-37b8-3238-e59423d8f5a7@youngman.org.uk>

On 02/25/2017 03:05 PM, Anthony Youngman wrote:

> Although I would have thought build mode was superb for doing
> backups without needing to stop using the system ... I haven't seen
> any documentation about things like breaking raid to do backups and
> all that sort of thing.
> 
> I need to investigate it, but I'd like to know how to suspend a
> mirror, back it up, and then resume. The databases I work with have
> an option that suspends all new writes, but flushes all current
> transactions to disk so the disk is consistent for backing up. So if
> you do that and back up the database you know your backup is
> consistent.
> 
> This is all a rather important usage of raid, actually, imho. It
> seems so obvious - create a temporary mirror, wait for the sync to
> complete, suspend i/o to get the disk consistent, then you can break
> the mirror and carry on. Terabytes :-) of data safely backed up in
> the space of seconds.

No. Don't go there.  There's already a technology out there that does
this correctly, called LVM snapshots.  And they let you resume normal
operations after a very brief hesitation, and the snapshot holds the
static image while you copy it off.

Phil

^ permalink raw reply

* Re: [BUG] non-metadata arrays cannot use more than 27 component devices
From: Wols Lists @ 2017-02-25 23:30 UTC (permalink / raw)
  To: Phil Turmel, ian_bruce, linux-raid
In-Reply-To: <aee1f55b-cedf-15e9-5865-7fc0cdf9a188@turmel.org>

On 25/02/17 22:00, Phil Turmel wrote:
>> This is all a rather important usage of raid, actually, imho. It
>> seems so obvious - create a temporary mirror, wait for the sync to
>> complete, suspend i/o to get the disk consistent, then you can break
>> the mirror and carry on. Terabytes :-) of data safely backed up in
>> the space of seconds.

> No. Don't go there.  There's already a technology out there that does
> this correctly, called LVM snapshots.  And they let you resume normal
> operations after a very brief hesitation, and the snapshot holds the
> static image while you copy it off.

Will it let you put that snapshot on a hot-plug disk you can remove? For
my little system I'd quite happily mirror it off onto a hard-disk and
unplug it.

Oh - and I'm not running lvm. Not that I think there's anything wrong
with that, it's just yet another layer that I'm not (currently)
comfortable with.

Is there a sound technical reason not to go there, or is it simply a
case of "learn another tool for that job"? The less tools I have to know
the better, imho.

(Although why I'm worrying, I don't know. I know btrfs is planning to
make that obsolete :-)

Cheers,
Wol

^ permalink raw reply

* Re: [BUG] non-metadata arrays cannot use more than 27 component devices
From: Phil Turmel @ 2017-02-25 23:41 UTC (permalink / raw)
  To: Wols Lists, ian_bruce, linux-raid
In-Reply-To: <58B2137B.6070608@youngman.org.uk>

On 02/25/2017 06:30 PM, Wols Lists wrote:
> On 25/02/17 22:00, Phil Turmel wrote:

>> No. Don't go there.  There's already a technology out there that does
>> this correctly, called LVM snapshots.  And they let you resume normal
>> operations after a very brief hesitation, and the snapshot holds the
>> static image while you copy it off.
> 
> Will it let you put that snapshot on a hot-plug disk you can remove? For
> my little system I'd quite happily mirror it off onto a hard-disk and
> unplug it.

You can copy it off to any block device you like, or dd it to a file, or
dd and gzip to a compressed file.  Anything you can do to copy a
partition to backup can be used on the snapshot.

> Oh - and I'm not running lvm. Not that I think there's anything wrong
> with that, it's just yet another layer that I'm not (currently)
> comfortable with.

So you know how to use a hammer, and don't feel comfortable with using a
handsaw, so you're going to smash a board in two instead of sawing it?

Ok, maybe that was too facetious. (-:

> Is there a sound technical reason not to go there, or is it simply a
> case of "learn another tool for that job"? The less tools I have to know
> the better, imho.

Um, no, imnsho.  Learn new tools when you need them.

Linux raid has no formal mechanism to cleanly separate a mirror from a
running array, access it as a backup, and not risk corruption when
re-attaching it to the array.  Most filesystems write to the partition
when mounting, even for read-only mounts.  You cannot safely access the
disconnected member except via pure block reads.

Phil

^ permalink raw reply

* Re: [BUG] non-metadata arrays cannot use more than 27 component devices
From: Wols Lists @ 2017-02-25 23:55 UTC (permalink / raw)
  To: Phil Turmel, ian_bruce, linux-raid
In-Reply-To: <5172e2ab-e193-477b-52c4-86fbab0d52fe@turmel.org>

On 25/02/17 23:41, Phil Turmel wrote:
>> Is there a sound technical reason not to go there, or is it simply a
>> case of "learn another tool for that job"? The less tools I have to know
>> the better, imho.

> Um, no, imnsho.  Learn new tools when you need them.

I don't have a problem with that. All too often people use the tool
they're familiar with when it's the wrong tool. But there's a reason
they do that - it's a familiar tool!
> 
> Linux raid has no formal mechanism to cleanly separate a mirror from a
> running array, access it as a backup, and not risk corruption when
> re-attaching it to the array.  Most filesystems write to the partition
> when mounting, even for read-only mounts.  You cannot safely access the
> disconnected member except via pure block reads.

Because to do so doesn't make sense? Or because nobody's bothered to do
it? I get grumpy when people implement corner cases without bothering to
implement the logically sensible options - bit like those extremely
annoying dialog boxes that give you three choices, "yes", "no", "yes to
all". What about no to all?

I feel like mirror-raid is perfect for doing backups. I take your point
that linux hasn't implemented that feature (particularly well), but
surely it's a feature that *should* be there. I know I know - "patches
welcome" :-)

Cheers,
Wol

^ permalink raw reply

* Re: [BUG] non-metadata arrays cannot use more than 27 component devices
From: Phil Turmel @ 2017-02-26  0:07 UTC (permalink / raw)
  To: Wols Lists, ian_bruce, linux-raid
In-Reply-To: <58B21987.6060604@youngman.org.uk>

On 02/25/2017 06:55 PM, Wols Lists wrote:
> On 25/02/17 23:41, Phil Turmel wrote:
>>> Is there a sound technical reason not to go there, or is it simply a
>>> case of "learn another tool for that job"? The less tools I have to know
>>> the better, imho.
> 
>> Um, no, imnsho.  Learn new tools when you need them.
> 
> I don't have a problem with that. All too often people use the tool
> they're familiar with when it's the wrong tool. But there's a reason
> they do that - it's a familiar tool!
>>
>> Linux raid has no formal mechanism to cleanly separate a mirror from a
>> running array, access it as a backup, and not risk corruption when
>> re-attaching it to the array.  Most filesystems write to the partition
>> when mounting, even for read-only mounts.  You cannot safely access the
>> disconnected member except via pure block reads.
> 
> Because to do so doesn't make sense? Or because nobody's bothered to do
> it? I get grumpy when people implement corner cases without bothering to
> implement the logically sensible options - bit like those extremely
> annoying dialog boxes that give you three choices, "yes", "no", "yes to
> all". What about no to all?

Because while the member is disconnected, the array accumulates
write-intent bits indicating where the disconnected device is out of
date, but the array has no way to know what writes are happening to that
member.  And therefore any re-add will introduce unknowable corruptions.
There is no way to control what writes happen to that member, and
drives don't naturally keep a log of writes that have happened.  The data to
safely do what you want simply doesn't exist.  Your only known safe
choice is to disable write-intent bitmaps, forcing complete resync on
--re-add.

> I feel like mirror-raid is perfect for doing backups.

Your feelings are wrong.  Sorry.  LVM is the perfect tool because it
entirely controls the snapshot and doesn't have to re-add it.

> I take your point
> that linux hasn't implemented that feature (particularly well), but
> surely it's a feature that *should* be there. I know I know - "patches
> welcome" :-)

Good luck creating the necessary data from thin air.  It's not a
question of writing patches.

Phil

^ permalink raw reply

* RAID10 reshape and change layout possible?
From: Reindl Harald @ 2017-02-26 14:37 UTC (permalink / raw)
  To: linux-raid

since in recent news i saw that converting several raid-levels with reshape 
is possible now

is it somehow possible to change an existing md RAID10 array from "2 
near-copies" to "2 far-copies" and what would be the mdadm syntax? the 
IO intense workload here are large reads within virtual disks and as far 
as i can see far-layout would here be the better option, but as the stuff 
was installed in 2011 i am not sure if anaconda even offered that in the 
OS installer

so i would like to change the layout "on-the-fly" without dumping/restoring 
the data (which is hard for the OS itself while keeping dracut and friends 
working just fine) and, also important, keep all the UUIDs (there is a 
second cloned machine which is in large parts rsynced including 
grub-config and fstab)

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox