Linux RAID subsystem development

Linux RAID subsystem development
 help / color / mirror / Atom feed

* Re: [PATCH v2 09/13] md: raid1: use bio_segments_all()
From: Shaohua Li @ 2017-02-28 23:42 UTC (permalink / raw)
  To: Ming Lei; +Cc: Jens Axboe, linux-raid, linux-block, Christoph Hellwig
In-Reply-To: <1488296503-4987-10-git-send-email-tom.leiming@gmail.com>

On Tue, Feb 28, 2017 at 11:41:39PM +0800, Ming Lei wrote:
> Use this helper, instead of direct access to .bi_vcnt.

what We really need to do for the behind IO is:
- allocate memory and copy bio data to the memory
- let behind bio do IO against the memory

The behind bio doesn't need to have the exactly same bio_vec setting. If we
just track the new memory, we don't need use the bio_segments_all and access
bio_vec too.

Thanks,
Shaohua
 
> Signed-off-by: Ming Lei <tom.leiming@gmail.com>
> ---
>  drivers/md/raid1.c | 7 ++++---
>  1 file changed, 4 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index 316bd6dd6cc1..7396c99ff7b1 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -1091,7 +1091,8 @@ static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio)
>  {
>  	int i;
>  	struct bio_vec *bvec;
> -	struct bio_vec *bvecs = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec),
> +	unsigned vcnt = bio_segments_all(bio);
> +	struct bio_vec *bvecs = kzalloc(vcnt * sizeof(struct bio_vec),
>  					GFP_NOIO);
>  	if (unlikely(!bvecs))
>  		return;
> @@ -1107,12 +1108,12 @@ static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio)
>  		kunmap(bvec->bv_page);
>  	}
>  	r1_bio->behind_bvecs = bvecs;
> -	r1_bio->behind_page_count = bio->bi_vcnt;
> +	r1_bio->behind_page_count = vcnt;
>  	set_bit(R1BIO_BehindIO, &r1_bio->state);
>  	return;
>  
>  do_sync_io:
> -	for (i = 0; i < bio->bi_vcnt; i++)
> +	for (i = 0; i < vcnt; i++)
>  		if (bvecs[i].bv_page)
>  			put_page(bvecs[i].bv_page);
>  	kfree(bvecs);
> -- 
> 2.7.4
> 

^ permalink raw reply

* Re: [PATCH v2 11/13] md: raid10: don't use bio's vec table to manage resync pages
From: Shaohua Li @ 2017-02-28 23:43 UTC (permalink / raw)
  To: Ming Lei; +Cc: Jens Axboe, linux-raid, linux-block, Christoph Hellwig
In-Reply-To: <1488296503-4987-12-git-send-email-tom.leiming@gmail.com>

On Tue, Feb 28, 2017 at 11:41:41PM +0800, Ming Lei wrote:
> Now we allocate one page array for managing resync pages, instead
> of using bio's vec table to do that, and the old way is very hacky
> and won't work any more if multipage bvec is enabled.
> 
> The introduced cost is that we need to allocate (128 + 16) * copies
> bytes per r10_bio, and it is fine because the inflight r10_bio for
> resync shouldn't be much, as pointed by Shaohua.
> 
> Also bio_reset() in raid10_sync_request() and reshape_request()
> are removed because all bios are freshly new now in these functions
> and not necessary to reset any more.
> 
> This patch can be thought as cleanup too.
> 
> Suggested-by: Shaohua Li <shli@kernel.org>
> Signed-off-by: Ming Lei <tom.leiming@gmail.com>
> ---
>  drivers/md/raid10.c | 125 ++++++++++++++++++++++++++++++----------------------
>  1 file changed, 73 insertions(+), 52 deletions(-)
> 
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index a9ddd4f14008..f887b21332e7 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -110,6 +110,16 @@ static void end_reshape(struct r10conf *conf);
>  #define raid10_log(md, fmt, args...)				\
>  	do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0)
>  
> +static inline struct resync_pages *get_resync_pages(struct bio *bio)
> +{
> +	return bio->bi_private;
> +}
> +
> +static inline struct r10bio *get_resync_r10bio(struct bio *bio)
> +{
> +	return get_resync_pages(bio)->raid_bio;
> +}

Same applies to raid10 too. embedded the resync_pages into r10bio, possibly a
pointer. Merge the 11, 12, 13 into a single patch.

Thanks,
Shaohua

> +
>  static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
>  {
>  	struct r10conf *conf = data;
> @@ -140,11 +150,11 @@ static void r10bio_pool_free(void *r10_bio, void *data)
>  static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
>  {
>  	struct r10conf *conf = data;
> -	struct page *page;
>  	struct r10bio *r10_bio;
>  	struct bio *bio;
> -	int i, j;
> -	int nalloc;
> +	int j;
> +	int nalloc, nalloc_rp;
> +	struct resync_pages *rps;
>  
>  	r10_bio = r10bio_pool_alloc(gfp_flags, conf);
>  	if (!r10_bio)
> @@ -156,6 +166,15 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
>  	else
>  		nalloc = 2; /* recovery */
>  
> +	/* allocate once for all bios */
> +	if (!conf->have_replacement)
> +		nalloc_rp = nalloc;
> +	else
> +		nalloc_rp = nalloc * 2;
> +	rps = kmalloc(sizeof(struct resync_pages) * nalloc_rp, gfp_flags);
> +	if (!rps)
> +		goto out_free_r10bio;
> +
>  	/*
>  	 * Allocate bios.
>  	 */
> @@ -175,36 +194,40 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
>  	 * Allocate RESYNC_PAGES data pages and attach them
>  	 * where needed.
>  	 */
> -	for (j = 0 ; j < nalloc; j++) {
> +	for (j = 0; j < nalloc; j++) {
>  		struct bio *rbio = r10_bio->devs[j].repl_bio;
> +		struct resync_pages *rp, *rp_repl;
> +
> +		rp = &rps[j];
> +		if (rbio)
> +			rp_repl = &rps[nalloc + j];
> +
>  		bio = r10_bio->devs[j].bio;
> -		for (i = 0; i < RESYNC_PAGES; i++) {
> -			if (j > 0 && !test_bit(MD_RECOVERY_SYNC,
> -					       &conf->mddev->recovery)) {
> -				/* we can share bv_page's during recovery
> -				 * and reshape */
> -				struct bio *rbio = r10_bio->devs[0].bio;
> -				page = rbio->bi_io_vec[i].bv_page;
> -				get_page(page);
> -			} else
> -				page = alloc_page(gfp_flags);
> -			if (unlikely(!page))
> +
> +		if (!j || test_bit(MD_RECOVERY_SYNC,
> +				   &conf->mddev->recovery)) {
> +			if (resync_alloc_pages(rp, gfp_flags))
>  				goto out_free_pages;
> +		} else {
> +			memcpy(rp, &rps[0], sizeof(*rp));
> +			resync_get_all_pages(rp);
> +		}
>  
> -			bio->bi_io_vec[i].bv_page = page;
> -			if (rbio)
> -				rbio->bi_io_vec[i].bv_page = page;
> +		rp->idx = 0;
> +		rp->raid_bio = r10_bio;
> +		bio->bi_private = rp;
> +		if (rbio) {
> +			memcpy(rp_repl, rp, sizeof(*rp));
> +			rbio->bi_private = rp_repl;
>  		}
>  	}
>  
>  	return r10_bio;
>  
>  out_free_pages:
> -	for ( ; i > 0 ; i--)
> -		safe_put_page(bio->bi_io_vec[i-1].bv_page);
> -	while (j--)
> -		for (i = 0; i < RESYNC_PAGES ; i++)
> -			safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
> +	while (--j >= 0)
> +		resync_free_pages(&rps[j * 2]);
> +
>  	j = 0;
>  out_free_bio:
>  	for ( ; j < nalloc; j++) {
> @@ -213,30 +236,34 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
>  		if (r10_bio->devs[j].repl_bio)
>  			bio_put(r10_bio->devs[j].repl_bio);
>  	}
> +	kfree(rps);
> +out_free_r10bio:
>  	r10bio_pool_free(r10_bio, conf);
>  	return NULL;
>  }
>  
>  static void r10buf_pool_free(void *__r10_bio, void *data)
>  {
> -	int i;
>  	struct r10conf *conf = data;
>  	struct r10bio *r10bio = __r10_bio;
>  	int j;
> +	struct resync_pages *rp = NULL;
>  
> -	for (j=0; j < conf->copies; j++) {
> +	for (j = conf->copies; j--; ) {
>  		struct bio *bio = r10bio->devs[j].bio;
> -		if (bio) {
> -			for (i = 0; i < RESYNC_PAGES; i++) {
> -				safe_put_page(bio->bi_io_vec[i].bv_page);
> -				bio->bi_io_vec[i].bv_page = NULL;
> -			}
> -			bio_put(bio);
> -		}
> +
> +		rp = get_resync_pages(bio);
> +		resync_free_pages(rp);
> +		bio_put(bio);
> +
>  		bio = r10bio->devs[j].repl_bio;
>  		if (bio)
>  			bio_put(bio);
>  	}
> +
> +	/* resync pages array stored in the 1st bio's .bi_private */
> +	kfree(rp);
> +
>  	r10bio_pool_free(r10bio, conf);
>  }
>  
> @@ -1935,7 +1962,7 @@ static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d)
>  
>  static void end_sync_read(struct bio *bio)
>  {
> -	struct r10bio *r10_bio = bio->bi_private;
> +	struct r10bio *r10_bio = get_resync_r10bio(bio);
>  	struct r10conf *conf = r10_bio->mddev->private;
>  	int d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
>  
> @@ -1944,6 +1971,7 @@ static void end_sync_read(struct bio *bio)
>  
>  static void end_reshape_read(struct bio *bio)
>  {
> +	/* reshape read bio isn't allocated from r10buf_pool */
>  	struct r10bio *r10_bio = bio->bi_private;
>  
>  	__end_sync_read(r10_bio, bio, r10_bio->read_slot);
> @@ -1978,7 +2006,7 @@ static void end_sync_request(struct r10bio *r10_bio)
>  
>  static void end_sync_write(struct bio *bio)
>  {
> -	struct r10bio *r10_bio = bio->bi_private;
> +	struct r10bio *r10_bio = get_resync_r10bio(bio);
>  	struct mddev *mddev = r10_bio->mddev;
>  	struct r10conf *conf = mddev->private;
>  	int d;
> @@ -2058,6 +2086,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
>  	for (i=0 ; i < conf->copies ; i++) {
>  		int  j, d;
>  		struct md_rdev *rdev;
> +		struct resync_pages *rp;
>  
>  		tbio = r10_bio->devs[i].bio;
>  
> @@ -2099,11 +2128,13 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
>  		 * First we need to fixup bv_offset, bv_len and
>  		 * bi_vecs, as the read request might have corrupted these
>  		 */
> +		rp = get_resync_pages(tbio);
>  		bio_reset(tbio);
>  
>  		tbio->bi_vcnt = vcnt;
>  		tbio->bi_iter.bi_size = fbio->bi_iter.bi_size;
> -		tbio->bi_private = r10_bio;
> +		rp->raid_bio = r10_bio;
> +		tbio->bi_private = rp;
>  		tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
>  		tbio->bi_end_io = end_sync_write;
>  		bio_set_op_attrs(tbio, REQ_OP_WRITE, 0);
> @@ -3171,10 +3202,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
>  					}
>  				}
>  				bio = r10_bio->devs[0].bio;
> -				bio_reset(bio);
>  				bio->bi_next = biolist;
>  				biolist = bio;
> -				bio->bi_private = r10_bio;
>  				bio->bi_end_io = end_sync_read;
>  				bio_set_op_attrs(bio, REQ_OP_READ, 0);
>  				if (test_bit(FailFast, &rdev->flags))
> @@ -3198,10 +3227,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
>  
>  				if (!test_bit(In_sync, &mrdev->flags)) {
>  					bio = r10_bio->devs[1].bio;
> -					bio_reset(bio);
>  					bio->bi_next = biolist;
>  					biolist = bio;
> -					bio->bi_private = r10_bio;
>  					bio->bi_end_io = end_sync_write;
>  					bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
>  					bio->bi_iter.bi_sector = to_addr
> @@ -3226,10 +3253,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
>  				if (mreplace == NULL || bio == NULL ||
>  				    test_bit(Faulty, &mreplace->flags))
>  					break;
> -				bio_reset(bio);
>  				bio->bi_next = biolist;
>  				biolist = bio;
> -				bio->bi_private = r10_bio;
>  				bio->bi_end_io = end_sync_write;
>  				bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
>  				bio->bi_iter.bi_sector = to_addr +
> @@ -3351,7 +3376,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
>  				r10_bio->devs[i].repl_bio->bi_end_io = NULL;
>  
>  			bio = r10_bio->devs[i].bio;
> -			bio_reset(bio);
>  			bio->bi_error = -EIO;
>  			rcu_read_lock();
>  			rdev = rcu_dereference(conf->mirrors[d].rdev);
> @@ -3376,7 +3400,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
>  			atomic_inc(&r10_bio->remaining);
>  			bio->bi_next = biolist;
>  			biolist = bio;
> -			bio->bi_private = r10_bio;
>  			bio->bi_end_io = end_sync_read;
>  			bio_set_op_attrs(bio, REQ_OP_READ, 0);
>  			if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
> @@ -3395,13 +3418,11 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
>  
>  			/* Need to set up for writing to the replacement */
>  			bio = r10_bio->devs[i].repl_bio;
> -			bio_reset(bio);
>  			bio->bi_error = -EIO;
>  
>  			sector = r10_bio->devs[i].addr;
>  			bio->bi_next = biolist;
>  			biolist = bio;
> -			bio->bi_private = r10_bio;
>  			bio->bi_end_io = end_sync_write;
>  			bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
>  			if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
> @@ -3440,7 +3461,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
>  		if (len == 0)
>  			break;
>  		for (bio= biolist ; bio ; bio=bio->bi_next) {
> -			page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
> +			page = resync_fetch_page(get_resync_pages(bio));
>  			/*
>  			 * won't fail because the vec table is big enough
>  			 * to hold all these pages
> @@ -3449,7 +3470,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
>  		}
>  		nr_sectors += len>>9;
>  		sector_nr += len>>9;
> -	} while (biolist->bi_vcnt < RESYNC_PAGES);
> +	} while (resync_page_available(get_resync_pages(biolist)));
>  	r10_bio->sectors = nr_sectors;
>  
>  	while (biolist) {
> @@ -3457,7 +3478,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
>  		biolist = biolist->bi_next;
>  
>  		bio->bi_next = NULL;
> -		r10_bio = bio->bi_private;
> +		r10_bio = get_resync_r10bio(bio);
>  		r10_bio->sectors = nr_sectors;
>  
>  		if (bio->bi_end_io == end_sync_read) {
> @@ -4352,6 +4373,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
>  	struct bio *blist;
>  	struct bio *bio, *read_bio;
>  	int sectors_done = 0;
> +	struct page **pages;
>  
>  	if (sector_nr == 0) {
>  		/* If restarting in the middle, skip the initial sectors */
> @@ -4502,11 +4524,9 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
>  		if (!rdev2 || test_bit(Faulty, &rdev2->flags))
>  			continue;
>  
> -		bio_reset(b);
>  		b->bi_bdev = rdev2->bdev;
>  		b->bi_iter.bi_sector = r10_bio->devs[s/2].addr +
>  			rdev2->new_data_offset;
> -		b->bi_private = r10_bio;
>  		b->bi_end_io = end_reshape_write;
>  		bio_set_op_attrs(b, REQ_OP_WRITE, 0);
>  		b->bi_next = blist;
> @@ -4516,8 +4536,9 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
>  	/* Now add as many pages as possible to all of these bios. */
>  
>  	nr_sectors = 0;
> +	pages = get_resync_pages(r10_bio->devs[0].bio)->pages;
>  	for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
> -		struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page;
> +		struct page *page = pages[s / (PAGE_SIZE >> 9)];
>  		int len = (max_sectors - s) << 9;
>  		if (len > PAGE_SIZE)
>  			len = PAGE_SIZE;
> @@ -4701,7 +4722,7 @@ static int handle_reshape_read_error(struct mddev *mddev,
>  
>  static void end_reshape_write(struct bio *bio)
>  {
> -	struct r10bio *r10_bio = bio->bi_private;
> +	struct r10bio *r10_bio = get_resync_r10bio(bio);
>  	struct mddev *mddev = r10_bio->mddev;
>  	struct r10conf *conf = mddev->private;
>  	int d;
> -- 
> 2.7.4
> 

^ permalink raw reply

* Re: [PATCH v2 13/13] md: raid10: avoid direct access to bvec table in handle_reshape_read_error
From: Shaohua Li @ 2017-02-28 23:46 UTC (permalink / raw)
  To: Ming Lei; +Cc: Jens Axboe, linux-raid, linux-block, Christoph Hellwig
In-Reply-To: <1488296503-4987-14-git-send-email-tom.leiming@gmail.com>

On Tue, Feb 28, 2017 at 11:41:43PM +0800, Ming Lei wrote:
> The cost is 128bytes(8*16) stack space in kernel thread context, and
> just use the bio helper to retrieve pages from bio.
> 
> Signed-off-by: Ming Lei <tom.leiming@gmail.com>
> ---
>  drivers/md/raid10.c | 12 ++++++++++--
>  1 file changed, 10 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index 0b97631e3905..6ffb64ab45f8 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -4670,7 +4670,15 @@ static int handle_reshape_read_error(struct mddev *mddev,
>  	struct r10bio *r10b = &on_stack.r10_bio;
>  	int slot = 0;
>  	int idx = 0;
> -	struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec;
> +	struct bio_vec *bvl;
> +	struct page *pages[RESYNC_PAGES];
> +
> +	/*
> +	 * This bio is allocated in reshape_request(), and size
> +	 * is still RESYNC_PAGES
> +	 */
> +	bio_for_each_segment_all(bvl, r10_bio->master_bio, idx)
> +		pages[idx] = bvl->bv_page;

The reshape bio is doing IO against the memory we allocated for r10_bio, I'm
wondering why we can't get the pages from r10_bio. In this way, we don't need
access the bio_vec any more.

Thanks,
Shaohua
 
>  	r10b->sector = r10_bio->sector;
>  	__raid10_find_phys(&conf->prev, r10b);
> @@ -4699,7 +4707,7 @@ static int handle_reshape_read_error(struct mddev *mddev,
>  			success = sync_page_io(rdev,
>  					       addr,
>  					       s << 9,
> -					       bvec[idx].bv_page,
> +					       pages[idx],
>  					       REQ_OP_READ, 0, false);
>  			rdev_dec_pending(rdev, mddev);
>  			rcu_read_lock();
> -- 
> 2.7.4
> 

^ permalink raw reply

* Re: GRUB warning after replacing disk drive in RAID1
From: Reindl Harald @ 2017-03-01  0:12 UTC (permalink / raw)
  To: linux-raid
In-Reply-To: <009601d29218$9fd3e460$df7bad20$@wnsdev.com>



Am 01.03.2017 um 00:15 schrieb Peter Sangas:
> Thanks for your help.  See below for output

Personalities : [raid1] [linear] [multipath] [raid0] [raid6] [raid5] [raid4]

not sure if it means something but i only see the on the machines used 
raid-levels at that line - looks like i am out of ideas for now, i saw 
the "grub-install: warning: Couldn't find physical volume `(null)'" 
messages (while it at the same time said it was successful) as i removed 
2 out of my 4 drives from the RAID10 by using the script below and after 
both where in sync "grub2-sintall /dev/sd[a-d]" was completly silent again

___________________________________________________

#!/bin/bash

GOOD_DISK="/dev/sda"
BAD_DISK="/dev/sdc"

# clone MBR
dd if=$GOOD_DISK of=$BAD_DISK bs=512 count=1

# force OS to read partition tables
partprobe $BAD_DISK

# start RAID recovery
mdadm /dev/md0 --add ${BAD_DISK}1
mdadm /dev/md1 --add ${BAD_DISK}2
mdadm /dev/md2 --add ${BAD_DISK}3

# print RAID status on screen
sleep 5
cat /proc/mdstat

# install bootloader on replacement disk
grub2-install "$BAD_DISK"
___________________________________________________


> -----Original Message-----
> From: Reindl Harald [mailto:h.reindl@thelounge.net]
> Sent: Tuesday, February 28, 2017 2:34 PM
> To: linux-raid@vger.kernel.org
> Subject: Re: GRUB warning after replacing disk drive in RAID1
>
>
>
> Am 28.02.2017 um 22:01 schrieb Peter Sangas:
>> But I issue the grub command AFTER the re-sync is completed
>
>>>> output of "cat /proc/mdstat" and your environment missing!
>
>>>> * cat /proc/mdstat
>>>> * df -hT
>>>> * lsscsi
>>>> * lsblk
>
>>>> no pictures and interpretations, just copy&paste from the terminal (input
> as well as output)
>
>>>> please help others to help you
>
>
> cat /proc/mdstat
> Personalities : [raid1] [linear] [multipath] [raid0] [raid6] [raid5] [raid4]
> [raid10]
> md3 : active raid1 sdc5[3] sdb5[1] sda5[0]
>       97589248 blocks super 1.2 [3/3] [UUU]
>
> md1 : active raid1 sdc2[3] sdb2[1] sda2[0]
>       126887936 blocks super 1.2 [3/3] [UUU]
>       bitmap: 0/1 pages [0KB], 65536KB chunk
>
> md5 : active raid1 sdc7[3] sdb7[1] sda7[0]
>       244169728 blocks super 1.2 [3/3] [UUU]
>       bitmap: 0/2 pages [0KB], 65536KB chunk
>
> md2 : active raid1 sdc3[3] sdb3[1] sda3[0]
>       195181568 blocks super 1.2 [3/3] [UUU]
>       bitmap: 1/2 pages [4KB], 65536KB chunk
>
> md4 : active raid1 sdc6[3] sdb6[1] sda6[0]
>       97589248 blocks super 1.2 [3/3] [UUU]
>
> md0 : active raid1 sdc1[3] sdb1[1] sda1[0]
>       19514368 blocks super 1.2 [3/3] [UUU]
>
> unused devices: <none>
>
> uname -a
> Linux green 4.4.0-47-generic #68-Ubuntu SMP Wed Oct 26 19:39:52 UTC 2016
> x86_64 x86_64 x86_64 GNU/Linux
>
> df -hT
> Filesystem     Type      Size  Used Avail Use% Mounted on
> udev           devtmpfs   63G     0   63G   0% /dev
> tmpfs          tmpfs      13G  746M   12G   6% /run
> /dev/md2       ext4      184G   31G  144G  18% /
> tmpfs          tmpfs      63G     0   63G   0% /dev/shm
> tmpfs          tmpfs     5.0M     0  5.0M   0% /run/lock
> tmpfs          tmpfs      63G     0   63G   0% /sys/fs/cgroup
> /dev/md0       ext4       19G  289M   17G   2% /boot
> /dev/md3       ext4       92G   40G   48G  46% /cl
> /dev/md5       ext4      230G   31G  187G  15% /sd
> /dev/md4       ext4       92G   20G   68G  22% /pc
> tan:/clbck     nfs4      596G  169G  398G  30% /clbck
> tan:/sdbck     nfs4      596G  169G  398G  30% /sdbck
> tmpfs          tmpfs      13G  4.0K   13G   1% /run/user/275
> /dev/sde1      ext3      2.7T  676G  1.9T  26% /archive
> tmpfs          tmpfs      13G  4.0K   13G   1% /run/user/286
> /dev/sdd1      ext3      1.8T  1.6T  182G  90% /backupdisk
> tmpfs          tmpfs      13G   12K   13G   1% /run/user/277
> tmpfs          tmpfs      13G     0   13G   0% /run/user/283
> tmpfs          tmpfs      13G  4.0K   13G   1% /run/user/280
> tmpfs          tmpfs      13G  4.0K   13G   1% /run/user/285
> tmpfs          tmpfs      13G     0   13G   0% /run/user/299
> tmpfs          tmpfs      13G     0   13G   0% /run/user/1100
> tmpfs          tmpfs      13G     0   13G   0% /run/user/1685
>
>
> lsscsi
> [2:0:0:0]    disk    ATA      WDC WD30EZRS-00J 0A80  /dev/sde
> [3:0:0:0]    disk    ATA      WDC WD2000FYYZ-0 1K03  /dev/sdd
> [4:0:0:0]    disk    ATA      INTEL SSDSC2BX80 0140  /dev/sda
> [5:0:0:0]    disk    ATA      INTEL SSDSC2BX80 0140  /dev/sdb
> [6:0:0:0]    disk    ATA      INTEL SSDSC2BX80 0140  /dev/sdc
>
> lsblk
> NAME    MAJ:MIN RM   SIZE RO TYPE  MOUNTPOINT
> sda       8:0    0 745.2G  0 disk
> +-sda1    8:1    0  18.6G  0 part
> │ L-md0   9:0    0  18.6G  0 raid1 /boot
> +-sda2    8:2    0 121.1G  0 part
> │ L-md1   9:1    0   121G  0 raid1 [SWAP]
> +-sda3    8:3    0 186.3G  0 part
> │ L-md2   9:2    0 186.1G  0 raid1 /
> +-sda4    8:4    0     1K  0 part
> +-sda5    8:5    0  93.1G  0 part
> │ L-md3   9:3    0  93.1G  0 raid1 /cl
> +-sda6    8:6    0  93.1G  0 part
> │ L-md4   9:4    0  93.1G  0 raid1 /pc
> L-sda7    8:7    0   233G  0 part
>   L-md5   9:5    0 232.9G  0 raid1 /sd
> sdb       8:16   0 745.2G  0 disk
> +-sdb1    8:17   0  18.6G  0 part
> │ L-md0   9:0    0  18.6G  0 raid1 /boot
> +-sdb2    8:18   0 121.1G  0 part
> │ L-md1   9:1    0   121G  0 raid1 [SWAP]
> +-sdb3    8:19   0 186.3G  0 part
> │ L-md2   9:2    0 186.1G  0 raid1 /
> +-sdb4    8:20   0     1K  0 part
> +-sdb5    8:21   0  93.1G  0 part
> │ L-md3   9:3    0  93.1G  0 raid1 /cl
> +-sdb6    8:22   0  93.1G  0 part
> │ L-md4   9:4    0  93.1G  0 raid1 /pc
> L-sdb7    8:23   0   233G  0 part
>   L-md5   9:5    0 232.9G  0 raid1 /sd
> sdc       8:32   0 745.2G  0 disk
> +-sdc1    8:33   0  18.6G  0 part
> │ L-md0   9:0    0  18.6G  0 raid1 /boot
> +-sdc2    8:34   0 121.1G  0 part
> │ L-md1   9:1    0   121G  0 raid1 [SWAP]
> +-sdc3    8:35   0 186.3G  0 part
> │ L-md2   9:2    0 186.1G  0 raid1 /
> +-sdc4    8:36   0     1K  0 part
> +-sdc5    8:37   0  93.1G  0 part
> │ L-md3   9:3    0  93.1G  0 raid1 /cl
> +-sdc6    8:38   0  93.1G  0 part
> │ L-md4   9:4    0  93.1G  0 raid1 /pc
> L-sdc7    8:39   0   233G  0 part
>   L-md5   9:5    0 232.9G  0 raid1 /sd
> sdd       8:48   0   1.8T  0 disk
> L-sdd1    8:49   0   1.8T  0 part  /backupdisk
> sde       8:64   0   2.7T  0 disk
> L-sde1    8:65   0   2.7T  0 part  /archive
>
>> -----Original Message-----
>> From: Reindl Harald [mailto:h.reindl@thelounge.net]
>> Sent: Tuesday, February 28, 2017 1:23 AM
>> To: linux-raid@vger.kernel.org
>> Subject: Re: GRUB warning after replacing disk drive in RAID1
>>
>> Am 28.02.2017 um 00:37 schrieb Peter Sangas:
>>> I have a RAID1 with 3 disks sda,sdb,sdc.  After replacing sdc and
>>> re-syncing it to the array I issued the following command to load
>>> grub but I get this
>>> warning:
>>>
>>> grub-install /dev/sdc
>>>
>>> Installing for i386-pc platform.
>>> grub-install: warning: Couldn't find physical volume `(null)'. Some
>>> modules may be missing from core image..
>>> grub-install: warning: Couldn't find physical volume `(null)'. Some
>>> modules may be missing from core image..
>>> Installation finished. No error reported.
>>>
>>> Does anyone know why I get this warning and how to avoid it
>>
>> it's harmless and disappears after the resync finished

^ permalink raw reply

* [PATCH 06/14] md-cluster: use sync way to handle METADATA_UPDATED msg
From: Guoqing Jiang @ 2017-03-01  2:02 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, neilb, Guoqing Jiang
In-Reply-To: <1487906124-20107-1-git-send-email-gqjiang@suse.com>

Previously, when node received METADATA_UPDATED msg, it just
need to wakeup mddev->thread, then md_reload_sb will be called
eventually.

We taken the asynchronous way to avoid a deadlock issue, the
deadlock issue could happen when one node is receiving the
METADATA_UPDATED msg (wants reconfig_mutex) and trying to run
the path:

md_check_recovery -> mddev_trylock(hold reconfig_mutex)
                  -> md_update_sb-metadata_update_start
		     (want EX on token however token is
		      got by the sending node)

Since we will support resizing for clustered raid, and we
need the metadata update handling to be synchronous so that
the initiating node can detect failure, so we need to change
the way for handling METADATA_UPDATED msg.

But, we obviously need to avoid above deadlock with the
sync way. To make this happen, we considered to not hold
reconfig_mutex to call md_reload_sb, if some other thread
has already taken reconfig_mutex and waiting for the 'token',
then process_recvd_msg() can safely call md_reload_sb()
without taking the mutex. This is because we can be certain
that no other thread will take the mutex, and we also certain
that the actions performed by md_reload_sb() won't interfere
with anything that the other thread is in the middle of.

To make this more concrete, we added a new cinfo->state bit
        MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD

Which is set in lock_token() just before dlm_lock_sync() is
called, and cleared just after. As lock_token() is always
called with reconfig_mutex() held (the specific case is the
resync_info_update which is distinguished well in previous
patch), if process_recvd_msg() finds that the new bit is set,
then the mutex must be held by some other thread, and it will
keep waiting.

So process_metadata_update() can call md_reload_sb() if either
mddev_trylock() succeeds, or if MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD
is set. The tricky bit is what to do if neither of these apply.
We need to wait. Fortunately mddev_unlock() always calls wake_up()
on mddev->thread->wqueue. So we can get lock_token() to call
wake_up() on that when it sets the bit.

Also remove RELOAD_SB related codes since there are not valid
anymore.

Finally, thanks to Neil for his great idea and help!

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
---
 drivers/md/md-cluster.c | 28 +++++++++++++++++++++++++---
 drivers/md/md.c         |  4 ----
 drivers/md/md.h         |  3 ---
 3 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index a6cf02c5c7bc..0aad477d1b20 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -66,7 +66,7 @@ struct resync_info {
  * set up all the related infos such as bitmap and personality */
 #define		MD_CLUSTER_ALREADY_IN_CLUSTER		6
 #define		MD_CLUSTER_PENDING_RECV_EVENT		7
-
+#define 	MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD		8
 
 struct md_cluster_info {
 	struct mddev *mddev; /* the md device which md_cluster_info belongs to */
@@ -523,11 +523,17 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
 
 static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg)
 {
+	int got_lock = 0;
 	struct md_cluster_info *cinfo = mddev->cluster_info;
 	mddev->good_device_nr = le32_to_cpu(msg->raid_slot);
-	set_bit(MD_RELOAD_SB, &mddev->flags);
+
 	dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
-	md_wakeup_thread(mddev->thread);
+	wait_event(mddev->thread->wqueue,
+		   (got_lock = mddev_trylock(mddev)) ||
+		    test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state));
+	md_reload_sb(mddev, mddev->good_device_nr);
+	if (got_lock)
+		mddev_unlock(mddev);
 }
 
 static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
@@ -649,8 +655,24 @@ static void recv_daemon(struct md_thread *thread)
 static int lock_token(struct md_cluster_info *cinfo, bool mddev_locked)
 {
 	int error;
+	struct mddev *mddev = cinfo->mddev;
 
+	/*
+	 * If resync thread run after raid1d thread, then process_metadata_update
+	 * could not continue if raid1d held reconfig_mutex (and raid1d is blocked
+	 * since another node already got EX on Token and waitting the EX of Ack),
+	 * so let resync wake up thread in case flag is set.
+	 */
+	if (mddev_locked) {
+		error = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
+					      &cinfo->state);
+		WARN_ON_ONCE(error);
+		md_wakeup_thread(mddev->thread);
+	}
 	error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
+	if (mddev_locked)
+		clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
+
 	if (error)
 		pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
 				__func__, __LINE__, error);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 55e7e7a8714e..6e910d99c9c1 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8389,7 +8389,6 @@ void md_check_recovery(struct mddev *mddev)
 		(mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) ||
 		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
 		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
-		test_bit(MD_RELOAD_SB, &mddev->flags) ||
 		(mddev->external == 0 && mddev->safemode == 1) ||
 		(mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
 		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
@@ -8438,9 +8437,6 @@ void md_check_recovery(struct mddev *mddev)
 						rdev->raid_disk < 0)
 					md_kick_rdev_from_array(rdev);
 			}
-
-			if (test_and_clear_bit(MD_RELOAD_SB, &mddev->flags))
-				md_reload_sb(mddev, mddev->good_device_nr);
 		}
 
 		if (!mddev->external) {
diff --git a/drivers/md/md.h b/drivers/md/md.h
index b8859cbf84b6..ec5ffde03ccf 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -219,9 +219,6 @@ enum mddev_flags {
 				 * it then */
 	MD_JOURNAL_CLEAN,	/* A raid with journal is already clean */
 	MD_HAS_JOURNAL,		/* The raid array has journal feature set */
-	MD_RELOAD_SB,		/* Reload the superblock because another node
-				 * updated it.
-				 */
 	MD_CLUSTER_RESYNC_LOCKED, /* cluster raid only, which means node
 				   * already took resync lock, need to
 				   * release the lock */
-- 
2.6.2


^ permalink raw reply related

* Re: [PATCH 01/14] md-cluster: remove unnecessary header files
From: Guoqing Jiang @ 2017-03-01  2:47 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, shli, neilb
In-Reply-To: <20170228180337.j7qakl7kbf5lapup@kernel.org>



On 03/01/2017 02:03 AM, Shaohua Li wrote:
> On Fri, Feb 24, 2017 at 11:15:11AM +0800, Guoqing Jiang wrote:
>> md-cluster.h is already included in md.h, so remove
>> the redundant one and we don't want to cross include
>> header file too.
>>
>> Reviewed-by: NeilBrown <neilb@suse.com>
>> Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
> It would be better md.h doesn't include md-cluster.h and include md-cluster.h
> in required .c files.

Currently, there are compile dependency since md_cluster_operations and
md_cluster_info (though it need to move from md-cluster.c to md-cluster.h
in future) are used inside md.h. Anyway, I will consider about how to 
improve
this.

Thanks,
Guoqing

^ permalink raw reply

* Re: [PATCH 05/14] md-cluster: add new parameter for lock_token
From: Guoqing Jiang @ 2017-03-01  2:48 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, shli, neilb
In-Reply-To: <20170228180714.ooim4khb4zlowlof@kernel.org>



On 03/01/2017 02:07 AM, Shaohua Li wrote:
> On Fri, Feb 24, 2017 at 11:15:15AM +0800, Guoqing Jiang wrote:
>> lock_token is called from either lock_comm or
>> metadata_update_start, if we look back more for
>> the call chain, some of them are called with the
>> reconfig_mutex held while a few of them don't.
>>
>> Specifically, resync_info_update is mostly called
>> without the protection of reconfig_mutex. But
>> resync_finish calls resync_info_update within
>> the mutex.
>>
>> We make the change to lock_token since we need
>> to use synchronous way to handle METADATA_UPDATED
>> message in latter patch.
> please merge this to the patch which uses the new paramter
>

NP, though I think it is easy for review.

Thanks,
Guoqing

^ permalink raw reply

* Re: [PATCH 04/14] md-cluster: add mddev into struct md_cluster_info
From: Guoqing Jiang @ 2017-03-01  2:49 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, shli, neilb
In-Reply-To: <20170228180625.rxdbizl7thpif566@kernel.org>



On 03/01/2017 02:06 AM, Shaohua Li wrote:
> On Fri, Feb 24, 2017 at 11:15:14AM +0800, Guoqing Jiang wrote:
>> Store "struct mddev *mddev" in md_cluster_info,
>> then we can get mddev inside lock_token in next
>> patch.
> Please merge this to the patch which requires the mddev. Don't think we need a
> separate patch.
>   

I will merge it.

Thanks,
Guoqing

^ permalink raw reply

* Re: [PATCH 09/14] md-cluster: set MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD in metadata_update_start
From: Guoqing Jiang @ 2017-03-01  2:58 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, shli, neilb
In-Reply-To: <20170228182444.ggwl7dnmhw2zi3n4@kernel.org>



On 03/01/2017 02:24 AM, Shaohua Li wrote:
> On Fri, Feb 24, 2017 at 11:15:19AM +0800, Guoqing Jiang wrote:
>> Since we have switched to sync way to handle METADATA_UPDATED
>> msg, then process_metadata_update can only continue with either
>> get the reconfig_mutex or MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD is
>> set.
> same here. Either the patch should be put into the 'METADATA_UPDATED' patch or
> this patch should be before the 'METADATA_UPDATED' patch.
>
> We shouldn't introduce a broken patch then fix it in latter patch.

Yes, for both 8 and 9, it is possible to merge with 6 (I resend it though
it was not posted successfully for some reason). I seperated them since
it would be more easy for review and don't want mess up  header of the
6th patch.

Thanks,
Guoqing


^ permalink raw reply

* Re: [PATCH 11/14] md-cluster: introduce cluster_check_sync_size
From: Guoqing Jiang @ 2017-03-01  3:11 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, shli, neilb
In-Reply-To: <20170228190545.katik7exfukly6fw@kernel.org>



On 03/01/2017 03:05 AM, Shaohua Li wrote:
> On Fri, Feb 24, 2017 at 11:15:21AM +0800, Guoqing Jiang wrote:
>> Support resize is a little complex for clustered
>> raid, since we need to ensure all the nodes share
>> the same knowledge about the size of raid.
>>
>> We achieve the goal by check the sync_size which
>> is in each node's bitmap, we can only change the
>> capacity after cluster_check_sync_size returns 0.
>>
>> Also, get_bitmap_from_slot is added to get a slot's
>> bitmap. And we exported some funcs since they are
>> used in cluster_check_sync_size().
>>
>> Reviewed-by: NeilBrown <neilb@suse.com>
>> Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
>> ---
>>   drivers/md/bitmap.c     | 25 ++++++++++++++++++++-
>>   drivers/md/bitmap.h     |  2 ++
>>   drivers/md/md-cluster.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++
>>   3 files changed, 84 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
>> index 9fb2ccac958a..67a7d399f501 100644
>> --- a/drivers/md/bitmap.c
>> +++ b/drivers/md/bitmap.c
>> @@ -471,6 +471,7 @@ void bitmap_update_sb(struct bitmap *bitmap)
>>   	kunmap_atomic(sb);
>>   	write_page(bitmap, bitmap->storage.sb_page, 1);
>>   }
>> +EXPORT_SYMBOL(bitmap_update_sb);
>>   
>>   /* print out the bitmap file superblock */
>>   void bitmap_print_sb(struct bitmap *bitmap)
>> @@ -1727,7 +1728,7 @@ void bitmap_flush(struct mddev *mddev)
>>   /*
>>    * free memory that was allocated
>>    */
>> -static void bitmap_free(struct bitmap *bitmap)
>> +void bitmap_free(struct bitmap *bitmap)
>>   {
>>   	unsigned long k, pages;
>>   	struct bitmap_page *bp;
>> @@ -1761,6 +1762,7 @@ static void bitmap_free(struct bitmap *bitmap)
>>   	kfree(bp);
>>   	kfree(bitmap);
>>   }
>> +EXPORT_SYMBOL(bitmap_free);
>>   
>>   void bitmap_destroy(struct mddev *mddev)
>>   {
>> @@ -1920,6 +1922,27 @@ int bitmap_load(struct mddev *mddev)
>>   }
>>   EXPORT_SYMBOL_GPL(bitmap_load);
>>   
>> +struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot)
>> +{
>> +	int rv = 0;
>> +	struct bitmap *bitmap;
>> +
>> +	bitmap = bitmap_create(mddev, slot);
>> +	if (IS_ERR(bitmap)) {
>> +		rv = PTR_ERR(bitmap);
>> +		return ERR_PTR(rv);
>> +	}
>> +
>> +	rv = bitmap_init_from_disk(bitmap, 0);
>> +	if (rv) {
>> +		bitmap_free(bitmap);
>> +		return ERR_PTR(rv);
>> +	}
>> +
>> +	return bitmap;
>> +}
>> +EXPORT_SYMBOL(get_bitmap_from_slot);
>> +
>>   /* Loads the bitmap associated with slot and copies the resync information
>>    * to our bitmap
>>    */
>> diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
>> index 5b6dd63dda91..9f761097aab2 100644
>> --- a/drivers/md/bitmap.h
>> +++ b/drivers/md/bitmap.h
>> @@ -267,8 +267,10 @@ void bitmap_daemon_work(struct mddev *mddev);
>>   
>>   int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
>>   		  int chunksize, int init);
>> +struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot);
>>   int bitmap_copy_from_slot(struct mddev *mddev, int slot,
>>   				sector_t *lo, sector_t *hi, bool clear_bits);
>> +void bitmap_free(struct bitmap *bitmap);
>>   #endif
>>   
>>   #endif
>> diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
>> index f1b9a7a3ddd2..d3c024e6bfcf 100644
>> --- a/drivers/md/md-cluster.c
>> +++ b/drivers/md/md-cluster.c
>> @@ -1089,6 +1089,64 @@ static void metadata_update_cancel(struct mddev *mddev)
>>   	unlock_comm(cinfo);
>>   }
>>   
>> +/*
>> + * retun 0 if all the bitmaps have the same sync_size
>> + */
>> +int cluster_check_sync_size(struct mddev *mddev)
>> +{
>> +	int i, rv;
>> +	bitmap_super_t *sb;
>> +	unsigned long my_sync_size, sync_size = 0;
>> +	int node_num = mddev->bitmap_info.nodes;
>> +	int current_slot = md_cluster_ops->slot_number(mddev);
>> +	struct bitmap *bitmap = mddev->bitmap;
>> +	char str[64];
>> +	struct dlm_lock_resource *bm_lockres;
>> +
>> +	sb = kmap_atomic(bitmap->storage.sb_page);
>> +	my_sync_size = sb->sync_size;
>> +	kunmap_atomic(sb);
>> +
>> +	for (i = 0; i < node_num; i++) {
>> +		if (i == current_slot)
>> +			continue;
>> +
>> +		bitmap = get_bitmap_from_slot(mddev, i);
>> +		if (IS_ERR(bitmap)) {
>> +			pr_err("can't get bitmap from slot %d\n", i);
>> +			return -1;
>> +		}
>> +
>> +		/*
>> +		 * If we can hold the bitmap lock of one node then
>> +		 * the slot is not occupied, update the sb.
>> +		 */
>> +		snprintf(str, 64, "bitmap%04d", i);
>> +		bm_lockres = lockres_init(mddev, str, NULL, 1);
>> +		if (!bm_lockres) {
>> +			pr_err("md-cluster: Cannot initialize %s\n", str);
>> +		}
> just print error here?

It should return -ENOMEM, thanks.

>
>> +		bm_lockres->flags |= DLM_LKF_NOQUEUE;
>> +		rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
>> +		if (!rv)
>> +			bitmap_update_sb(bitmap);
> always the sb even the sync_size is the same?

I suppose you mean "always *update* the sb even the sync_size is the same".

If a node can hold PW on one bitmap lock, then we believe that the bitmap is
not used (we create 4 bitmaps by default, though it is possible that 
only one
node is on-line), so the bitmap without related on-line node just keep 
the original
data, which means sync_size can't be same as the master node (which issues
update_size cmd).

Thanks,
Guoqing

^ permalink raw reply

* Re: [PATCH 14/14] md-cluster: add the support for resize
From: Guoqing Jiang @ 2017-03-01  3:28 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, shli, neilb
In-Reply-To: <20170228192536.r74i2o3vchf74isn@kernel.org>



On 03/01/2017 03:25 AM, Shaohua Li wrote:
> On Fri, Feb 24, 2017 at 11:15:24AM +0800, Guoqing Jiang wrote:
>> To update size for cluster raid, we need to make
>> sure all nodes can perform the change successfully.
>> However, it is possible that some of them can't do
>> it due to failure (bitmap_resize could fail). So
>> we need to consider the issue before we set the
>> capacity unconditionally, and we use below steps
>> to perform sanity check.
>>
>> 1. A change the size, then broadcast METADATA_UPDATED
>>     msg.
>> 2. B and C receive METADATA_UPDATED change the size
>>     excepts call set_capacity, sync_size is not update
>>     if the change failed. Also call bitmap_update_sb
>>     to sync sb to disk.
>> 3. A checks other node's sync_size, if sync_size has
>>     been updated in all nodes, then send CHANGE_CAPACITY
>>     msg otherwise send msg to revert previous change.
>> 4. B and C call set_capacity if receive CHANGE_CAPACITY
>>     msg, otherwise pers->resize will be called to restore
>>     the old value.
>>
>> Reviewed-by: NeilBrown <neilb@suse.com>
>> Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
>> ---
>>   Documentation/md/md-cluster.txt |  2 +-
>>   drivers/md/md-cluster.c         | 75 +++++++++++++++++++++++++++++++++++++++++
>>   drivers/md/md-cluster.h         |  1 +
>>   drivers/md/md.c                 | 21 +++++++++---
>>   4 files changed, 93 insertions(+), 6 deletions(-)
>>
>> diff --git a/Documentation/md/md-cluster.txt b/Documentation/md/md-cluster.txt
>> index 38883276d31c..2663d49dd8a0 100644
>> --- a/Documentation/md/md-cluster.txt
>> +++ b/Documentation/md/md-cluster.txt
>> @@ -321,4 +321,4 @@ The algorithm is:
>>   
>>   There are somethings which are not supported by cluster MD yet.
>>   
>> -- update size and change array_sectors.
>> +- change array_sectors.
>> diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
>> index d3c024e6bfcf..75da83187c31 100644
>> --- a/drivers/md/md-cluster.c
>> +++ b/drivers/md/md-cluster.c
>> @@ -1147,6 +1147,80 @@ int cluster_check_sync_size(struct mddev *mddev)
>>   	return (my_sync_size == sync_size) ? 0 : -1;
>>   }
>>   
>> +/*
>> + * Update the size for cluster raid is a little more complex, we perform it
>> + * by the steps:
>> + * 1. hold token lock and update superblock in initiator node.
>> + * 2. send METADATA_UPDATED msg to other nodes.
>> + * 3. The initiator node continues to check each bitmap's sync_size, if all
>> + *    bitmaps have the same value of sync_size, then we can set capacity and
>> + *    let other nodes to perform it. If one node can't update sync_size
>> + *    accordingly, we need to revert to previous value.
>> + */
>> +static void update_size(struct mddev *mddev, sector_t old_dev_sectors)
>> +{
>> +	struct md_cluster_info *cinfo = mddev->cluster_info;
>> +	struct cluster_msg cmsg;
>> +	struct md_rdev *rdev;
>> +	int ret = 0;
>> +	int raid_slot = -1;
>> +
>> +	md_update_sb(mddev, 1);
>> +	lock_comm(cinfo, 1);
>> +
>> +	memset(&cmsg, 0, sizeof(cmsg));
>> +	cmsg.type = cpu_to_le32(METADATA_UPDATED);
>> +	rdev_for_each(rdev, mddev)
>> +		if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags)) {
>> +			raid_slot = rdev->desc_nr;
>> +			break;
>> +		}
>> +	if (raid_slot >= 0) {
>> +		cmsg.raid_slot = cpu_to_le32(raid_slot);
>> +		/*
>> +		 * We can only change capiticy after all the nodes can do it,
>> +		 * so need to wait after other nodes already received the msg
>> +		 * and handled the change
>> +		 */
>> +		ret = __sendmsg(cinfo, &cmsg);
>> +		if (ret) {
>> +			pr_err("%s:%d: failed to send METADATA_UPDATED msg\n",
>> +			       __func__, __LINE__);
>> +			unlock_comm(cinfo);
>> +			return;
>> +		}
>> +	} else {
>> +		pr_err("md-cluster: No good device id found to send\n");
>> +		unlock_comm(cinfo);
>> +		return;
>> +	}
>> +
>> +	/*
>> +	 * check the sync_size from other node's bitmap, if sync_size
>> +	 * have already updated in other nodes as expected, send an
>> +	 * empty metadata msg to permit the change of capacity
>> +	 */
>> +	if (cluster_check_sync_size(mddev) == 0) {
>> +		memset(&cmsg, 0, sizeof(cmsg));
>> +		cmsg.type = cpu_to_le32(CHANGE_CAPACITY);
>> +		ret = __sendmsg(cinfo, &cmsg);
>> +		if (ret)
>> +			pr_err("%s:%d: failed to send CHANGE_CAPACITY msg\n",
>> +			       __func__, __LINE__);
>> +		set_capacity(mddev->gendisk, mddev->array_sectors);
> don't call revalidate_disk here? And why don't move the gendisk related stuff to md.c.

Thanks, I will add revalidate_disk after set_capacity. But we can't move 
it to md.c
since md-cluster runs a different way for resize.

>> --- a/drivers/md/md.c
>> +++ b/drivers/md/md.c
>> @@ -6503,10 +6503,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
>>   	struct md_rdev *rdev;
>>   	int rv;
>>   	int fit = (num_sectors == 0);
>> -
>> -	/* cluster raid doesn't support update size */
>> -	if (mddev_is_clustered(mddev))
>> -		return -EINVAL;
>> +	sector_t old_dev_sectors = mddev->dev_sectors;
>>   
>>   	if (mddev->pers->resize == NULL)
>>   		return -EINVAL;
>> @@ -6535,7 +6532,9 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
>>   	}
>>   	rv = mddev->pers->resize(mddev, num_sectors);
>>   	if (!rv) {
>> -		if (mddev->queue) {
>> +		if (mddev_is_clustered(mddev))
>> +			md_cluster_ops->update_size(mddev, old_dev_sectors);
>> +		else if (mddev->queue) {
>>   			set_capacity(mddev->gendisk, mddev->array_sectors);
>>   			revalidate_disk(mddev->gendisk);
>>   		}

You can see the path is different between common md and md-cluster, 
because we have
to check if all the bitmaps have the same sync_size or not, then set 
capacity and revalidate
disk.

>> @@ -8753,6 +8752,18 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
>>   	int role, ret;
>>   	char b[BDEVNAME_SIZE];
>>   
>> +	/*
>> +	 * If size is changed in another node then we need to
>> +	 * do resize as well.
>> +	 */
>> +	if (mddev->dev_sectors != le64_to_cpu(sb->size)) {
>> +		ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
>> +		if (ret)
>> +			pr_info("md-cluster: resize failed\n");
>> +		else
>> +			bitmap_update_sb(mddev->bitmap);
>> +	}
> I'm confused, who will trigger this? The patch 10 only calls set_capacity.

process_metadata_update -> md_reload_sb -> check_sb_changes, so which
means if a node received METADATA_UPDATED msg, it will call the path.

And in this patch you may see that update_size sends the msg.

+static void update_size(struct mddev *mddev, sector_t old_dev_sectors)
+{
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+	struct cluster_msg cmsg;
+	struct md_rdev *rdev;
+	int ret = 0;
+	int raid_slot = -1;
+
+	md_update_sb(mddev, 1);
+	lock_comm(cinfo, 1);
+
+	memset(&cmsg, 0, sizeof(cmsg));
+	cmsg.type = cpu_to_le32(METADATA_UPDATED);

> Please describe the details in each node. Also please add it to md-cluster.txt
> document.

Does it help?

diff --git a/Documentation/md/md-cluster.txt 
b/Documentation/md/md-cluster.txt
index 2663d49dd8a0..709439e687c2 100644
--- a/Documentation/md/md-cluster.txt
+++ b/Documentation/md/md-cluster.txt
@@ -71,8 +71,8 @@ There are three groups of locks for managing the device:

   3.1.1 METADATA_UPDATED: informs other nodes that the metadata has
     been updated, and the node must re-read the md superblock. This is
-   performed synchronously. It is primarily used to signal device
-   failure.
+   performed synchronously. We send this message if sb is updated or
+   the size is changed.

Thanks,
Guoqing

^ permalink raw reply related

* Re: [PATCH 07/14] md: move bitmap_destroy before __md_stop
From: Guoqing Jiang @ 2017-03-01  3:46 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, shli, neilb
In-Reply-To: <20170228182009.zfsi2epb24ihjudp@kernel.org>



On 03/01/2017 02:20 AM, Shaohua Li wrote:
> On Fri, Feb 24, 2017 at 11:15:17AM +0800, Guoqing Jiang wrote:
>> Since we have switched to sync way to handle METADATA_UPDATED
>> msg for md-cluster, then process_metadata_update is depended
>> on mddev->thread->wqueue.
>>
>> With the new change, clustered raid could possible hang if
>> array received a METADATA_UPDATED msg after array unregistered
>> mddev->thread, so we need to stop clustered raid earlier
>> than before.
>>
>> And this change should be safe for non-clustered raid since
>> all writes are stopped before the destroy. Also in md_run,
>> we activate the personality (pers->run()) before activating
>> the bitmap (bitmap_create()). So it is pleasingly symmetric
>> to stop the bitmap (bitmap_destroy()) before stopping the
>> personality (__md_stop() calls pers->free()).
> There is no patch 6. So I can't judge the purpose of the patch.

Sorry, it is resent now.

> The patch changed behaviors. We only destroy bitmap with mode == 0, now we do it
> even for mode == 2. Please specify why it's safe.

Thanks for reminder, I will add the check for mode.

> The __md_stop will wait for
> behind writes for example only with bitmap set, but the patch makes us not do
> the wait.

I don't know behind write very well,  but we have call __md_stop_writes 
before
destroy bitmap, also bitmap is flushed. Shouldn't it mean the 
behind_writes is
cleared? If not, when the behind_write bit can be cleared? Thanks.

Guoqing


^ permalink raw reply

* Re: [PATCH 00/14] the latest changes for md-cluster
From: Guoqing Jiang @ 2017-03-01  3:50 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, shli, neilb, zhilong
In-Reply-To: <20170228192721.3jxfigqeqbhhmxca@kernel.org>



On 03/01/2017 03:27 AM, Shaohua Li wrote:
> On Fri, Feb 24, 2017 at 11:15:10AM +0800, Guoqing Jiang wrote:
>> Hi,
>>
>> This patchset is based on md/for-next branch, which includes below parts:
>>
>> 1. some code improvements (0001-0004).
>> 2. changes for use sync way to handle METADATA_UPDATED msg (0005-0009).
>> 3. add resize support for md-cluster (0010-0012 and 0014), and also
>>     make some related changes inside md (0013).
> Applied patch 2, 3, 13. Replied to other patches.

Thanks for review.

> BTW, is it possible you can create a test suite people can verify the md-cluster stuff?

Actually, Zhilong is currently working on this, we will push the test
suite into mdadm/test and also improve current test too (you may
see his effort in the list).

Thanks,
Guoqing

^ permalink raw reply

* [MDADM PATCH 0/2] Fix some building errors
From: Xiao Ni @ 2017-03-01  6:12 UTC (permalink / raw)
  To: linux-raid; +Cc: Jes.Sorensen, ncroxon

Hi Jes

There are some error buildings in Fedora release 26 (Rawhide)
The gcc version is gcc (GCC) 7.0.1 20170211 (Red Hat 7.0.1-0.8)

There are three types of errors:
1. Fall through 
mdadm.c:149:28: error: this statement may fall through [-Werror=implicit-fallthrough=]
    if (mode == ASSEMBLE || mode == BUILD ||
        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        mode == CREATE || mode == GROW ||
        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        mode == INCREMENTAL || mode == MANAGE)
        ~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~
mdadm.c:151:3: note: here
   case Brief:
   ^~~~
2. format-overflow
Detail.c: In function ‘Detail’:
Detail.c:584:31: error: ‘%s’ directive writing up to 255 bytes into a region of size 189 [-Werror=format-overflow=]
     sprintf(path, "/sys/block/%s/md/metadata_version",
                               ^~
Detail.c:584:5: note: ‘sprintf’ output between 32 and 287 bytes into a destination of size 200
     sprintf(path, "/sys/block/%s/md/metadata_version",
     ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      de->d_name);
      ~~~~~~~~~~~
3. format-truncation
super-intel.c: In function ‘examine_super_imsm’:
super-intel.c:1815:30: error: ‘%s’ directive output may be truncated writing up to 31 bytes into a region of size 24 [-Werror=format-truncation=]
  snprintf(str, MPB_SIG_LEN, "%s", mpb->sig);
                              ^~
super-intel.c:1815:2: note: ‘snprintf’ output between 1 and 32 bytes into a destination of size 24
  snprintf(str, MPB_SIG_LEN, "%s", mpb->sig);
  ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Xiao Ni (2):
  Add Wimplicit-fallthrough=0 in Makefile
  Specify suitable size when write to buffer

 Detail.c      | 2 +-
 Makefile      | 2 +-
 super-intel.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

-- 
2.7.4


^ permalink raw reply

* [MDADM PATCH 1/2] Add Wimplicit-fallthrough=0 in Makefile
From: Xiao Ni @ 2017-03-01  6:12 UTC (permalink / raw)
  To: linux-raid; +Cc: Jes.Sorensen, ncroxon
In-Reply-To: <1488348766-31087-1-git-send-email-xni@redhat.com>

There are many errors like 'error: this statement may fall through'. 
But the logic is right. So add the flag Wimplicit-fallthrough=0 
to disable the error messages. The method I use is from 
https://gcc.gnu.org/onlinedocs/gcc/Warning-Options.html
#index-Wimplicit-fallthrough-375
---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index a6f464c..b9f52e0 100644
--- a/Makefile
+++ b/Makefile
@@ -43,7 +43,7 @@ KLIBC_GCC = gcc -nostdinc -iwithprefix include -I$(KLIBC)/klibc/include -I$(KLIB
 
 CC ?= $(CROSS_COMPILE)gcc
 CXFLAGS ?= -ggdb
-CWFLAGS = -Wall -Werror -Wstrict-prototypes -Wextra -Wno-unused-parameter
+CWFLAGS = -Wall -Werror -Wstrict-prototypes -Wextra -Wno-unused-parameter -Wimplicit-fallthrough=0
 ifdef WARN_UNUSED
 CWFLAGS += -Wp,-D_FORTIFY_SOURCE=2 -O3
 endif
-- 
2.7.4


^ permalink raw reply related

* [MDADM PATCH 2/2] Specify suitable size when write to buffer
From: Xiao Ni @ 2017-03-01  6:12 UTC (permalink / raw)
  To: linux-raid; +Cc: Jes.Sorensen, ncroxon
In-Reply-To: <1488348766-31087-1-git-send-email-xni@redhat.com>

---
 Detail.c      | 2 +-
 super-intel.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Detail.c b/Detail.c
index 509b0d4..f4d4241 100644
--- a/Detail.c
+++ b/Detail.c
@@ -575,7 +575,7 @@ This is pretty boring
 			printf("  Member Arrays :");
 
 			while (dir && (de = readdir(dir)) != NULL) {
-				char path[200];
+				char path[320];
 				char vbuf[1024];
 				int nlen = strlen(sra->sys_name);
 				dev_t devid;
diff --git a/super-intel.c b/super-intel.c
index d5e9517..ea50265 100644
--- a/super-intel.c
+++ b/super-intel.c
@@ -1811,7 +1811,7 @@ static void examine_super_imsm(struct supertype *st, char *homehost)
 	__u32 reserved = imsm_reserved_sectors(super, super->disks);
 	struct dl *dl;
 
-	snprintf(str, MPB_SIG_LEN, "%s", mpb->sig);
+	snprintf(str, MAX_SIGNATURE_LENGTH, "%s", mpb->sig);
 	printf("          Magic : %s\n", str);
 	snprintf(str, strlen(MPB_VERSION_RAID0), "%s", get_imsm_version(mpb));
 	printf("        Version : %s\n", get_imsm_version(mpb));
-- 
2.7.4


^ permalink raw reply related

* [MDADM PATCH 0/2] Fix some building errors
From: Xiao Ni @ 2017-03-01  7:07 UTC (permalink / raw)
  To: linux-raid; +Cc: Jes.Sorensen, ncroxon

Hi Jes

There are some error buildings in Fedora release 26 (Rawhide)
The gcc version is gcc (GCC) 7.0.1 20170211 (Red Hat 7.0.1-0.8)

There are three types of errors:
1. Fall through 
mdadm.c:149:28: error: this statement may fall through [-Werror=implicit-fallthrough=]
2. format-overflow
Detail.c: In function Detail:
Detail.c:584:31: error: directive writing up to 255 bytes into 
a region of size 189 [-Werror=format-overflow=]
3. format-truncation
super-intel.c: In function examine_super_imsm:
super-intel.c:1815:30: error: directive output may be truncated 
writing up to 31 bytes into a region of size 24 [-Werror=format-truncation=]

Xiao Ni (2):
  Add Wimplicit-fallthrough=0 in Makefile
  Specify suitable size when write to buffer

 Detail.c      | 2 +-
 Makefile      | 2 +-
 super-intel.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

-- 
2.7.4

^ permalink raw reply

* [PATCH] mdadm: add checking the clustered bitmap in assemble mode
From: Zhilong Liu @ 2017-03-01  7:47 UTC (permalink / raw)
  To: Jes.Sorensen; +Cc: linux-raid, Zhilong Liu

1. both clustered and internal array don't need to specify
--bitmap when assembling array.
2. clustered array doesn't support external bitmap mode.

Signed-off-by: Zhilong Liu <zlliu@suse.com>

diff --git a/mdadm.c b/mdadm.c
index b5d89e4..b08cace 100644
--- a/mdadm.c
+++ b/mdadm.c
@@ -1095,8 +1095,14 @@ int main(int argc, char *argv[])
 				pr_err("bitmap file needed with -b in --assemble mode\n");
 				exit(2);
 			}
-			if (strcmp(optarg, "internal") == 0) {
-				pr_err("there is no need to specify --bitmap when assembling arrays with internal bitmaps\n");
+			if (strcmp(optarg, "internal") == 0 ||
+			    strcmp(optarg, "clustered") == 0) {
+				pr_err("no need to specify --bitmap when assembling arrays with internal or clustered bitmaps\n");
+				continue;
+			}
+			if (strcmp(optarg, "clustered") == 0 &&
+			    strchr(optarg, '/') != NULL) {
+				pr_err("clustered array doesn't support external bitmap\n");
 				continue;
 			}
 			bitmap_fd = open(optarg, O_RDWR);
-- 
2.6.6


^ permalink raw reply related

* Re: [RFC PATCH v4] IV Generation algorithms for dm-crypt
From: Gilad Ben-Yossef @ 2017-03-01  8:30 UTC (permalink / raw)
  To: Milan Broz
  Cc: Binoy Jayan, Rajendra, Herbert Xu, Oded, Mike Snitzer,
	Linux kernel mailing list, Ondrej Mosnacek, linux-raid, dm-devel,
	Mark Brown, Arnd Bergmann, linux-crypto, Shaohua Li,
	David S. Miller, Alasdair Kergon, Ofir
In-Reply-To: <68f70534-a309-46ba-a84d-8acc1e6620e5@gmail.com>

On Tue, Feb 28, 2017 at 11:05 PM, Milan Broz <gmazyland@gmail.com> wrote:
>
> On 02/22/2017 07:12 AM, Binoy Jayan wrote:
> >
> > I was wondering if this is near to be ready for submission (apart from
> > the testmgr.c
> > changes) or I need to make some changes to make it similar to the IPSec offload?
>
> I just tried this and except it registers the IV for every new device again, it works...
> (After a while you have many duplicate entries in /proc/crypto.)
>
> But I would like to see some summary why such a big patch is needed in the first place.
> (During an internal discussions seems that people are already lost in mails and
> patches here, so Ondra promised me to send some summary mail soon here.)
>
> IIRC the first initial problem was dmcrypt performance on some embedded
> crypto processors that are not able to cope with small crypto requests effectively.
>
>
> Do you have some real performance numbers that proves that such a patch is adequate?
>
> I would really like to see the performance issue fixed but I am really not sure
> this approach works for everyone. It would be better to avoid repeating this exercise later.
> IIRC Ondra's "bulk" mode, despite rejected, shows that there is a potential
> to speedup things even for crypt drivers that do not support own IV generators.
>

AFAIK the problem that we are trying to solve is that if the IV is
generated outside the crypto API
domain than you are forced to have an invocation of the crypto API per
each block because you
need to provide the IV for each block.

By putting the IV generation responsibility in the Crypto API we open
the way to do a single invocation
of the crypto API for a whole sequence of blocks.

For software implementation of XTS this doesn't matter much - but for
hardware based XTS providers
that can do the IV generation themselves it's the difference between a
sequence of small invocation,
with all the overhead that that entails  and a single big invocatio -
and this can be big.

This lead some vendors to ship hacked up versions of dm-crypt to match
the specific crypto hardware
they were using, or so I've heard at least - didn't see the code myself.

I believe Binoy is trying to address this in a generic upstream worthy
way instead.

Anyway, you are only supposed to see s difference when using a
hardware based XTS provider algo
that supports IV generation.

> I like the patch is now contained inside dmcrypt, but it still exposes IVs that
> are designed just for old, insecure, compatibility-only containers.
>
> I really do not think every compatible crap must be accessible through crypto API.
> (I wrote the dmcrypt lrw and tcw compatibility IVs and I would never do that this way
> if I know it is accessible outside of dmcrypt internals...)
> Even the ESSIV is something that was born to fix predictive IVs (CBC watermarking
> attacks) for disk encryption only, no reason to expose it outside of disk encryption.
>

The point is that you have more than one implementation of these
"compatible crap" - the
software implementation that you wrote and potentially multiple
hardware implementations
and putting this in the crypto API domain is the way to abstract this
so you use the one
that works best of your platform.

Thanks,
Gilad

-- 
Gilad Ben-Yossef
Chief Coffee Drinker

"If you take a class in large-scale robotics, can you end up in a
situation where the homework eats your dog?"
 -- Jean-Baptiste Queru

^ permalink raw reply

* [PATCH V2 0/5] the latest changes for md-cluster
From: Guoqing Jiang @ 2017-03-01  8:42 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, neilb, Guoqing Jiang

Since 2, 3 and 13 are merged, so the relationship between V2
and previous patchset are:
	 V2	 	     old
	0001 -> 0004, 0005, 0006, 0008 and 0009
	0002 -> 	    0007
	0003 -> 	    0010
	0004 -> 	0011 and 0012
	0005 -> 	    0014

And based on Shaohua's comments, this version also changes:

1. discard the first patch for now.
2. add mode check before destroy bitmap.
3. for patch 11, add more codes to handle lockres_init returns err.
4. call revalidate_disk after set_capacity in both patch 3 and 5.

Thanks,
Guoqing

^ permalink raw reply

* [PATCH V2 2/5] md: move bitmap_destroy before __md_stop
From: Guoqing Jiang @ 2017-03-01  8:42 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, neilb, Guoqing Jiang
In-Reply-To: <1488357760-7893-1-git-send-email-gqjiang@suse.com>

Since we have switched to sync way to handle METADATA_UPDATED
msg for md-cluster, then process_metadata_update is depended
on mddev->thread->wqueue.

With the new change, clustered raid could possible hang if
array received a METADATA_UPDATED msg after array unregistered
mddev->thread, so we need to stop clustered raid earlier
than before.

And this change should be safe for non-clustered raid since
all writes are stopped before the destroy. Also in md_run,
we activate the personality (pers->run()) before activating
the bitmap (bitmap_create()). So it is pleasingly symmetric
to stop the bitmap (bitmap_destroy()) before stopping the
personality (__md_stop() calls pers->free()).

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
---
 drivers/md/md.c | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 44206bc6e3aa..e1d9116044ae 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5574,8 +5574,8 @@ void md_stop(struct mddev *mddev)
 	/* stop the array and free an attached data structures.
 	 * This is called from dm-raid
 	 */
-	__md_stop(mddev);
 	bitmap_destroy(mddev);
+	__md_stop(mddev);
 	if (mddev->bio_set)
 		bioset_free(mddev->bio_set);
 }
@@ -5688,6 +5688,22 @@ static int do_md_stop(struct mddev *mddev, int mode,
 			set_disk_ro(disk, 0);
 
 		__md_stop_writes(mddev);
+
+		/*
+		 * Destroy bitmap after all writes are stopped
+		 */
+		if (mode == 0) {
+			bitmap_destroy(mddev);
+			if (mddev->bitmap_info.file) {
+				struct file *f = mddev->bitmap_info.file;
+				spin_lock(&mddev->lock);
+				mddev->bitmap_info.file = NULL;
+				spin_unlock(&mddev->lock);
+				fput(f);
+			}
+			mddev->bitmap_info.offset = 0;
+		}
+
 		__md_stop(mddev);
 		mddev->queue->backing_dev_info->congested_fn = NULL;
 
@@ -5712,19 +5728,7 @@ static int do_md_stop(struct mddev *mddev, int mode,
 	 */
 	if (mode == 0) {
 		pr_info("md: %s stopped.\n", mdname(mddev));
-
-		bitmap_destroy(mddev);
-		if (mddev->bitmap_info.file) {
-			struct file *f = mddev->bitmap_info.file;
-			spin_lock(&mddev->lock);
-			mddev->bitmap_info.file = NULL;
-			spin_unlock(&mddev->lock);
-			fput(f);
-		}
-		mddev->bitmap_info.offset = 0;
-
 		export_array(mddev);
-
 		md_clean(mddev);
 		if (mddev->hold_active == UNTIL_STOP)
 			mddev->hold_active = 0;
-- 
2.6.2


^ permalink raw reply related

* [PATCH V2 3/5] md-cluster: add CHANGE_CAPACITY message type
From: Guoqing Jiang @ 2017-03-01  8:42 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, neilb, Guoqing Jiang
In-Reply-To: <1488357760-7893-1-git-send-email-gqjiang@suse.com>

The msg type CHANGE_CAPACITY is introduced to support
resize clustered raid in later patch, and it is sent
after all the nodes have the same sync_size, receiver
node just need to set new capacity once received this
msg.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
---
 drivers/md/md-cluster.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 5cf0a9d29bf0..8b7d55bf5aa2 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -104,6 +104,7 @@ enum msg_type {
 	REMOVE,
 	RE_ADD,
 	BITMAP_NEEDS_SYNC,
+	CHANGE_CAPACITY,
 };
 
 struct cluster_msg {
@@ -579,6 +580,10 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
 	case METADATA_UPDATED:
 		process_metadata_update(mddev, msg);
 		break;
+	case CHANGE_CAPACITY:
+		set_capacity(mddev->gendisk, mddev->array_sectors);
+		revalidate_disk(mddev->gendisk);
+		break;
 	case RESYNCING:
 		process_suspend_info(mddev, le32_to_cpu(msg->slot),
 				     le64_to_cpu(msg->low),
-- 
2.6.2


^ permalink raw reply related

* [PATCH V2 4/5] md-cluster: introduce cluster_check_sync_size
From: Guoqing Jiang @ 2017-03-01  8:42 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, neilb, Guoqing Jiang
In-Reply-To: <1488357760-7893-1-git-send-email-gqjiang@suse.com>

Support resize is a little complex for clustered
raid, since we need to ensure all the nodes share
the same knowledge about the size of raid.

We achieve the goal by check the sync_size which
is in each node's bitmap, we can only change the
capacity after cluster_check_sync_size returns 0.

Also, get_bitmap_from_slot is added to get a slot's
bitmap. And we exported some funcs since they are
used in cluster_check_sync_size().

We can also reuse get_bitmap_from_slot to remove
redundant code existed in bitmap_copy_from_slot.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
---
 drivers/md/bitmap.c     | 41 ++++++++++++++++++++++++---------
 drivers/md/bitmap.h     |  2 ++
 drivers/md/md-cluster.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 93 insertions(+), 10 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 9fb2ccac958a..b6fa55a3cff8 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -471,6 +471,7 @@ void bitmap_update_sb(struct bitmap *bitmap)
 	kunmap_atomic(sb);
 	write_page(bitmap, bitmap->storage.sb_page, 1);
 }
+EXPORT_SYMBOL(bitmap_update_sb);
 
 /* print out the bitmap file superblock */
 void bitmap_print_sb(struct bitmap *bitmap)
@@ -1727,7 +1728,7 @@ void bitmap_flush(struct mddev *mddev)
 /*
  * free memory that was allocated
  */
-static void bitmap_free(struct bitmap *bitmap)
+void bitmap_free(struct bitmap *bitmap)
 {
 	unsigned long k, pages;
 	struct bitmap_page *bp;
@@ -1761,6 +1762,7 @@ static void bitmap_free(struct bitmap *bitmap)
 	kfree(bp);
 	kfree(bitmap);
 }
+EXPORT_SYMBOL(bitmap_free);
 
 void bitmap_destroy(struct mddev *mddev)
 {
@@ -1920,6 +1922,27 @@ int bitmap_load(struct mddev *mddev)
 }
 EXPORT_SYMBOL_GPL(bitmap_load);
 
+struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot)
+{
+	int rv = 0;
+	struct bitmap *bitmap;
+
+	bitmap = bitmap_create(mddev, slot);
+	if (IS_ERR(bitmap)) {
+		rv = PTR_ERR(bitmap);
+		return ERR_PTR(rv);
+	}
+
+	rv = bitmap_init_from_disk(bitmap, 0);
+	if (rv) {
+		bitmap_free(bitmap);
+		return ERR_PTR(rv);
+	}
+
+	return bitmap;
+}
+EXPORT_SYMBOL(get_bitmap_from_slot);
+
 /* Loads the bitmap associated with slot and copies the resync information
  * to our bitmap
  */
@@ -1929,14 +1952,13 @@ int bitmap_copy_from_slot(struct mddev *mddev, int slot,
 	int rv = 0, i, j;
 	sector_t block, lo = 0, hi = 0;
 	struct bitmap_counts *counts;
-	struct bitmap *bitmap = bitmap_create(mddev, slot);
-
-	if (IS_ERR(bitmap))
-		return PTR_ERR(bitmap);
+	struct bitmap *bitmap;
 
-	rv = bitmap_init_from_disk(bitmap, 0);
-	if (rv)
-		goto err;
+	bitmap = get_bitmap_from_slot(mddev, slot);
+	if (IS_ERR(bitmap)) {
+		pr_err("%s can't get bitmap from slot %d\n", __func__, slot);
+		return -1;
+	}
 
 	counts = &bitmap->counts;
 	for (j = 0; j < counts->chunks; j++) {
@@ -1963,8 +1985,7 @@ int bitmap_copy_from_slot(struct mddev *mddev, int slot,
 	bitmap_unplug(mddev->bitmap);
 	*low = lo;
 	*high = hi;
-err:
-	bitmap_free(bitmap);
+
 	return rv;
 }
 EXPORT_SYMBOL_GPL(bitmap_copy_from_slot);
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index 5b6dd63dda91..9f761097aab2 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -267,8 +267,10 @@ void bitmap_daemon_work(struct mddev *mddev);
 
 int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
 		  int chunksize, int init);
+struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot);
 int bitmap_copy_from_slot(struct mddev *mddev, int slot,
 				sector_t *lo, sector_t *hi, bool clear_bits);
+void bitmap_free(struct bitmap *bitmap);
 #endif
 
 #endif
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 8b7d55bf5aa2..5a42fc5202ff 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -1091,6 +1091,66 @@ static void metadata_update_cancel(struct mddev *mddev)
 	unlock_comm(cinfo);
 }
 
+/*
+ * retun 0 if all the bitmaps have the same sync_size
+ */
+int cluster_check_sync_size(struct mddev *mddev)
+{
+	int i, rv;
+	bitmap_super_t *sb;
+	unsigned long my_sync_size, sync_size = 0;
+	int node_num = mddev->bitmap_info.nodes;
+	int current_slot = md_cluster_ops->slot_number(mddev);
+	struct bitmap *bitmap = mddev->bitmap;
+	char str[64];
+	struct dlm_lock_resource *bm_lockres;
+
+	sb = kmap_atomic(bitmap->storage.sb_page);
+	my_sync_size = sb->sync_size;
+	kunmap_atomic(sb);
+
+	for (i = 0; i < node_num; i++) {
+		if (i == current_slot)
+			continue;
+
+		bitmap = get_bitmap_from_slot(mddev, i);
+		if (IS_ERR(bitmap)) {
+			pr_err("can't get bitmap from slot %d\n", i);
+			return -1;
+		}
+
+		/*
+		 * If we can hold the bitmap lock of one node then
+		 * the slot is not occupied, update the sb.
+		 */
+		snprintf(str, 64, "bitmap%04d", i);
+		bm_lockres = lockres_init(mddev, str, NULL, 1);
+		if (!bm_lockres) {
+			pr_err("md-cluster: Cannot initialize %s\n", str);
+			lockres_free(bm_lockres);
+			return -ENOMEM;
+		}
+		bm_lockres->flags |= DLM_LKF_NOQUEUE;
+		rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
+		if (!rv)
+			bitmap_update_sb(bitmap);
+		lockres_free(bm_lockres);
+
+		sb = kmap_atomic(bitmap->storage.sb_page);
+		if (sync_size == 0)
+			sync_size = sb->sync_size;
+		else if (sync_size != sb->sync_size) {
+			kunmap_atomic(sb);
+			bitmap_free(bitmap);
+			return -1;
+		}
+		kunmap_atomic(sb);
+		bitmap_free(bitmap);
+	}
+
+	return (my_sync_size == sync_size) ? 0 : -1;
+}
+
 static int resync_start(struct mddev *mddev)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
-- 
2.6.2


^ permalink raw reply related

* [PATCH V2 5/5] md-cluster: add the support for resize
From: Guoqing Jiang @ 2017-03-01  8:42 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, neilb, Guoqing Jiang
In-Reply-To: <1488357760-7893-1-git-send-email-gqjiang@suse.com>

To update size for cluster raid, we need to make
sure all nodes can perform the change successfully.
However, it is possible that some of them can't do
it due to failure (bitmap_resize could fail). So
we need to consider the issue before we set the
capacity unconditionally, and we use below steps
to perform sanity check.

1. A change the size, then broadcast METADATA_UPDATED
   msg.
2. B and C receive METADATA_UPDATED change the size
   excepts call set_capacity, sync_size is not update
   if the change failed. Also call bitmap_update_sb
   to sync sb to disk.
3. A checks other node's sync_size, if sync_size has
   been updated in all nodes, then send CHANGE_CAPACITY
   msg otherwise send msg to revert previous change.
4. B and C call set_capacity if receive CHANGE_CAPACITY
   msg, otherwise pers->resize will be called to restore
   the old value.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
---
 Documentation/md/md-cluster.txt |  2 +-
 drivers/md/md-cluster.c         | 76 +++++++++++++++++++++++++++++++++++++++++
 drivers/md/md-cluster.h         |  1 +
 drivers/md/md.c                 | 21 +++++++++---
 4 files changed, 94 insertions(+), 6 deletions(-)

diff --git a/Documentation/md/md-cluster.txt b/Documentation/md/md-cluster.txt
index 38883276d31c..2663d49dd8a0 100644
--- a/Documentation/md/md-cluster.txt
+++ b/Documentation/md/md-cluster.txt
@@ -321,4 +321,4 @@ The algorithm is:
 
 There are somethings which are not supported by cluster MD yet.
 
-- update size and change array_sectors.
+- change array_sectors.
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 5a42fc5202ff..88ce15843a42 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -1151,6 +1151,81 @@ int cluster_check_sync_size(struct mddev *mddev)
 	return (my_sync_size == sync_size) ? 0 : -1;
 }
 
+/*
+ * Update the size for cluster raid is a little more complex, we perform it
+ * by the steps:
+ * 1. hold token lock and update superblock in initiator node.
+ * 2. send METADATA_UPDATED msg to other nodes.
+ * 3. The initiator node continues to check each bitmap's sync_size, if all
+ *    bitmaps have the same value of sync_size, then we can set capacity and
+ *    let other nodes to perform it. If one node can't update sync_size
+ *    accordingly, we need to revert to previous value.
+ */
+static void update_size(struct mddev *mddev, sector_t old_dev_sectors)
+{
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+	struct cluster_msg cmsg;
+	struct md_rdev *rdev;
+	int ret = 0;
+	int raid_slot = -1;
+
+	md_update_sb(mddev, 1);
+	lock_comm(cinfo, 1);
+
+	memset(&cmsg, 0, sizeof(cmsg));
+	cmsg.type = cpu_to_le32(METADATA_UPDATED);
+	rdev_for_each(rdev, mddev)
+		if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags)) {
+			raid_slot = rdev->desc_nr;
+			break;
+		}
+	if (raid_slot >= 0) {
+		cmsg.raid_slot = cpu_to_le32(raid_slot);
+		/*
+		 * We can only change capiticy after all the nodes can do it,
+		 * so need to wait after other nodes already received the msg
+		 * and handled the change
+		 */
+		ret = __sendmsg(cinfo, &cmsg);
+		if (ret) {
+			pr_err("%s:%d: failed to send METADATA_UPDATED msg\n",
+			       __func__, __LINE__);
+			unlock_comm(cinfo);
+			return;
+		}
+	} else {
+		pr_err("md-cluster: No good device id found to send\n");
+		unlock_comm(cinfo);
+		return;
+	}
+
+	/*
+	 * check the sync_size from other node's bitmap, if sync_size
+	 * have already updated in other nodes as expected, send an
+	 * empty metadata msg to permit the change of capacity
+	 */
+	if (cluster_check_sync_size(mddev) == 0) {
+		memset(&cmsg, 0, sizeof(cmsg));
+		cmsg.type = cpu_to_le32(CHANGE_CAPACITY);
+		ret = __sendmsg(cinfo, &cmsg);
+		if (ret)
+			pr_err("%s:%d: failed to send CHANGE_CAPACITY msg\n",
+			       __func__, __LINE__);
+		set_capacity(mddev->gendisk, mddev->array_sectors);
+		revalidate_disk(mddev->gendisk);
+	} else {
+		/* revert to previous sectors */
+		ret = mddev->pers->resize(mddev, old_dev_sectors);
+		if (!ret)
+			revalidate_disk(mddev->gendisk);
+		ret = __sendmsg(cinfo, &cmsg);
+		if (ret)
+			pr_err("%s:%d: failed to send METADATA_UPDATED msg\n",
+			       __func__, __LINE__);
+	}
+	unlock_comm(cinfo);
+}
+
 static int resync_start(struct mddev *mddev)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
@@ -1396,6 +1471,7 @@ static struct md_cluster_operations cluster_ops = {
 	.gather_bitmaps = gather_bitmaps,
 	.lock_all_bitmaps = lock_all_bitmaps,
 	.unlock_all_bitmaps = unlock_all_bitmaps,
+	.update_size = update_size,
 };
 
 static int __init cluster_init(void)
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
index e765499ba591..274016177983 100644
--- a/drivers/md/md-cluster.h
+++ b/drivers/md/md-cluster.h
@@ -27,6 +27,7 @@ struct md_cluster_operations {
 	int (*gather_bitmaps)(struct md_rdev *rdev);
 	int (*lock_all_bitmaps)(struct mddev *mddev);
 	void (*unlock_all_bitmaps)(struct mddev *mddev);
+	void (*update_size)(struct mddev *mddev, sector_t old_dev_sectors);
 };
 
 #endif /* _MD_CLUSTER_H */
diff --git a/drivers/md/md.c b/drivers/md/md.c
index e1d9116044ae..46bae2fb4f9a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6496,10 +6496,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
 	struct md_rdev *rdev;
 	int rv;
 	int fit = (num_sectors == 0);
-
-	/* cluster raid doesn't support update size */
-	if (mddev_is_clustered(mddev))
-		return -EINVAL;
+	sector_t old_dev_sectors = mddev->dev_sectors;
 
 	if (mddev->pers->resize == NULL)
 		return -EINVAL;
@@ -6528,7 +6525,9 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
 	}
 	rv = mddev->pers->resize(mddev, num_sectors);
 	if (!rv) {
-		if (mddev->queue) {
+		if (mddev_is_clustered(mddev))
+			md_cluster_ops->update_size(mddev, old_dev_sectors);
+		else if (mddev->queue) {
 			set_capacity(mddev->gendisk, mddev->array_sectors);
 			revalidate_disk(mddev->gendisk);
 		}
@@ -8746,6 +8745,18 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
 	int role, ret;
 	char b[BDEVNAME_SIZE];
 
+	/*
+	 * If size is changed in another node then we need to
+	 * do resize as well.
+	 */
+	if (mddev->dev_sectors != le64_to_cpu(sb->size)) {
+		ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
+		if (ret)
+			pr_info("md-cluster: resize failed\n");
+		else
+			bitmap_update_sb(mddev->bitmap);
+	}
+
 	/* Check for change of roles in the active devices */
 	rdev_for_each(rdev2, mddev) {
 		if (test_bit(Faulty, &rdev2->flags))
-- 
2.6.2


^ permalink raw reply related

* [PATCH] mdadm:fix typo in comment
From: Zhilong Liu @ 2017-03-01  8:44 UTC (permalink / raw)
  To: Jes.Sorensen; +Cc: linux-raid, Zhilong Liu

Signed-off-by: Zhilong Liu <zlliu@suse.com>

diff --git a/mdadm.c b/mdadm.c
index b5d89e4..16fd49a 100644
--- a/mdadm.c
+++ b/mdadm.c
@@ -1268,7 +1268,7 @@ int main(int argc, char *argv[])
 	 * hopefully it's mostly right but there might be some stuff
 	 * missing
 	 *
-	 * That is mosty checked in the per-mode stuff but...
+	 * That is mostly checked in the per-mode stuff but...
 	 *
 	 * For @,B,C and A without -s, the first device listed must be
 	 * an md device.  We check that here and open it.
-- 
2.6.6


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox