Linux RAID subsystem development
 help / color / mirror / Atom feed
* Re: [PATCH 01/14] md-cluster: remove unnecessary header files
From: Shaohua Li @ 2017-02-28 18:03 UTC (permalink / raw)
  To: Guoqing Jiang; +Cc: linux-raid, shli, neilb
In-Reply-To: <1487906124-20107-2-git-send-email-gqjiang@suse.com>

On Fri, Feb 24, 2017 at 11:15:11AM +0800, Guoqing Jiang wrote:
> md-cluster.h is already included in md.h, so remove
> the redundant one and we don't want to cross include
> header file too.
> 
> Reviewed-by: NeilBrown <neilb@suse.com>
> Signed-off-by: Guoqing Jiang <gqjiang@suse.com>

It would be better md.h doesn't include md-cluster.h and include md-cluster.h
in required .c files.

> ---
>  drivers/md/md-cluster.c | 1 -
>  drivers/md/md-cluster.h | 2 --
>  2 files changed, 3 deletions(-)
> 
> diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
> index 2b13117fb918..03c38ed222ce 100644
> --- a/drivers/md/md-cluster.c
> +++ b/drivers/md/md-cluster.c
> @@ -16,7 +16,6 @@
>  #include <linux/raid/md_p.h>
>  #include "md.h"
>  #include "bitmap.h"
> -#include "md-cluster.h"
>  
>  #define LVB_SIZE	64
>  #define NEW_DEV_TIMEOUT 5000
> diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
> index e765499ba591..8f26a5e80810 100644
> --- a/drivers/md/md-cluster.h
> +++ b/drivers/md/md-cluster.h
> @@ -3,8 +3,6 @@
>  #ifndef _MD_CLUSTER_H
>  #define _MD_CLUSTER_H
>  
> -#include "md.h"
> -
>  struct mddev;
>  struct md_rdev;
>  
> -- 
> 2.6.2
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 04/14] md-cluster: add mddev into struct md_cluster_info
From: Shaohua Li @ 2017-02-28 18:06 UTC (permalink / raw)
  To: Guoqing Jiang; +Cc: linux-raid, shli, neilb
In-Reply-To: <1487906124-20107-5-git-send-email-gqjiang@suse.com>

On Fri, Feb 24, 2017 at 11:15:14AM +0800, Guoqing Jiang wrote:
> Store "struct mddev *mddev" in md_cluster_info,
> then we can get mddev inside lock_token in next
> patch.

Please merge this to the patch which requires the mddev. Don't think we need a
separate patch.
 
> Reviewed-by: NeilBrown <neilb@suse.com>
> Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
> ---
>  drivers/md/md-cluster.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
> index 620828e56dc8..1ec89273ff99 100644
> --- a/drivers/md/md-cluster.c
> +++ b/drivers/md/md-cluster.c
> @@ -69,6 +69,7 @@ struct resync_info {
>  
>  
>  struct md_cluster_info {
> +	struct mddev *mddev; /* the md device which md_cluster_info belongs to */
>  	/* dlm lock space and resources for clustered raid. */
>  	dlm_lockspace_t *lockspace;
>  	int slot_number;
> @@ -833,6 +834,7 @@ static int join(struct mddev *mddev, int nodes)
>  	mutex_init(&cinfo->recv_mutex);
>  
>  	mddev->cluster_info = cinfo;
> +	cinfo->mddev = mddev;
>  
>  	memset(str, 0, 64);
>  	sprintf(str, "%pU", mddev->uuid);
> -- 
> 2.6.2
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 05/14] md-cluster: add new parameter for lock_token
From: Shaohua Li @ 2017-02-28 18:07 UTC (permalink / raw)
  To: Guoqing Jiang; +Cc: linux-raid, shli, neilb
In-Reply-To: <1487906124-20107-6-git-send-email-gqjiang@suse.com>

On Fri, Feb 24, 2017 at 11:15:15AM +0800, Guoqing Jiang wrote:
> lock_token is called from either lock_comm or
> metadata_update_start, if we look back more for
> the call chain, some of them are called with the
> reconfig_mutex held while a few of them don't.
> 
> Specifically, resync_info_update is mostly called
> without the protection of reconfig_mutex. But
> resync_finish calls resync_info_update within
> the mutex.
> 
> We make the change to lock_token since we need
> to use synchronous way to handle METADATA_UPDATED
> message in latter patch.

please merge this to the patch which uses the new paramter

 
> Reviewed-by: NeilBrown <neilb@suse.com>
> Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
> ---
>  drivers/md/md-cluster.c | 30 +++++++++++++++++++-----------
>  1 file changed, 19 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
> index 1ec89273ff99..a6cf02c5c7bc 100644
> --- a/drivers/md/md-cluster.c
> +++ b/drivers/md/md-cluster.c
> @@ -646,7 +646,7 @@ static void recv_daemon(struct md_thread *thread)
>   * Takes the lock on the TOKEN lock resource so no other
>   * node can communicate while the operation is underway.
>   */
> -static int lock_token(struct md_cluster_info *cinfo)
> +static int lock_token(struct md_cluster_info *cinfo, bool mddev_locked)
>  {
>  	int error;
>  
> @@ -663,12 +663,12 @@ static int lock_token(struct md_cluster_info *cinfo)
>  /* lock_comm()
>   * Sets the MD_CLUSTER_SEND_LOCK bit to lock the send channel.
>   */
> -static int lock_comm(struct md_cluster_info *cinfo)
> +static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked)
>  {
>  	wait_event(cinfo->wait,
>  		   !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state));
>  
> -	return lock_token(cinfo);
> +	return lock_token(cinfo, mddev_locked);
>  }
>  
>  static void unlock_comm(struct md_cluster_info *cinfo)
> @@ -743,11 +743,12 @@ static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
>  	return error;
>  }
>  
> -static int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
> +static int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg,
> +		   bool mddev_locked)
>  {
>  	int ret;
>  
> -	lock_comm(cinfo);
> +	lock_comm(cinfo, mddev_locked);
>  	ret = __sendmsg(cinfo, cmsg);
>  	unlock_comm(cinfo);
>  	return ret;
> @@ -944,7 +945,7 @@ static void resync_bitmap(struct mddev *mddev)
>  	int err;
>  
>  	cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC);
> -	err = sendmsg(cinfo, &cmsg);
> +	err = sendmsg(cinfo, &cmsg, 1);
>  	if (err)
>  		pr_err("%s:%d: failed to send BITMAP_NEEDS_SYNC message (%d)\n",
>  			__func__, __LINE__, err);
> @@ -1007,7 +1008,7 @@ static int metadata_update_start(struct mddev *mddev)
>  	if (cinfo->token_lockres->mode == DLM_LOCK_EX)
>  		return 0;
>  
> -	return lock_token(cinfo);
> +	return lock_token(cinfo, 1);
>  }
>  
>  static int metadata_update_finish(struct mddev *mddev)
> @@ -1070,7 +1071,14 @@ static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
>  	cmsg.low = cpu_to_le64(lo);
>  	cmsg.high = cpu_to_le64(hi);
>  
> -	return sendmsg(cinfo, &cmsg);
> +	/*
> +	 * mddev_lock is held if resync_info_update is called from
> +	 * resync_finish (md_reap_sync_thread -> resync_finish)
> +	 */
> +	if (lo == 0 && hi == 0)
> +		return sendmsg(cinfo, &cmsg, 1);
> +	else
> +		return sendmsg(cinfo, &cmsg, 0);
>  }
>  
>  static int resync_finish(struct mddev *mddev)
> @@ -1120,7 +1128,7 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev)
>  	cmsg.type = cpu_to_le32(NEWDISK);
>  	memcpy(cmsg.uuid, uuid, 16);
>  	cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
> -	lock_comm(cinfo);
> +	lock_comm(cinfo, 1);
>  	ret = __sendmsg(cinfo, &cmsg);
>  	if (ret)
>  		return ret;
> @@ -1180,7 +1188,7 @@ static int remove_disk(struct mddev *mddev, struct md_rdev *rdev)
>  	struct md_cluster_info *cinfo = mddev->cluster_info;
>  	cmsg.type = cpu_to_le32(REMOVE);
>  	cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
> -	return sendmsg(cinfo, &cmsg);
> +	return sendmsg(cinfo, &cmsg, 1);
>  }
>  
>  static int lock_all_bitmaps(struct mddev *mddev)
> @@ -1244,7 +1252,7 @@ static int gather_bitmaps(struct md_rdev *rdev)
>  
>  	cmsg.type = cpu_to_le32(RE_ADD);
>  	cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
> -	err = sendmsg(cinfo, &cmsg);
> +	err = sendmsg(cinfo, &cmsg, 1);
>  	if (err)
>  		goto out;
>  
> -- 
> 2.6.2
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 07/14] md: move bitmap_destroy before __md_stop
From: Shaohua Li @ 2017-02-28 18:20 UTC (permalink / raw)
  To: Guoqing Jiang; +Cc: linux-raid, shli, neilb
In-Reply-To: <1487906124-20107-8-git-send-email-gqjiang@suse.com>

On Fri, Feb 24, 2017 at 11:15:17AM +0800, Guoqing Jiang wrote:
> Since we have switched to sync way to handle METADATA_UPDATED
> msg for md-cluster, then process_metadata_update is depended
> on mddev->thread->wqueue.
> 
> With the new change, clustered raid could possible hang if
> array received a METADATA_UPDATED msg after array unregistered
> mddev->thread, so we need to stop clustered raid earlier
> than before.
> 
> And this change should be safe for non-clustered raid since
> all writes are stopped before the destroy. Also in md_run,
> we activate the personality (pers->run()) before activating
> the bitmap (bitmap_create()). So it is pleasingly symmetric
> to stop the bitmap (bitmap_destroy()) before stopping the
> personality (__md_stop() calls pers->free()).

There is no patch 6. So I can't judge the purpose of the patch.

The patch changed behaviors. We only destroy bitmap with mode == 0, now we do it
even for mode == 2. Please specify why it's safe. The __md_stop will wait for
behind writes for example only with bitmap set, but the patch makes us not do
the wait.

> Reviewed-by: NeilBrown <neilb@suse.com>
> Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
> ---
>  drivers/md/md.c | 28 +++++++++++++++-------------
>  1 file changed, 15 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 6e910d99c9c1..7bcc757386e1 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -5582,8 +5582,8 @@ void md_stop(struct mddev *mddev)
>  	/* stop the array and free an attached data structures.
>  	 * This is called from dm-raid
>  	 */
> -	__md_stop(mddev);
>  	bitmap_destroy(mddev);
> +	__md_stop(mddev);
>  	if (mddev->bio_set)
>  		bioset_free(mddev->bio_set);
>  }
> @@ -5696,6 +5696,20 @@ static int do_md_stop(struct mddev *mddev, int mode,
>  			set_disk_ro(disk, 0);
>  
>  		__md_stop_writes(mddev);
> +
> +		/*
> +		 * Destroy bitmap after all writes are stopped
> +		 */
> +		bitmap_destroy(mddev);
> +		if (mddev->bitmap_info.file) {
> +			struct file *f = mddev->bitmap_info.file;
> +			spin_lock(&mddev->lock);
> +			mddev->bitmap_info.file = NULL;
> +			spin_unlock(&mddev->lock);
> +			fput(f);
> +		}
> +		mddev->bitmap_info.offset = 0;
> +
>  		__md_stop(mddev);
>  		mddev->queue->backing_dev_info.congested_fn = NULL;
>  
> @@ -5720,19 +5734,7 @@ static int do_md_stop(struct mddev *mddev, int mode,
>  	 */
>  	if (mode == 0) {
>  		pr_info("md: %s stopped.\n", mdname(mddev));
> -
> -		bitmap_destroy(mddev);
> -		if (mddev->bitmap_info.file) {
> -			struct file *f = mddev->bitmap_info.file;
> -			spin_lock(&mddev->lock);
> -			mddev->bitmap_info.file = NULL;
> -			spin_unlock(&mddev->lock);
> -			fput(f);
> -		}
> -		mddev->bitmap_info.offset = 0;
> -
>  		export_array(mddev);
> -
>  		md_clean(mddev);
>  		if (mddev->hold_active == UNTIL_STOP)
>  			mddev->hold_active = 0;
> -- 
> 2.6.2
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 08/14] md-cluster: set MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD before unregister thread
From: Shaohua Li @ 2017-02-28 18:22 UTC (permalink / raw)
  To: Guoqing Jiang; +Cc: linux-raid, shli, neilb
In-Reply-To: <1487906124-20107-9-git-send-email-gqjiang@suse.com>

On Fri, Feb 24, 2017 at 11:15:18AM +0800, Guoqing Jiang wrote:
> After used sync way to handle METADATA_UPDATED msg, a deadlock
> could appear if stop a resyncing array.

shouldn't this put into the patch of 'handle METADATA_UPDATED' msg?
 
> betalinux244:~ # ps aux|grep md|grep D
> root     17164  0.0  0.0      0     0 ?        D    Jan09   0:00 [md0_cluster_rec]
> root     18151  0.0  0.1  19852  2008 ?        Ds   Jan09   0:00 /sbin/mdadm -Ssq
> betalinux244:~ # cat /proc/17164/stack
> [<ffffffffa06a7395>] recv_daemon+0x1f5/0x590 [md_cluster]
> [<ffffffffa067be20>] md_thread+0x130/0x150 [md_mod]
> [<ffffffff810995ed>] kthread+0xbd/0xe0
> [<ffffffff815e96bf>] ret_from_fork+0x3f/0x70
> [<ffffffff81099530>] kthread+0x0/0xe0
> [<ffffffffffffffff>] 0xffffffffffffffff
> betalinux244:~ # cat /proc/18151/stack
> [<ffffffff81099879>] kthread_stop+0x59/0x130
> [<ffffffffa067c566>] md_unregister_thread+0x46/0x80 [md_mod]
> [<ffffffffa06a6e71>] leave+0x81/0x120 [md_cluster]
> [<ffffffffa0684e94>] md_cluster_stop+0x14/0x30 [md_mod]
> [<ffffffffa06858b6>] bitmap_free+0x126/0x130 [md_mod]
> [<ffffffffa0682d06>] do_md_stop+0x356/0x5f0 [md_mod]
> [<ffffffffa0683cbe>] md_ioctl+0x6fe/0x1680 [md_mod]
> [<ffffffff812ed158>] blkdev_ioctl+0x258/0x920
> [<ffffffff8122f81d>] block_ioctl+0x3d/0x40
> [<ffffffff8120ac0d>] do_vfs_ioctl+0x2cd/0x4a0
> [<ffffffff8120ae54>] SyS_ioctl+0x74/0x80
> [<ffffffff815e936e>] entry_SYSCALL_64_fastpath+0x12/0x6d
> [<ffffffffffffffff>] 0xffffffffffffffff
> 
> Since md_unregister_thread(&cinfo->recv_thread) is blocked by
> recv_daemon -> process_recvd_msg -> process_metadata_update.
> To resolve the issue, MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD is
> also need to be set before unregister thread.
> 
> Reviewed-by: NeilBrown <neilb@suse.com>
> Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
> ---
>  drivers/md/md-cluster.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
> index 0aad477d1b20..5e2c54be6f30 100644
> --- a/drivers/md/md-cluster.c
> +++ b/drivers/md/md-cluster.c
> @@ -932,6 +932,7 @@ static int join(struct mddev *mddev, int nodes)
>  
>  	return 0;
>  err:
> +	set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
>  	md_unregister_thread(&cinfo->recovery_thread);
>  	md_unregister_thread(&cinfo->recv_thread);
>  	lockres_free(cinfo->message_lockres);
> @@ -987,6 +988,7 @@ static int leave(struct mddev *mddev)
>  	if (cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector)
>  		resync_bitmap(mddev);
>  
> +	set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
>  	md_unregister_thread(&cinfo->recovery_thread);
>  	md_unregister_thread(&cinfo->recv_thread);
>  	lockres_free(cinfo->message_lockres);
> -- 
> 2.6.2
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 09/14] md-cluster: set MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD in metadata_update_start
From: Shaohua Li @ 2017-02-28 18:24 UTC (permalink / raw)
  To: Guoqing Jiang; +Cc: linux-raid, shli, neilb
In-Reply-To: <1487906124-20107-10-git-send-email-gqjiang@suse.com>

On Fri, Feb 24, 2017 at 11:15:19AM +0800, Guoqing Jiang wrote:
> Since we have switched to sync way to handle METADATA_UPDATED
> msg, then process_metadata_update can only continue with either
> get the reconfig_mutex or MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD is
> set.

same here. Either the patch should be put into the 'METADATA_UPDATED' patch or
this patch should be before the 'METADATA_UPDATED' patch.

We shouldn't introduce a broken patch then fix it in latter patch.
 
> However, there is a deadlock issue which happened as follows:
> 
> 1. Node A sends METADATA_UPDATED msg (held Token lock).
> 2. Node B wants to do resync, and is blocked since it can't
>    get Token lock, but MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD is
>    not set since the callchain
>    (md_do_sync -> sync_request
>                -> resync_info_update
> 	       -> sendmsg
> 	       -> lock_comm -> lock_token)
>    doesn't hold reconfig_mutex.
> 3. Node B trys to update sb (held reconfig_mutex), but stopped
>    at wait_event() in metadata_update_start since we have set
>    MD_CLUSTER_SEND_LOCK flag in lock_comm (step 2).
> 4. Then Node B receives METADATA_UPDATED msg from A, of course
>    recv_daemon is blocked forever.
> 
> The followings are more detailed infos.
> betalinux240:~ # cat /proc/mdstat
> Personalities : [raid1]
> md0 : active raid1 vdc[1] vdb[0]
> 2095104 blocks super 1.2 [2/2] [UU]
> [>....................]  resync =  4.6% (98112/2095104) finish=28.8min speed=1154K/sec
> bitmap: 1/1 pages [4KB], 65536KB chunk
> 
> unused devices: <none>
> betalinux240:~ # ps aux|grep md|grep D
> root      4393  0.0  0.0      0     0 ?        D    11:42   0:00 [md0_raid1]
> root      4402  0.0  0.0      0     0 ?        D    11:42   0:00 [md0_cluster_rec]
> root      4407  0.0  0.0      0     0 ?        D    11:42   0:00 [md0_resync]
> betalinux240:~ # cat /proc/4407/stack
> [<ffffffffa05eb531>] dlm_lock_sync+0x81/0xc0 [md_cluster]
> [<ffffffffa05eba23>] lock_token+0x23/0xa0 [md_cluster]
> [<ffffffffa05ebad2>] lock_comm+0x32/0x90 [md_cluster]
> [<ffffffffa05ebb45>] sendmsg+0x15/0x30 [md_cluster]
> [<ffffffffa05ebd5a>] resync_info_update+0x8a/0xc0 [md_cluster]
> [<ffffffffa06b6ae7>] sync_request+0xa57/0xaf0 [raid1]
> [<ffffffffa069825d>] md_do_sync+0x90d/0xe70 [md_mod]
> [<ffffffffa0694c50>] md_thread+0x130/0x150 [md_mod]
> [<ffffffff810995ed>] kthread+0xbd/0xe0
> [<ffffffff815e96bf>] ret_from_fork+0x3f/0x70
> [<ffffffff81099530>] kthread+0x0/0xe0
> betalinux240:~ # cat /proc/4402/stack
> [<ffffffffa05ecf77>] recv_daemon+0x1d7/0x570 [md_cluster]
> [<ffffffffa0694c50>] md_thread+0x130/0x150 [md_mod]
> [<ffffffff810995ed>] kthread+0xbd/0xe0
> [<ffffffff815e96bf>] ret_from_fork+0x3f/0x70
> [<ffffffff81099530>] kthread+0x0/0xe0
> betalinux240:~ # cat /proc/4393/stack
> [<ffffffffa05ec0e5>] metadata_update_start+0xa5/0xb0 [md_cluster]
> [<ffffffffa069913e>] md_update_sb.part.50+0x8e/0x850 [md_mod]
> [<ffffffffa069ab1d>] md_check_recovery+0x21d/0x4e0 [md_mod]
> [<ffffffffa06b9322>] raid1d+0x42/0x7d0 [raid1]
> [<ffffffffa0694c50>] md_thread+0x130/0x150 [md_mod]
> [<ffffffff810995ed>] kthread+0xbd/0xe0
> [<ffffffff815e96bf>] ret_from_fork+0x3f/0x70
> [<ffffffff81099530>] kthread+0x0/0xe0
> 
> Since metadata_update_start always calls lock_token with
> reconfig_mutex, we need to set MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD
> here as well, and lock_token don't need to set it twice unless
> lock_token is invoked from lock_comm.
> 
> Reviewed-by: NeilBrown <neilb@suse.com>
> Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
> ---
>  drivers/md/md-cluster.c | 26 +++++++++++++++++++++-----
>  1 file changed, 21 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
> index 5e2c54be6f30..bcc269312b71 100644
> --- a/drivers/md/md-cluster.c
> +++ b/drivers/md/md-cluster.c
> @@ -654,7 +654,7 @@ static void recv_daemon(struct md_thread *thread)
>   */
>  static int lock_token(struct md_cluster_info *cinfo, bool mddev_locked)
>  {
> -	int error;
> +	int error, set_bit = 0;
>  	struct mddev *mddev = cinfo->mddev;
>  
>  	/*
> @@ -663,14 +663,16 @@ static int lock_token(struct md_cluster_info *cinfo, bool mddev_locked)
>  	 * since another node already got EX on Token and waitting the EX of Ack),
>  	 * so let resync wake up thread in case flag is set.
>  	 */
> -	if (mddev_locked) {
> +	if (mddev_locked && !test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
> +				      &cinfo->state)) {
>  		error = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
>  					      &cinfo->state);
>  		WARN_ON_ONCE(error);
>  		md_wakeup_thread(mddev->thread);
> +		set_bit = 1;
>  	}
>  	error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
> -	if (mddev_locked)
> +	if (set_bit)
>  		clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
>  
>  	if (error)
> @@ -1023,16 +1025,30 @@ static int slot_number(struct mddev *mddev)
>  static int metadata_update_start(struct mddev *mddev)
>  {
>  	struct md_cluster_info *cinfo = mddev->cluster_info;
> +	int ret;
> +
> +	/*
> +	 * metadata_update_start is always called with the protection of
> +	 * reconfig_mutex, so set WAITING_FOR_TOKEN here.
> +	 */
> +	ret = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
> +				    &cinfo->state);
> +	WARN_ON_ONCE(ret);
> +	md_wakeup_thread(mddev->thread);
>  
>  	wait_event(cinfo->wait,
>  		   !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state) ||
>  		   test_and_clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state));
>  
>  	/* If token is already locked, return 0 */
> -	if (cinfo->token_lockres->mode == DLM_LOCK_EX)
> +	if (cinfo->token_lockres->mode == DLM_LOCK_EX) {
> +		clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
>  		return 0;
> +	}
>  
> -	return lock_token(cinfo, 1);
> +	ret = lock_token(cinfo, 1);
> +	clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
> +	return ret;
>  }
>  
>  static int metadata_update_finish(struct mddev *mddev)
> -- 
> 2.6.2
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 11/14] md-cluster: introduce cluster_check_sync_size
From: Shaohua Li @ 2017-02-28 19:05 UTC (permalink / raw)
  To: Guoqing Jiang; +Cc: linux-raid, shli, neilb
In-Reply-To: <1487906124-20107-12-git-send-email-gqjiang@suse.com>

On Fri, Feb 24, 2017 at 11:15:21AM +0800, Guoqing Jiang wrote:
> Support resize is a little complex for clustered
> raid, since we need to ensure all the nodes share
> the same knowledge about the size of raid.
> 
> We achieve the goal by check the sync_size which
> is in each node's bitmap, we can only change the
> capacity after cluster_check_sync_size returns 0.
> 
> Also, get_bitmap_from_slot is added to get a slot's
> bitmap. And we exported some funcs since they are
> used in cluster_check_sync_size().
> 
> Reviewed-by: NeilBrown <neilb@suse.com>
> Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
> ---
>  drivers/md/bitmap.c     | 25 ++++++++++++++++++++-
>  drivers/md/bitmap.h     |  2 ++
>  drivers/md/md-cluster.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 84 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
> index 9fb2ccac958a..67a7d399f501 100644
> --- a/drivers/md/bitmap.c
> +++ b/drivers/md/bitmap.c
> @@ -471,6 +471,7 @@ void bitmap_update_sb(struct bitmap *bitmap)
>  	kunmap_atomic(sb);
>  	write_page(bitmap, bitmap->storage.sb_page, 1);
>  }
> +EXPORT_SYMBOL(bitmap_update_sb);
>  
>  /* print out the bitmap file superblock */
>  void bitmap_print_sb(struct bitmap *bitmap)
> @@ -1727,7 +1728,7 @@ void bitmap_flush(struct mddev *mddev)
>  /*
>   * free memory that was allocated
>   */
> -static void bitmap_free(struct bitmap *bitmap)
> +void bitmap_free(struct bitmap *bitmap)
>  {
>  	unsigned long k, pages;
>  	struct bitmap_page *bp;
> @@ -1761,6 +1762,7 @@ static void bitmap_free(struct bitmap *bitmap)
>  	kfree(bp);
>  	kfree(bitmap);
>  }
> +EXPORT_SYMBOL(bitmap_free);
>  
>  void bitmap_destroy(struct mddev *mddev)
>  {
> @@ -1920,6 +1922,27 @@ int bitmap_load(struct mddev *mddev)
>  }
>  EXPORT_SYMBOL_GPL(bitmap_load);
>  
> +struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot)
> +{
> +	int rv = 0;
> +	struct bitmap *bitmap;
> +
> +	bitmap = bitmap_create(mddev, slot);
> +	if (IS_ERR(bitmap)) {
> +		rv = PTR_ERR(bitmap);
> +		return ERR_PTR(rv);
> +	}
> +
> +	rv = bitmap_init_from_disk(bitmap, 0);
> +	if (rv) {
> +		bitmap_free(bitmap);
> +		return ERR_PTR(rv);
> +	}
> +
> +	return bitmap;
> +}
> +EXPORT_SYMBOL(get_bitmap_from_slot);
> +
>  /* Loads the bitmap associated with slot and copies the resync information
>   * to our bitmap
>   */
> diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
> index 5b6dd63dda91..9f761097aab2 100644
> --- a/drivers/md/bitmap.h
> +++ b/drivers/md/bitmap.h
> @@ -267,8 +267,10 @@ void bitmap_daemon_work(struct mddev *mddev);
>  
>  int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
>  		  int chunksize, int init);
> +struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot);
>  int bitmap_copy_from_slot(struct mddev *mddev, int slot,
>  				sector_t *lo, sector_t *hi, bool clear_bits);
> +void bitmap_free(struct bitmap *bitmap);
>  #endif
>  
>  #endif
> diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
> index f1b9a7a3ddd2..d3c024e6bfcf 100644
> --- a/drivers/md/md-cluster.c
> +++ b/drivers/md/md-cluster.c
> @@ -1089,6 +1089,64 @@ static void metadata_update_cancel(struct mddev *mddev)
>  	unlock_comm(cinfo);
>  }
>  
> +/*
> + * retun 0 if all the bitmaps have the same sync_size
> + */
> +int cluster_check_sync_size(struct mddev *mddev)
> +{
> +	int i, rv;
> +	bitmap_super_t *sb;
> +	unsigned long my_sync_size, sync_size = 0;
> +	int node_num = mddev->bitmap_info.nodes;
> +	int current_slot = md_cluster_ops->slot_number(mddev);
> +	struct bitmap *bitmap = mddev->bitmap;
> +	char str[64];
> +	struct dlm_lock_resource *bm_lockres;
> +
> +	sb = kmap_atomic(bitmap->storage.sb_page);
> +	my_sync_size = sb->sync_size;
> +	kunmap_atomic(sb);
> +
> +	for (i = 0; i < node_num; i++) {
> +		if (i == current_slot)
> +			continue;
> +
> +		bitmap = get_bitmap_from_slot(mddev, i);
> +		if (IS_ERR(bitmap)) {
> +			pr_err("can't get bitmap from slot %d\n", i);
> +			return -1;
> +		}
> +
> +		/*
> +		 * If we can hold the bitmap lock of one node then
> +		 * the slot is not occupied, update the sb.
> +		 */
> +		snprintf(str, 64, "bitmap%04d", i);
> +		bm_lockres = lockres_init(mddev, str, NULL, 1);
> +		if (!bm_lockres) {
> +			pr_err("md-cluster: Cannot initialize %s\n", str);
> +		}

just print error here?

> +		bm_lockres->flags |= DLM_LKF_NOQUEUE;
> +		rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
> +		if (!rv)
> +			bitmap_update_sb(bitmap);

always the sb even the sync_size is the same?


^ permalink raw reply

* Re: [PATCH 12/14] md/bitmap: replace redundant codes with get_bitmap_from_slot
From: Shaohua Li @ 2017-02-28 19:06 UTC (permalink / raw)
  To: Guoqing Jiang; +Cc: linux-raid, shli, neilb
In-Reply-To: <1487906124-20107-13-git-send-email-gqjiang@suse.com>

On Fri, Feb 24, 2017 at 11:15:22AM +0800, Guoqing Jiang wrote:
> Since get_bitmap_from_slot is introduced in previous
> commit, we can use it in bitmap_copy_from_slot to
> remove redundant code.

this should be merged into patch 11
 
> Reviewed-by: NeilBrown <neilb@suse.com>
> Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
> ---
>  drivers/md/bitmap.c | 16 +++++++---------
>  1 file changed, 7 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
> index 67a7d399f501..b6fa55a3cff8 100644
> --- a/drivers/md/bitmap.c
> +++ b/drivers/md/bitmap.c
> @@ -1952,14 +1952,13 @@ int bitmap_copy_from_slot(struct mddev *mddev, int slot,
>  	int rv = 0, i, j;
>  	sector_t block, lo = 0, hi = 0;
>  	struct bitmap_counts *counts;
> -	struct bitmap *bitmap = bitmap_create(mddev, slot);
> -
> -	if (IS_ERR(bitmap))
> -		return PTR_ERR(bitmap);
> +	struct bitmap *bitmap;
>  
> -	rv = bitmap_init_from_disk(bitmap, 0);
> -	if (rv)
> -		goto err;
> +	bitmap = get_bitmap_from_slot(mddev, slot);
> +	if (IS_ERR(bitmap)) {
> +		pr_err("%s can't get bitmap from slot %d\n", __func__, slot);
> +		return -1;
> +	}
>  
>  	counts = &bitmap->counts;
>  	for (j = 0; j < counts->chunks; j++) {
> @@ -1986,8 +1985,7 @@ int bitmap_copy_from_slot(struct mddev *mddev, int slot,
>  	bitmap_unplug(mddev->bitmap);
>  	*low = lo;
>  	*high = hi;
> -err:
> -	bitmap_free(bitmap);
> +
>  	return rv;
>  }
>  EXPORT_SYMBOL_GPL(bitmap_copy_from_slot);
> -- 
> 2.6.2
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 14/14] md-cluster: add the support for resize
From: Shaohua Li @ 2017-02-28 19:25 UTC (permalink / raw)
  To: Guoqing Jiang; +Cc: linux-raid, shli, neilb
In-Reply-To: <1487906124-20107-15-git-send-email-gqjiang@suse.com>

On Fri, Feb 24, 2017 at 11:15:24AM +0800, Guoqing Jiang wrote:
> To update size for cluster raid, we need to make
> sure all nodes can perform the change successfully.
> However, it is possible that some of them can't do
> it due to failure (bitmap_resize could fail). So
> we need to consider the issue before we set the
> capacity unconditionally, and we use below steps
> to perform sanity check.
> 
> 1. A change the size, then broadcast METADATA_UPDATED
>    msg.
> 2. B and C receive METADATA_UPDATED change the size
>    excepts call set_capacity, sync_size is not update
>    if the change failed. Also call bitmap_update_sb
>    to sync sb to disk.
> 3. A checks other node's sync_size, if sync_size has
>    been updated in all nodes, then send CHANGE_CAPACITY
>    msg otherwise send msg to revert previous change.
> 4. B and C call set_capacity if receive CHANGE_CAPACITY
>    msg, otherwise pers->resize will be called to restore
>    the old value.
> 
> Reviewed-by: NeilBrown <neilb@suse.com>
> Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
> ---
>  Documentation/md/md-cluster.txt |  2 +-
>  drivers/md/md-cluster.c         | 75 +++++++++++++++++++++++++++++++++++++++++
>  drivers/md/md-cluster.h         |  1 +
>  drivers/md/md.c                 | 21 +++++++++---
>  4 files changed, 93 insertions(+), 6 deletions(-)
> 
> diff --git a/Documentation/md/md-cluster.txt b/Documentation/md/md-cluster.txt
> index 38883276d31c..2663d49dd8a0 100644
> --- a/Documentation/md/md-cluster.txt
> +++ b/Documentation/md/md-cluster.txt
> @@ -321,4 +321,4 @@ The algorithm is:
>  
>  There are somethings which are not supported by cluster MD yet.
>  
> -- update size and change array_sectors.
> +- change array_sectors.
> diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
> index d3c024e6bfcf..75da83187c31 100644
> --- a/drivers/md/md-cluster.c
> +++ b/drivers/md/md-cluster.c
> @@ -1147,6 +1147,80 @@ int cluster_check_sync_size(struct mddev *mddev)
>  	return (my_sync_size == sync_size) ? 0 : -1;
>  }
>  
> +/*
> + * Update the size for cluster raid is a little more complex, we perform it
> + * by the steps:
> + * 1. hold token lock and update superblock in initiator node.
> + * 2. send METADATA_UPDATED msg to other nodes.
> + * 3. The initiator node continues to check each bitmap's sync_size, if all
> + *    bitmaps have the same value of sync_size, then we can set capacity and
> + *    let other nodes to perform it. If one node can't update sync_size
> + *    accordingly, we need to revert to previous value.
> + */
> +static void update_size(struct mddev *mddev, sector_t old_dev_sectors)
> +{
> +	struct md_cluster_info *cinfo = mddev->cluster_info;
> +	struct cluster_msg cmsg;
> +	struct md_rdev *rdev;
> +	int ret = 0;
> +	int raid_slot = -1;
> +
> +	md_update_sb(mddev, 1);
> +	lock_comm(cinfo, 1);
> +
> +	memset(&cmsg, 0, sizeof(cmsg));
> +	cmsg.type = cpu_to_le32(METADATA_UPDATED);
> +	rdev_for_each(rdev, mddev)
> +		if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags)) {
> +			raid_slot = rdev->desc_nr;
> +			break;
> +		}
> +	if (raid_slot >= 0) {
> +		cmsg.raid_slot = cpu_to_le32(raid_slot);
> +		/*
> +		 * We can only change capiticy after all the nodes can do it,
> +		 * so need to wait after other nodes already received the msg
> +		 * and handled the change
> +		 */
> +		ret = __sendmsg(cinfo, &cmsg);
> +		if (ret) {
> +			pr_err("%s:%d: failed to send METADATA_UPDATED msg\n",
> +			       __func__, __LINE__);
> +			unlock_comm(cinfo);
> +			return;
> +		}
> +	} else {
> +		pr_err("md-cluster: No good device id found to send\n");
> +		unlock_comm(cinfo);
> +		return;
> +	}
> +
> +	/*
> +	 * check the sync_size from other node's bitmap, if sync_size
> +	 * have already updated in other nodes as expected, send an
> +	 * empty metadata msg to permit the change of capacity
> +	 */
> +	if (cluster_check_sync_size(mddev) == 0) {
> +		memset(&cmsg, 0, sizeof(cmsg));
> +		cmsg.type = cpu_to_le32(CHANGE_CAPACITY);
> +		ret = __sendmsg(cinfo, &cmsg);
> +		if (ret)
> +			pr_err("%s:%d: failed to send CHANGE_CAPACITY msg\n",
> +			       __func__, __LINE__);
> +		set_capacity(mddev->gendisk, mddev->array_sectors);

don't call revalidate_disk here? And why don't move the gendisk related stuff to md.c.

> +	} else {
> +		/* revert to previous sectors */
> +		ret = mddev->pers->resize(mddev, old_dev_sectors);
> +		if (!ret)
> +			revalidate_disk(mddev->gendisk);
> +		ret = __sendmsg(cinfo, &cmsg);
> +		if (ret)
> +			pr_err("%s:%d: failed to send METADATA_UPDATED msg\n",
> +			       __func__, __LINE__);
> +	}
> +	unlock_comm(cinfo);
> +}
> +
>  static int resync_start(struct mddev *mddev)
>  {
>  	struct md_cluster_info *cinfo = mddev->cluster_info;
> @@ -1392,6 +1466,7 @@ static struct md_cluster_operations cluster_ops = {
>  	.gather_bitmaps = gather_bitmaps,
>  	.lock_all_bitmaps = lock_all_bitmaps,
>  	.unlock_all_bitmaps = unlock_all_bitmaps,
> +	.update_size = update_size,
>  };
>  
>  static int __init cluster_init(void)
> diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
> index 8f26a5e80810..e939c14222ba 100644
> --- a/drivers/md/md-cluster.h
> +++ b/drivers/md/md-cluster.h
> @@ -25,6 +25,7 @@ struct md_cluster_operations {
>  	int (*gather_bitmaps)(struct md_rdev *rdev);
>  	int (*lock_all_bitmaps)(struct mddev *mddev);
>  	void (*unlock_all_bitmaps)(struct mddev *mddev);
> +	void (*update_size)(struct mddev *mddev, sector_t old_dev_sectors);
>  };
>  
>  #endif /* _MD_CLUSTER_H */
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 975c9dd60c05..2bc64059ff57 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -6503,10 +6503,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
>  	struct md_rdev *rdev;
>  	int rv;
>  	int fit = (num_sectors == 0);
> -
> -	/* cluster raid doesn't support update size */
> -	if (mddev_is_clustered(mddev))
> -		return -EINVAL;
> +	sector_t old_dev_sectors = mddev->dev_sectors;
>  
>  	if (mddev->pers->resize == NULL)
>  		return -EINVAL;
> @@ -6535,7 +6532,9 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
>  	}
>  	rv = mddev->pers->resize(mddev, num_sectors);
>  	if (!rv) {
> -		if (mddev->queue) {
> +		if (mddev_is_clustered(mddev))
> +			md_cluster_ops->update_size(mddev, old_dev_sectors);
> +		else if (mddev->queue) {
>  			set_capacity(mddev->gendisk, mddev->array_sectors);
>  			revalidate_disk(mddev->gendisk);
>  		}
> @@ -8753,6 +8752,18 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
>  	int role, ret;
>  	char b[BDEVNAME_SIZE];
>  
> +	/*
> +	 * If size is changed in another node then we need to
> +	 * do resize as well.
> +	 */
> +	if (mddev->dev_sectors != le64_to_cpu(sb->size)) {
> +		ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
> +		if (ret)
> +			pr_info("md-cluster: resize failed\n");
> +		else
> +			bitmap_update_sb(mddev->bitmap);
> +	}

I'm confused, who will trigger this? The patch 10 only calls set_capacity.
Please describe the details in each node. Also please add it to md-cluster.txt
document.

Thanks,
Shaohua

^ permalink raw reply

* Re: [PATCH 00/14] the latest changes for md-cluster
From: Shaohua Li @ 2017-02-28 19:27 UTC (permalink / raw)
  To: Guoqing Jiang; +Cc: linux-raid, shli, neilb
In-Reply-To: <1487906124-20107-1-git-send-email-gqjiang@suse.com>

On Fri, Feb 24, 2017 at 11:15:10AM +0800, Guoqing Jiang wrote:
> Hi,
> 
> This patchset is based on md/for-next branch, which includes below parts:
> 
> 1. some code improvements (0001-0004).
> 2. changes for use sync way to handle METADATA_UPDATED msg (0005-0009).
> 3. add resize support for md-cluster (0010-0012 and 0014), and also
>    make some related changes inside md (0013).

Applied patch 2, 3, 13. Replied to other patches.

BTW, is it possible you can create a test suite people can verify the md-cluster stuff?

Thanks,
Shaohua

^ permalink raw reply

* Re: [PATCH V3 1/2] RAID1: a new I/O barrier implementation to remove resync window
From: Shaohua Li @ 2017-02-28 19:42 UTC (permalink / raw)
  To: 王金浦
  Cc: Coly Li, NeilBrown, linux-raid, Shaohua Li, Johannes Thumshirn,
	Guoqing Jiang
In-Reply-To: <CAD9gYJ+FqnFbJp3sY9k5j=ZtJiN6qPivJ665hJPX94eWaAMkJA@mail.gmail.com>

On Fri, Feb 24, 2017 at 11:19:05AM +0100, 王金浦 wrote:
> Hi Coly, Hi Shaohua,
> 
> 
> >
> > Hi Shaohua,
> >
> > I try to catch up with you, let me try to follow your mind by the
> > split-in-while-loop condition (this is my new I/O barrier patch). I
> > assume the original BIO is a write bio, and original bio is split and
> > handled in a while loop in raid1_make_request().
> 
> It's still possible for read bio. We hit a deadlock in the past.
> See https://patchwork.kernel.org/patch/9498949/
> 
> Also:
> http://www.spinics.net/lists/raid/msg52792.html

Thanks Jinpu. So this is for the read side, where we don't have plug stuff.
Yep, that finally makes sense. It's my fault I didn't look at the read side
code carefully. Looks we need the patch Neil suggested.

Thanks,
Shaohua

^ permalink raw reply

* Re: WARNING: mismatch_cnt is not 0 on <array device> [SOLVED?]
From: Benjammin2068 @ 2017-02-28 19:50 UTC (permalink / raw)
  To: Linux-RAID
In-Reply-To: <58223D29.6030401@youngman.org.uk>

As an interesting update for everyone on this thread....

I replaced all my drives with a single type/speed (WD Red) and now that the older Samsung HD103SJ (now Seagate) drives have been gone, I haven't had any mismatch_cnt hiccups with that marvell JBOD MV8 controller.

Just thought I'd offer that as a current data point.

Cheers,

 -Ben

^ permalink raw reply

* Re: [BUG] non-metadata arrays cannot use more than 27 component devices
From: NeilBrown @ 2017-02-28 20:29 UTC (permalink / raw)
  To: ian_bruce; +Cc: linux-raid
In-Reply-To: <20170228022513.2cf5445a.ian_bruce@mail.ru>

[-- Attachment #1: Type: text/plain, Size: 3853 bytes --]

On Tue, Feb 28 2017, ian_bruce@mail.ru wrote:

> On Mon, 27 Feb 2017 16:55:56 +1100
> NeilBrown <neilb@suse.com> wrote:
>
>>> When assembling non-metadata arrays ("mdadm --build"), the in-kernel
>>> superblock apparently defaults to the MD-RAID v0.90 type. This
>>> imposes a maximum of 27 component block devices, presumably as well
>>> as limits on device size.
>>>
>>> mdadm does not allow you to override this default, by specifying the
>>> v1.2 superblock. It is not clear whether mdadm tells the kernel to
>>> use the v0.90 superblock, or the kernel assumes this by itself. One
>>> or other of them should be fixed; there does not appear to be any
>>> reason why the v1.2 superblock should not be the default in this
>>> case.
>> 
>> Can you see if this change improves the behavior for you?
>
> Unfortunately, I'm not set up for kernel compilation at the moment. But
> here is my test case; it shouldn't be any harder to reproduce than this,
> on extremely ordinary hardware (= no actual disk RAID array):
>
>
> # truncate -s 64M img64m.{00..31}   # requires no space on ext4,
> #                                   # because sparse files are created
> # 
> # ls img64m.*
> img64m.00  img64m.04  img64m.08  img64m.12  img64m.16  img64m.20  img64m.24  img64m.28
> img64m.01  img64m.05  img64m.09  img64m.13  img64m.17  img64m.21  img64m.25  img64m.29
> img64m.02  img64m.06  img64m.10  img64m.14  img64m.18  img64m.22  img64m.26  img64m.30
> img64m.03  img64m.07  img64m.11  img64m.15  img64m.19  img64m.23  img64m.27  img64m.31
> # 
> # RAID=$(for x in img64m.* ; do losetup --show -f $x ; done)
> # 
> # echo $RAID
> /dev/loop0 /dev/loop1 /dev/loop2 /dev/loop3 /dev/loop4 /dev/loop5 /dev/loop6 /dev/loop7
> /dev/loop8 /dev/loop9 /dev/loop10 /dev/loop11 /dev/loop12 /dev/loop13 /dev/loop14 /dev/loop15
> /dev/loop16 /dev/loop17 /dev/loop18 /dev/loop19 /dev/loop20 /dev/loop21 /dev/loop22 /dev/loop23
> /dev/loop24 /dev/loop25 /dev/loop26 /dev/loop27 /dev/loop28 /dev/loop29 /dev/loop30 /dev/loop31
> # 
> # mdadm --build /dev/md/md-test --level=linear --raid-devices=32 $RAID
> mdadm: ADD_NEW_DISK failed for /dev/loop27: Device or resource busy
> # 

Thanks.  That makes it easy.
Test works with my patch applied.
....

>
>> diff --git a/drivers/md/md.c b/drivers/md/md.c
>> index ba485dcf1064..e0ac7f5a8e68 100644
>> --- a/drivers/md/md.c
>> +++ b/drivers/md/md.c
>> @@ -6464,9 +6464,8 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
>>  	mddev->layout        = info->layout;
>>  	mddev->chunk_sectors = info->chunk_size >> 9;
>>  
>> -	mddev->max_disks     = MD_SB_DISKS;
>> -
>>  	if (mddev->persistent) {
>> +		mddev->max_disks     = MD_SB_DISKS;
>>  		mddev->flags         = 0;
>>  		mddev->sb_flags         = 0;
>>  	}
>
> What value does mddev->max_disks get in the opposite case,
> (!mddev->persistent) ?

Default value is zero, which causes no limit to be imposed.

>
> I note this comment from the top of the function:
>
>     * set_array_info is used two different ways
>     * The original usage is when creating a new array.
>     * In this usage, raid_disks is > 0 and it together with
>     *  level, size, not_persistent,layout,chunksize determine the
>     *  shape of the array.
>     *  This will always create an array with a type-0.90.0 superblock.

Unfortunately you cannot always trust comments.  They are more like hints.

>
> http://lxr.free-electrons.com/source/drivers/md/md.c#L6410
>
> Surely there is an equivalent function which creates arrays with a
> type-1 superblock?

Not really.  type-1 superblock are created from userspace by mdadm.
mdadm then tells the kernel "here are some devices that form an array".
md reads the devices, finds the type-1 metadata, and proceeds.

Thanks,
NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply

* [PATCH] md: don't impose the MD_SB_DISKS limit on arrays without metadata.
From: NeilBrown @ 2017-02-28 20:31 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, ian_bruce

[-- Attachment #1: Type: text/plain, Size: 834 bytes --]


These arrays, created with "mdadm --build" don't benefit from a limit.
The default will be used, which is '0' and is interpreted as "don't
impose a limit".

Reported-by: ian_bruce@mail.ru
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index ba485dcf1064..e0ac7f5a8e68 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6464,9 +6464,8 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
 	mddev->layout        = info->layout;
 	mddev->chunk_sectors = info->chunk_size >> 9;
 
-	mddev->max_disks     = MD_SB_DISKS;
-
 	if (mddev->persistent) {
+		mddev->max_disks     = MD_SB_DISKS;
 		mddev->flags         = 0;
 		mddev->sb_flags         = 0;
 	}
-- 
2.11.0


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply related

* RE: GRUB warning after replacing disk drive in RAID1
From: Peter Sangas @ 2017-02-28 21:01 UTC (permalink / raw)
  To: 'Reindl Harald', linux-raid
In-Reply-To: <ab6d048b-b944-6661-11a5-503012dfad50@thelounge.net>

But I issue the grub command AFTER the re-sync is completed.

-----Original Message-----
From: Reindl Harald [mailto:h.reindl@thelounge.net] 
Sent: Tuesday, February 28, 2017 1:23 AM
To: linux-raid@vger.kernel.org
Subject: Re: GRUB warning after replacing disk drive in RAID1



Am 28.02.2017 um 00:37 schrieb Peter Sangas:
> I have a RAID1 with 3 disks sda,sdb,sdc.  After replacing sdc and 
> re-syncing it to the array I issued the following command to load grub 
> but I get this
> warning:
>
> grub-install /dev/sdc
>
> Installing for i386-pc platform.
> grub-install: warning: Couldn't find physical volume `(null)'. Some 
> modules may be missing from core image..
> grub-install: warning: Couldn't find physical volume `(null)'. Some 
> modules may be missing from core image..
> Installation finished. No error reported.
>
> Does anyone know why I get this warning and how to avoid it

it's harmless and disappears after the resync finished
--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in the
body of a message to majordomo@vger.kernel.org More majordomo info at
http://vger.kernel.org/majordomo-info.html


^ permalink raw reply

* Re: [RFC PATCH v4] IV Generation algorithms for dm-crypt
From: Milan Broz @ 2017-02-28 21:05 UTC (permalink / raw)
  To: Binoy Jayan
  Cc: Rajendra, Herbert Xu, Oded, Mike Snitzer,
	Linux kernel mailing list, Ondrej Mosnacek, linux-raid,
	Gilad Ben-Yossef, dm-devel, Mark Brown, Arnd Bergmann,
	linux-crypto, Shaohua Li, David S. Miller, Alasdair Kergon, Ofir
In-Reply-To: <CAHv-k_8rgArzV2uQ64h1ZTNRpssX1jMf=RwuPoBmsjQ0FhCsWA@mail.gmail.com>

On 02/22/2017 07:12 AM, Binoy Jayan wrote:
> 
> I was wondering if this is near to be ready for submission (apart from
> the testmgr.c
> changes) or I need to make some changes to make it similar to the IPSec offload?

I just tried this and except it registers the IV for every new device again, it works...
(After a while you have many duplicate entries in /proc/crypto.)

But I would like to see some summary why such a big patch is needed in the first place.
(During an internal discussions seems that people are already lost in mails and
patches here, so Ondra promised me to send some summary mail soon here.)

IIRC the first initial problem was dmcrypt performance on some embedded
crypto processors that are not able to cope with small crypto requests effectively.

Do you have some real performance numbers that proves that such a patch is adequate?

I would really like to see the performance issue fixed but I am really not sure
this approach works for everyone. It would be better to avoid repeating this exercise later.
IIRC Ondra's "bulk" mode, despite rejected, shows that there is a potential
to speedup things even for crypt drivers that do not support own IV generators.

I like the patch is now contained inside dmcrypt, but it still exposes IVs that
are designed just for old, insecure, compatibility-only containers.

I really do not think every compatible crap must be accessible through crypto API.
(I wrote the dmcrypt lrw and tcw compatibility IVs and I would never do that this way
if I know it is accessible outside of dmcrypt internals...)
Even the ESSIV is something that was born to fix predictive IVs (CBC watermarking
attacks) for disk encryption only, no reason to expose it outside of disk encryption.

Milan

^ permalink raw reply

* [PATCH] md/raid1/10: fix potential deadlock
From: Shaohua Li @ 2017-02-28 21:08 UTC (permalink / raw)
  To: linux-raid
  Cc: neilb, Coly Li, 王金浦,
	v3.14+, only the raid10 part

Neil Brown pointed out a potential deadlock in raid 10 code with
bio_split/chain. The raid1 code could have the same issue, but recent
barrier rework makes it less likely to happen. The deadlock happens in
below sequence:

1. generic_make_request(bio), this will set current->bio_list
2. raid10_make_request will split bio to bio1 and bio2
3. __make_request(bio1), wait_barrer, add underlayer disk bio to
current->bio_list
4. __make_request(bio2), wait_barrer

If raise_barrier happens between 3 & 4, since wait_barrier runs at 3,
raise_barrier waits for IO completion from 3. And since raise_barrier
sets barrier, 4 waits for raise_barrier. But IO from 3 can't be
dispatched because raid10_make_request() doesn't finished yet.

The solution is to adjust the IO ordering. Quotes from Neil:
"
It is much safer to:

    if (need to split) {
        split = bio_split(bio, ...)
        bio_chain(...)
        make_request_fn(split);
        generic_make_request(bio);
   } else
        make_request_fn(mddev, bio);

This way we first process the initial section of the bio (in 'split')
which will queue some requests to the underlying devices.  These
requests will be queued in generic_make_request.
Then we queue the remainder of the bio, which will be added to the end
of the generic_make_request queue.
Then we return.
generic_make_request() will pop the lower-level device requests off the
queue and handle them first.  Then it will process the remainder
of the original bio once the first section has been fully processed.
"

Cc: Coly Li <colyli@suse.de>
Cc: 王金浦 <jinpuwang@gmail.com>
Cc: stable@vger.kernel.org (v3.14+, only the raid10 part)
Suggested-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid1.c  | 25 +++++++++++++++++++++++--
 drivers/md/raid10.c | 18 ++++++++++++++++++
 2 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 551d654..3c5933b 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1584,9 +1584,30 @@ static void raid1_make_request(struct mddev *mddev, struct bio *bio)
 			split = bio;
 		}
 
-		if (bio_data_dir(split) == READ)
+		if (bio_data_dir(split) == READ) {
 			raid1_read_request(mddev, split);
-		else
+
+			/*
+			 * If a bio is splitted, the first part of bio will
+			 * pass barrier but the bio is queued in
+			 * current->bio_list (see generic_make_request). If
+			 * there is a raise_barrier() called here, the second
+			 * part of bio can't pass barrier. But since the first
+			 * part bio isn't dispatched to underlaying disks yet,
+			 * the barrier is never released, hence raise_barrier
+			 * will alays wait. We have a deadlock.
+			 * Note, this only happens in read path. For write
+			 * path, the first part of bio is dispatched in a
+			 * schedule() call (because of blk plug) or offloaded
+			 * to raid10d.
+			 * Quitting from the function immediately can change
+			 * the bio order queued in bio_list and avoid the deadlock.
+			 */
+			if (split != bio) {
+				generic_make_request(bio);
+				break;
+			}
+		} else
 			raid1_write_request(mddev, split);
 	} while (split != bio);
 }
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index c4db6d1..b1b1f98 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1584,7 +1584,25 @@ static void raid10_make_request(struct mddev *mddev, struct bio *bio)
 			split = bio;
 		}
 
+		/*
+		 * If a bio is splitted, the first part of bio will pass
+		 * barrier but the bio is queued in current->bio_list (see
+		 * generic_make_request). If there is a raise_barrier() called
+		 * here, the second part of bio can't pass barrier. But since
+		 * the first part bio isn't dispatched to underlaying disks
+		 * yet, the barrier is never released, hence raise_barrier will
+		 * alays wait. We have a deadlock.
+		 * Note, this only happens in read path. For write path, the
+		 * first part of bio is dispatched in a schedule() call
+		 * (because of blk plug) or offloaded to raid10d.
+		 * Quitting from the function immediately can change the bio
+		 * order queued in bio_list and avoid the deadlock.
+		 */
 		__make_request(mddev, split);
+		if (split != bio && bio_data_dir(bio) == READ) {
+			generic_make_request(bio);
+			break;
+		}
 	} while (split != bio);
 
 	/* In case raid10d snuck in to freeze_array */
-- 
2.9.3


^ permalink raw reply related

* Re: [PATCH] md: don't impose the MD_SB_DISKS limit on arrays without metadata.
From: Shaohua Li @ 2017-02-28 21:15 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid, ian_bruce
In-Reply-To: <87k28aoy5b.fsf@notabene.neil.brown.name>

On Wed, Mar 01, 2017 at 07:31:28AM +1100, Neil Brown wrote:
> 
> These arrays, created with "mdadm --build" don't benefit from a limit.
> The default will be used, which is '0' and is interpreted as "don't
> impose a limit".
> 
> Reported-by: ian_bruce@mail.ru
> Signed-off-by: NeilBrown <neilb@suse.com>

applied, thanks!

> ---
>  drivers/md/md.c | 3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
> 
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index ba485dcf1064..e0ac7f5a8e68 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -6464,9 +6464,8 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
>  	mddev->layout        = info->layout;
>  	mddev->chunk_sectors = info->chunk_size >> 9;
>  
> -	mddev->max_disks     = MD_SB_DISKS;
> -
>  	if (mddev->persistent) {
> +		mddev->max_disks     = MD_SB_DISKS;
>  		mddev->flags         = 0;
>  		mddev->sb_flags         = 0;
>  	}
> -- 
> 2.11.0
> 



^ permalink raw reply

* Re: GRUB warning after replacing disk drive in RAID1
From: Reindl Harald @ 2017-02-28 22:34 UTC (permalink / raw)
  To: linux-raid
In-Reply-To: <008001d29205$e4f503c0$aedf0b40$@wnsdev.com>



Am 28.02.2017 um 22:01 schrieb Peter Sangas:
> But I issue the grub command AFTER the re-sync is completed

output of "cat /proc/mdstat" and your environment missing!

* cat /proc/mdstat
* df -hT
* lsscsi
* lsblk

no pictures and interpretations, just copy&paste from the terminal 
(input as well as output)

please help others to help you

> -----Original Message-----
> From: Reindl Harald [mailto:h.reindl@thelounge.net]
> Sent: Tuesday, February 28, 2017 1:23 AM
> To: linux-raid@vger.kernel.org
> Subject: Re: GRUB warning after replacing disk drive in RAID1
>
> Am 28.02.2017 um 00:37 schrieb Peter Sangas:
>> I have a RAID1 with 3 disks sda,sdb,sdc.  After replacing sdc and
>> re-syncing it to the array I issued the following command to load grub
>> but I get this
>> warning:
>>
>> grub-install /dev/sdc
>>
>> Installing for i386-pc platform.
>> grub-install: warning: Couldn't find physical volume `(null)'. Some
>> modules may be missing from core image..
>> grub-install: warning: Couldn't find physical volume `(null)'. Some
>> modules may be missing from core image..
>> Installation finished. No error reported.
>>
>> Does anyone know why I get this warning and how to avoid it
>
> it's harmless and disappears after the resync finished

^ permalink raw reply

* RE: GRUB warning after replacing disk drive in RAID1
From: Peter Sangas @ 2017-02-28 23:15 UTC (permalink / raw)
  To: 'Reindl Harald', linux-raid
In-Reply-To: <59c85e04-2f6a-1c81-ccba-fa8aac1c50ee@thelounge.net>

Thanks for your help.  See below for output

-----Original Message-----
From: Reindl Harald [mailto:h.reindl@thelounge.net] 
Sent: Tuesday, February 28, 2017 2:34 PM
To: linux-raid@vger.kernel.org
Subject: Re: GRUB warning after replacing disk drive in RAID1



Am 28.02.2017 um 22:01 schrieb Peter Sangas:
> But I issue the grub command AFTER the re-sync is completed

>>>output of "cat /proc/mdstat" and your environment missing!

>>>* cat /proc/mdstat
>>>* df -hT
>>>* lsscsi
>>>* lsblk

>>>no pictures and interpretations, just copy&paste from the terminal (input
as well as output)

>>>please help others to help you


cat /proc/mdstat
Personalities : [raid1] [linear] [multipath] [raid0] [raid6] [raid5] [raid4]
[raid10] 
md3 : active raid1 sdc5[3] sdb5[1] sda5[0]
      97589248 blocks super 1.2 [3/3] [UUU]
      
md1 : active raid1 sdc2[3] sdb2[1] sda2[0]
      126887936 blocks super 1.2 [3/3] [UUU]
      bitmap: 0/1 pages [0KB], 65536KB chunk

md5 : active raid1 sdc7[3] sdb7[1] sda7[0]
      244169728 blocks super 1.2 [3/3] [UUU]
      bitmap: 0/2 pages [0KB], 65536KB chunk

md2 : active raid1 sdc3[3] sdb3[1] sda3[0]
      195181568 blocks super 1.2 [3/3] [UUU]
      bitmap: 1/2 pages [4KB], 65536KB chunk

md4 : active raid1 sdc6[3] sdb6[1] sda6[0]
      97589248 blocks super 1.2 [3/3] [UUU]
      
md0 : active raid1 sdc1[3] sdb1[1] sda1[0]
      19514368 blocks super 1.2 [3/3] [UUU]
      
unused devices: <none>

uname -a 
Linux green 4.4.0-47-generic #68-Ubuntu SMP Wed Oct 26 19:39:52 UTC 2016
x86_64 x86_64 x86_64 GNU/Linux

df -hT
Filesystem     Type      Size  Used Avail Use% Mounted on
udev           devtmpfs   63G     0   63G   0% /dev
tmpfs          tmpfs      13G  746M   12G   6% /run
/dev/md2       ext4      184G   31G  144G  18% /
tmpfs          tmpfs      63G     0   63G   0% /dev/shm
tmpfs          tmpfs     5.0M     0  5.0M   0% /run/lock
tmpfs          tmpfs      63G     0   63G   0% /sys/fs/cgroup
/dev/md0       ext4       19G  289M   17G   2% /boot
/dev/md3       ext4       92G   40G   48G  46% /cl
/dev/md5       ext4      230G   31G  187G  15% /sd
/dev/md4       ext4       92G   20G   68G  22% /pc
tan:/clbck     nfs4      596G  169G  398G  30% /clbck
tan:/sdbck     nfs4      596G  169G  398G  30% /sdbck
tmpfs          tmpfs      13G  4.0K   13G   1% /run/user/275
/dev/sde1      ext3      2.7T  676G  1.9T  26% /archive
tmpfs          tmpfs      13G  4.0K   13G   1% /run/user/286
/dev/sdd1      ext3      1.8T  1.6T  182G  90% /backupdisk
tmpfs          tmpfs      13G   12K   13G   1% /run/user/277
tmpfs          tmpfs      13G     0   13G   0% /run/user/283
tmpfs          tmpfs      13G  4.0K   13G   1% /run/user/280
tmpfs          tmpfs      13G  4.0K   13G   1% /run/user/285
tmpfs          tmpfs      13G     0   13G   0% /run/user/299
tmpfs          tmpfs      13G     0   13G   0% /run/user/1100
tmpfs          tmpfs      13G     0   13G   0% /run/user/1685


lsscsi
[2:0:0:0]    disk    ATA      WDC WD30EZRS-00J 0A80  /dev/sde 
[3:0:0:0]    disk    ATA      WDC WD2000FYYZ-0 1K03  /dev/sdd 
[4:0:0:0]    disk    ATA      INTEL SSDSC2BX80 0140  /dev/sda 
[5:0:0:0]    disk    ATA      INTEL SSDSC2BX80 0140  /dev/sdb 
[6:0:0:0]    disk    ATA      INTEL SSDSC2BX80 0140  /dev/sdc

lsblk
NAME    MAJ:MIN RM   SIZE RO TYPE  MOUNTPOINT
sda       8:0    0 745.2G  0 disk  
+-sda1    8:1    0  18.6G  0 part  
│ L-md0   9:0    0  18.6G  0 raid1 /boot
+-sda2    8:2    0 121.1G  0 part  
│ L-md1   9:1    0   121G  0 raid1 [SWAP]
+-sda3    8:3    0 186.3G  0 part  
│ L-md2   9:2    0 186.1G  0 raid1 /
+-sda4    8:4    0     1K  0 part  
+-sda5    8:5    0  93.1G  0 part  
│ L-md3   9:3    0  93.1G  0 raid1 /cl
+-sda6    8:6    0  93.1G  0 part  
│ L-md4   9:4    0  93.1G  0 raid1 /pc
L-sda7    8:7    0   233G  0 part  
  L-md5   9:5    0 232.9G  0 raid1 /sd
sdb       8:16   0 745.2G  0 disk  
+-sdb1    8:17   0  18.6G  0 part  
│ L-md0   9:0    0  18.6G  0 raid1 /boot
+-sdb2    8:18   0 121.1G  0 part  
│ L-md1   9:1    0   121G  0 raid1 [SWAP]
+-sdb3    8:19   0 186.3G  0 part  
│ L-md2   9:2    0 186.1G  0 raid1 /
+-sdb4    8:20   0     1K  0 part  
+-sdb5    8:21   0  93.1G  0 part  
│ L-md3   9:3    0  93.1G  0 raid1 /cl
+-sdb6    8:22   0  93.1G  0 part  
│ L-md4   9:4    0  93.1G  0 raid1 /pc
L-sdb7    8:23   0   233G  0 part  
  L-md5   9:5    0 232.9G  0 raid1 /sd
sdc       8:32   0 745.2G  0 disk  
+-sdc1    8:33   0  18.6G  0 part  
│ L-md0   9:0    0  18.6G  0 raid1 /boot
+-sdc2    8:34   0 121.1G  0 part  
│ L-md1   9:1    0   121G  0 raid1 [SWAP]
+-sdc3    8:35   0 186.3G  0 part  
│ L-md2   9:2    0 186.1G  0 raid1 /
+-sdc4    8:36   0     1K  0 part  
+-sdc5    8:37   0  93.1G  0 part  
│ L-md3   9:3    0  93.1G  0 raid1 /cl
+-sdc6    8:38   0  93.1G  0 part  
│ L-md4   9:4    0  93.1G  0 raid1 /pc
L-sdc7    8:39   0   233G  0 part  
  L-md5   9:5    0 232.9G  0 raid1 /sd
sdd       8:48   0   1.8T  0 disk  
L-sdd1    8:49   0   1.8T  0 part  /backupdisk
sde       8:64   0   2.7T  0 disk  
L-sde1    8:65   0   2.7T  0 part  /archive

> -----Original Message-----
> From: Reindl Harald [mailto:h.reindl@thelounge.net]
> Sent: Tuesday, February 28, 2017 1:23 AM
> To: linux-raid@vger.kernel.org
> Subject: Re: GRUB warning after replacing disk drive in RAID1
>
> Am 28.02.2017 um 00:37 schrieb Peter Sangas:
>> I have a RAID1 with 3 disks sda,sdb,sdc.  After replacing sdc and 
>> re-syncing it to the array I issued the following command to load 
>> grub but I get this
>> warning:
>>
>> grub-install /dev/sdc
>>
>> Installing for i386-pc platform.
>> grub-install: warning: Couldn't find physical volume `(null)'. Some 
>> modules may be missing from core image..
>> grub-install: warning: Couldn't find physical volume `(null)'. Some 
>> modules may be missing from core image..
>> Installation finished. No error reported.
>>
>> Does anyone know why I get this warning and how to avoid it
>
> it's harmless and disappears after the resync finished
--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in the
body of a message to majordomo@vger.kernel.org More majordomo info at
http://vger.kernel.org/majordomo-info.html


^ permalink raw reply

* Re: [PATCH v2 04/13] md: prepare for managing resync I/O pages in clean way
From: Shaohua Li @ 2017-02-28 23:30 UTC (permalink / raw)
  To: Ming Lei; +Cc: Jens Axboe, linux-raid, linux-block, Christoph Hellwig
In-Reply-To: <1488296503-4987-5-git-send-email-tom.leiming@gmail.com>

On Tue, Feb 28, 2017 at 11:41:34PM +0800, Ming Lei wrote:
> Now resync I/O use bio's bec table to manage pages,
> this way is very hacky, and may not work any more
> once multipage bvec is introduced.
> 
> So introduce helpers and new data structure for
> managing resync I/O pages more cleanly.
> 
> Signed-off-by: Ming Lei <tom.leiming@gmail.com>
> ---
>  drivers/md/md.h | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 54 insertions(+)
> 
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index 1d63239a1be4..b5a638d85cb4 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -720,4 +720,58 @@ static inline void mddev_check_writesame(struct mddev *mddev, struct bio *bio)
>  #define RESYNC_BLOCK_SIZE (64*1024)
>  #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
>  
> +/* for managing resync I/O pages */
> +struct resync_pages {
> +	unsigned	idx;	/* for get/put page from the pool */
> +	void		*raid_bio;
> +	struct page	*pages[RESYNC_PAGES];
> +};

I'd like this to be embedded into r1bio directly. Not sure if we really need a
structure.

> +
> +static inline int resync_alloc_pages(struct resync_pages *rp,
> +				     gfp_t gfp_flags)
> +{
> +	int i;
> +
> +	for (i = 0; i < RESYNC_PAGES; i++) {
> +		rp->pages[i] = alloc_page(gfp_flags);
> +		if (!rp->pages[i])
> +			goto out_free;
> +	}
> +
> +	return 0;
> +
> + out_free:
> +	while (--i >= 0)
> +		__free_page(rp->pages[i]);
> +	return -ENOMEM;
> +}
> +
> +static inline void resync_free_pages(struct resync_pages *rp)
> +{
> +	int i;
> +
> +	for (i = 0; i < RESYNC_PAGES; i++)
> +		__free_page(rp->pages[i]);

Since we will use get_page, shouldn't this be put_page?

> +}
> +
> +static inline void resync_get_all_pages(struct resync_pages *rp)
> +{
> +	int i;
> +
> +	for (i = 0; i < RESYNC_PAGES; i++)
> +		get_page(rp->pages[i]);
> +}
> +
> +static inline struct page *resync_fetch_page(struct resync_pages *rp)
> +{
> +	if (WARN_ON_ONCE(rp->idx >= RESYNC_PAGES))
> +		return NULL;
> +	return rp->pages[rp->idx++];

I'd like the caller explicitly specify the index instead of a global variable
to track it, which will make the code more understandable and less error prone.

> +}
> +
> +static inline bool resync_page_available(struct resync_pages *rp)
> +{
> +	return rp->idx < RESYNC_PAGES;
> +}

Then we don't need this obscure API.

Thanks,
Shaohua

^ permalink raw reply

* Re: [PATCH v2 05/13] md: raid1: simplify r1buf_pool_free()
From: Shaohua Li @ 2017-02-28 23:31 UTC (permalink / raw)
  To: Ming Lei; +Cc: Jens Axboe, linux-raid, linux-block, Christoph Hellwig
In-Reply-To: <1488296503-4987-6-git-send-email-tom.leiming@gmail.com>

On Tue, Feb 28, 2017 at 11:41:35PM +0800, Ming Lei wrote:
> This patch gets each page's reference of each bio for resync,
> then r1buf_pool_free() gets simplified a lot.
> 
> The same policy has been taken in raid10's buf pool allocation/free
> too.

We are going to delete the code, this simplify isn't really required.

> 
> Signed-off-by: Ming Lei <tom.leiming@gmail.com>
> ---
>  drivers/md/raid1.c | 15 +++++++--------
>  1 file changed, 7 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index 25c9172db639..c442b4657e2f 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -139,9 +139,12 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
>  	/* If not user-requests, copy the page pointers to all bios */
>  	if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
>  		for (i=0; i<RESYNC_PAGES ; i++)
> -			for (j=1; j<pi->raid_disks; j++)
> -				r1_bio->bios[j]->bi_io_vec[i].bv_page =
> +			for (j=1; j<pi->raid_disks; j++) {
> +				struct page *page =
>  					r1_bio->bios[0]->bi_io_vec[i].bv_page;
> +				get_page(page);
> +				r1_bio->bios[j]->bi_io_vec[i].bv_page = page;
> +			}
>  	}
>  
>  	r1_bio->master_bio = NULL;
> @@ -166,12 +169,8 @@ static void r1buf_pool_free(void *__r1_bio, void *data)
>  	struct r1bio *r1bio = __r1_bio;
>  
>  	for (i = 0; i < RESYNC_PAGES; i++)
> -		for (j = pi->raid_disks; j-- ;) {
> -			if (j == 0 ||
> -			    r1bio->bios[j]->bi_io_vec[i].bv_page !=
> -			    r1bio->bios[0]->bi_io_vec[i].bv_page)
> -				safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
> -		}
> +		for (j = pi->raid_disks; j-- ;)
> +			safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
>  	for (i=0 ; i < pi->raid_disks; i++)
>  		bio_put(r1bio->bios[i]);
>  
> -- 
> 2.7.4
> 

^ permalink raw reply

* Re: [PATCH v2 06/13] md: raid1: don't use bio's vec table to manage resync pages
From: Shaohua Li @ 2017-02-28 23:37 UTC (permalink / raw)
  To: Ming Lei; +Cc: Jens Axboe, linux-raid, linux-block, Christoph Hellwig
In-Reply-To: <1488296503-4987-7-git-send-email-tom.leiming@gmail.com>

On Tue, Feb 28, 2017 at 11:41:36PM +0800, Ming Lei wrote:
> Now we allocate one page array for managing resync pages, instead
> of using bio's vec table to do that, and the old way is very hacky
> and won't work any more if multipage bvec is enabled.
> 
> The introduced cost is that we need to allocate (128 + 16) * raid_disks
> bytes per r1_bio, and it is fine because the inflight r1_bio for
> resync shouldn't be much, as pointed by Shaohua.
> 
> Also the bio_reset() in raid1_sync_request() is removed because
> all bios are freshly new now and not necessary to reset any more.
> 
> This patch can be thought as a cleanup too
> 
> Suggested-by: Shaohua Li <shli@kernel.org>
> Signed-off-by: Ming Lei <tom.leiming@gmail.com>
> ---
>  drivers/md/raid1.c | 83 ++++++++++++++++++++++++++++++++++--------------------
>  1 file changed, 53 insertions(+), 30 deletions(-)
> 
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index c442b4657e2f..900144f39630 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -77,6 +77,16 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
>  #define raid1_log(md, fmt, args...)				\
>  	do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0)
>  
> +static inline struct resync_pages *get_resync_pages(struct bio *bio)
> +{
> +	return bio->bi_private;
> +}
> +
> +static inline struct r1bio *get_resync_r1bio(struct bio *bio)
> +{
> +	return get_resync_pages(bio)->raid_bio;
> +}

This is a weird between bio, r1bio and the resync_pages. I'd like the pages are
embedded in r1bio. Maybe a pointer of r1bio to the pages. It's cleaner and more
straightforward.

I think the patch 6, 7 8 should be in a same patch. Otherwise bisect will be broken.

Thanks,
Shaohua

^ permalink raw reply

* Re: [PATCH v2 08/13] md: raid1: use bio helper in process_checks()
From: Shaohua Li @ 2017-02-28 23:39 UTC (permalink / raw)
  To: Ming Lei; +Cc: Jens Axboe, linux-raid, linux-block, Christoph Hellwig
In-Reply-To: <1488296503-4987-9-git-send-email-tom.leiming@gmail.com>

On Tue, Feb 28, 2017 at 11:41:38PM +0800, Ming Lei wrote:
> Avoid to direct access to bvec table.
> 
> Signed-off-by: Ming Lei <tom.leiming@gmail.com>
> ---
>  drivers/md/raid1.c | 12 ++++++++----
>  1 file changed, 8 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index d0cb5c026506..316bd6dd6cc1 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -2108,6 +2108,7 @@ static void process_checks(struct r1bio *r1_bio)
>  		int j;
>  		int size;
>  		int error;
> +		struct bio_vec *bi;
>  		struct bio *b = r1_bio->bios[i];
>  		struct resync_pages *rp = get_resync_pages(b);
>  		if (b->bi_end_io != end_sync_read)
> @@ -2126,9 +2127,7 @@ static void process_checks(struct r1bio *r1_bio)
>  		b->bi_private = rp;
>  
>  		size = b->bi_iter.bi_size;
> -		for (j = 0; j < vcnt ; j++) {
> -			struct bio_vec *bi;
> -			bi = &b->bi_io_vec[j];
> +		bio_for_each_segment_all(bi, b, j) {
>  			bi->bv_offset = 0;
>  			if (size > PAGE_SIZE)
>  				bi->bv_len = PAGE_SIZE;
> @@ -2152,17 +2151,22 @@ static void process_checks(struct r1bio *r1_bio)
>  		int error = sbio->bi_error;
>  		struct page **ppages = get_resync_pages(pbio)->pages;
>  		struct page **spages = get_resync_pages(sbio)->pages;
> +		struct bio_vec *bi;
> +		int page_len[RESYNC_PAGES];
>  
>  		if (sbio->bi_end_io != end_sync_read)
>  			continue;
>  		/* Now we can 'fixup' the error value */
>  		sbio->bi_error = 0;
>  
> +		bio_for_each_segment_all(bi, sbio, j)
> +			page_len[j] = bi->bv_len;
> +

If we record length for each page in resync_pages (don't need offset, because
it should be 0), we don't need to access the bio_vec too.

Thanks,
Shaohua

^ permalink raw reply

* Re: [PATCH v2 09/13] md: raid1: use bio_segments_all()
From: Shaohua Li @ 2017-02-28 23:42 UTC (permalink / raw)
  To: Ming Lei; +Cc: Jens Axboe, linux-raid, linux-block, Christoph Hellwig
In-Reply-To: <1488296503-4987-10-git-send-email-tom.leiming@gmail.com>

On Tue, Feb 28, 2017 at 11:41:39PM +0800, Ming Lei wrote:
> Use this helper, instead of direct access to .bi_vcnt.

what We really need to do for the behind IO is:
- allocate memory and copy bio data to the memory
- let behind bio do IO against the memory

The behind bio doesn't need to have the exactly same bio_vec setting. If we
just track the new memory, we don't need use the bio_segments_all and access
bio_vec too.

Thanks,
Shaohua
 
> Signed-off-by: Ming Lei <tom.leiming@gmail.com>
> ---
>  drivers/md/raid1.c | 7 ++++---
>  1 file changed, 4 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index 316bd6dd6cc1..7396c99ff7b1 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -1091,7 +1091,8 @@ static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio)
>  {
>  	int i;
>  	struct bio_vec *bvec;
> -	struct bio_vec *bvecs = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec),
> +	unsigned vcnt = bio_segments_all(bio);
> +	struct bio_vec *bvecs = kzalloc(vcnt * sizeof(struct bio_vec),
>  					GFP_NOIO);
>  	if (unlikely(!bvecs))
>  		return;
> @@ -1107,12 +1108,12 @@ static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio)
>  		kunmap(bvec->bv_page);
>  	}
>  	r1_bio->behind_bvecs = bvecs;
> -	r1_bio->behind_page_count = bio->bi_vcnt;
> +	r1_bio->behind_page_count = vcnt;
>  	set_bit(R1BIO_BehindIO, &r1_bio->state);
>  	return;
>  
>  do_sync_io:
> -	for (i = 0; i < bio->bi_vcnt; i++)
> +	for (i = 0; i < vcnt; i++)
>  		if (bvecs[i].bv_page)
>  			put_page(bvecs[i].bv_page);
>  	kfree(bvecs);
> -- 
> 2.7.4
> 

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox