Linux RAID subsystem development

Linux RAID subsystem development
 help / color / mirror / Atom feed

* [PATCH 03/14] md-cluster: remove useless memset from gather_all_resync_info
From: Guoqing Jiang @ 2017-02-24  3:15 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, neilb, Guoqing Jiang
In-Reply-To: <1487906124-20107-1-git-send-email-gqjiang@suse.com>

This memset is not needed.  The lvb is already zeroed because
it was recently allocated by lockres_init, which uses kzalloc(),
and read_resync_info() doesn't need it to be zero anyway.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
---
 drivers/md/md-cluster.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 84c001063983..620828e56dc8 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -776,7 +776,6 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots)
 		bm_lockres->flags |= DLM_LKF_NOQUEUE;
 		ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
 		if (ret == -EAGAIN) {
-			memset(bm_lockres->lksb.sb_lvbptr, '\0', LVB_SIZE);
 			s = read_resync_info(mddev, bm_lockres);
 			if (s) {
 				pr_info("%s:%d Resync[%llu..%llu] in progress on %d\n",
-- 
2.6.2


^ permalink raw reply related

* [PATCH 04/14] md-cluster: add mddev into struct md_cluster_info
From: Guoqing Jiang @ 2017-02-24  3:15 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, neilb, Guoqing Jiang
In-Reply-To: <1487906124-20107-1-git-send-email-gqjiang@suse.com>

Store "struct mddev *mddev" in md_cluster_info,
then we can get mddev inside lock_token in next
patch.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
---
 drivers/md/md-cluster.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 620828e56dc8..1ec89273ff99 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -69,6 +69,7 @@ struct resync_info {
 
 
 struct md_cluster_info {
+	struct mddev *mddev; /* the md device which md_cluster_info belongs to */
 	/* dlm lock space and resources for clustered raid. */
 	dlm_lockspace_t *lockspace;
 	int slot_number;
@@ -833,6 +834,7 @@ static int join(struct mddev *mddev, int nodes)
 	mutex_init(&cinfo->recv_mutex);
 
 	mddev->cluster_info = cinfo;
+	cinfo->mddev = mddev;
 
 	memset(str, 0, 64);
 	sprintf(str, "%pU", mddev->uuid);
-- 
2.6.2


^ permalink raw reply related

* [PATCH 05/14] md-cluster: add new parameter for lock_token
From: Guoqing Jiang @ 2017-02-24  3:15 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, neilb, Guoqing Jiang
In-Reply-To: <1487906124-20107-1-git-send-email-gqjiang@suse.com>

lock_token is called from either lock_comm or
metadata_update_start, if we look back more for
the call chain, some of them are called with the
reconfig_mutex held while a few of them don't.

Specifically, resync_info_update is mostly called
without the protection of reconfig_mutex. But
resync_finish calls resync_info_update within
the mutex.

We make the change to lock_token since we need
to use synchronous way to handle METADATA_UPDATED
message in latter patch.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
---
 drivers/md/md-cluster.c | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 1ec89273ff99..a6cf02c5c7bc 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -646,7 +646,7 @@ static void recv_daemon(struct md_thread *thread)
  * Takes the lock on the TOKEN lock resource so no other
  * node can communicate while the operation is underway.
  */
-static int lock_token(struct md_cluster_info *cinfo)
+static int lock_token(struct md_cluster_info *cinfo, bool mddev_locked)
 {
 	int error;
 
@@ -663,12 +663,12 @@ static int lock_token(struct md_cluster_info *cinfo)
 /* lock_comm()
  * Sets the MD_CLUSTER_SEND_LOCK bit to lock the send channel.
  */
-static int lock_comm(struct md_cluster_info *cinfo)
+static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked)
 {
 	wait_event(cinfo->wait,
 		   !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state));
 
-	return lock_token(cinfo);
+	return lock_token(cinfo, mddev_locked);
 }
 
 static void unlock_comm(struct md_cluster_info *cinfo)
@@ -743,11 +743,12 @@ static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
 	return error;
 }
 
-static int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
+static int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg,
+		   bool mddev_locked)
 {
 	int ret;
 
-	lock_comm(cinfo);
+	lock_comm(cinfo, mddev_locked);
 	ret = __sendmsg(cinfo, cmsg);
 	unlock_comm(cinfo);
 	return ret;
@@ -944,7 +945,7 @@ static void resync_bitmap(struct mddev *mddev)
 	int err;
 
 	cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC);
-	err = sendmsg(cinfo, &cmsg);
+	err = sendmsg(cinfo, &cmsg, 1);
 	if (err)
 		pr_err("%s:%d: failed to send BITMAP_NEEDS_SYNC message (%d)\n",
 			__func__, __LINE__, err);
@@ -1007,7 +1008,7 @@ static int metadata_update_start(struct mddev *mddev)
 	if (cinfo->token_lockres->mode == DLM_LOCK_EX)
 		return 0;
 
-	return lock_token(cinfo);
+	return lock_token(cinfo, 1);
 }
 
 static int metadata_update_finish(struct mddev *mddev)
@@ -1070,7 +1071,14 @@ static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
 	cmsg.low = cpu_to_le64(lo);
 	cmsg.high = cpu_to_le64(hi);
 
-	return sendmsg(cinfo, &cmsg);
+	/*
+	 * mddev_lock is held if resync_info_update is called from
+	 * resync_finish (md_reap_sync_thread -> resync_finish)
+	 */
+	if (lo == 0 && hi == 0)
+		return sendmsg(cinfo, &cmsg, 1);
+	else
+		return sendmsg(cinfo, &cmsg, 0);
 }
 
 static int resync_finish(struct mddev *mddev)
@@ -1120,7 +1128,7 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev)
 	cmsg.type = cpu_to_le32(NEWDISK);
 	memcpy(cmsg.uuid, uuid, 16);
 	cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
-	lock_comm(cinfo);
+	lock_comm(cinfo, 1);
 	ret = __sendmsg(cinfo, &cmsg);
 	if (ret)
 		return ret;
@@ -1180,7 +1188,7 @@ static int remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 	struct md_cluster_info *cinfo = mddev->cluster_info;
 	cmsg.type = cpu_to_le32(REMOVE);
 	cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
-	return sendmsg(cinfo, &cmsg);
+	return sendmsg(cinfo, &cmsg, 1);
 }
 
 static int lock_all_bitmaps(struct mddev *mddev)
@@ -1244,7 +1252,7 @@ static int gather_bitmaps(struct md_rdev *rdev)
 
 	cmsg.type = cpu_to_le32(RE_ADD);
 	cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
-	err = sendmsg(cinfo, &cmsg);
+	err = sendmsg(cinfo, &cmsg, 1);
 	if (err)
 		goto out;
 
-- 
2.6.2


^ permalink raw reply related

* [PATCH 07/14] md: move bitmap_destroy before __md_stop
From: Guoqing Jiang @ 2017-02-24  3:15 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, neilb, Guoqing Jiang
In-Reply-To: <1487906124-20107-1-git-send-email-gqjiang@suse.com>

Since we have switched to sync way to handle METADATA_UPDATED
msg for md-cluster, then process_metadata_update is depended
on mddev->thread->wqueue.

With the new change, clustered raid could possible hang if
array received a METADATA_UPDATED msg after array unregistered
mddev->thread, so we need to stop clustered raid earlier
than before.

And this change should be safe for non-clustered raid since
all writes are stopped before the destroy. Also in md_run,
we activate the personality (pers->run()) before activating
the bitmap (bitmap_create()). So it is pleasingly symmetric
to stop the bitmap (bitmap_destroy()) before stopping the
personality (__md_stop() calls pers->free()).

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
---
 drivers/md/md.c | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 6e910d99c9c1..7bcc757386e1 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5582,8 +5582,8 @@ void md_stop(struct mddev *mddev)
 	/* stop the array and free an attached data structures.
 	 * This is called from dm-raid
 	 */
-	__md_stop(mddev);
 	bitmap_destroy(mddev);
+	__md_stop(mddev);
 	if (mddev->bio_set)
 		bioset_free(mddev->bio_set);
 }
@@ -5696,6 +5696,20 @@ static int do_md_stop(struct mddev *mddev, int mode,
 			set_disk_ro(disk, 0);
 
 		__md_stop_writes(mddev);
+
+		/*
+		 * Destroy bitmap after all writes are stopped
+		 */
+		bitmap_destroy(mddev);
+		if (mddev->bitmap_info.file) {
+			struct file *f = mddev->bitmap_info.file;
+			spin_lock(&mddev->lock);
+			mddev->bitmap_info.file = NULL;
+			spin_unlock(&mddev->lock);
+			fput(f);
+		}
+		mddev->bitmap_info.offset = 0;
+
 		__md_stop(mddev);
 		mddev->queue->backing_dev_info.congested_fn = NULL;
 
@@ -5720,19 +5734,7 @@ static int do_md_stop(struct mddev *mddev, int mode,
 	 */
 	if (mode == 0) {
 		pr_info("md: %s stopped.\n", mdname(mddev));
-
-		bitmap_destroy(mddev);
-		if (mddev->bitmap_info.file) {
-			struct file *f = mddev->bitmap_info.file;
-			spin_lock(&mddev->lock);
-			mddev->bitmap_info.file = NULL;
-			spin_unlock(&mddev->lock);
-			fput(f);
-		}
-		mddev->bitmap_info.offset = 0;
-
 		export_array(mddev);
-
 		md_clean(mddev);
 		if (mddev->hold_active == UNTIL_STOP)
 			mddev->hold_active = 0;
-- 
2.6.2


^ permalink raw reply related

* [PATCH 08/14] md-cluster: set MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD before unregister thread
From: Guoqing Jiang @ 2017-02-24  3:15 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, neilb, Guoqing Jiang
In-Reply-To: <1487906124-20107-1-git-send-email-gqjiang@suse.com>

After used sync way to handle METADATA_UPDATED msg, a deadlock
could appear if stop a resyncing array.

betalinux244:~ # ps aux|grep md|grep D
root     17164  0.0  0.0      0     0 ?        D    Jan09   0:00 [md0_cluster_rec]
root     18151  0.0  0.1  19852  2008 ?        Ds   Jan09   0:00 /sbin/mdadm -Ssq
betalinux244:~ # cat /proc/17164/stack
[<ffffffffa06a7395>] recv_daemon+0x1f5/0x590 [md_cluster]
[<ffffffffa067be20>] md_thread+0x130/0x150 [md_mod]
[<ffffffff810995ed>] kthread+0xbd/0xe0
[<ffffffff815e96bf>] ret_from_fork+0x3f/0x70
[<ffffffff81099530>] kthread+0x0/0xe0
[<ffffffffffffffff>] 0xffffffffffffffff
betalinux244:~ # cat /proc/18151/stack
[<ffffffff81099879>] kthread_stop+0x59/0x130
[<ffffffffa067c566>] md_unregister_thread+0x46/0x80 [md_mod]
[<ffffffffa06a6e71>] leave+0x81/0x120 [md_cluster]
[<ffffffffa0684e94>] md_cluster_stop+0x14/0x30 [md_mod]
[<ffffffffa06858b6>] bitmap_free+0x126/0x130 [md_mod]
[<ffffffffa0682d06>] do_md_stop+0x356/0x5f0 [md_mod]
[<ffffffffa0683cbe>] md_ioctl+0x6fe/0x1680 [md_mod]
[<ffffffff812ed158>] blkdev_ioctl+0x258/0x920
[<ffffffff8122f81d>] block_ioctl+0x3d/0x40
[<ffffffff8120ac0d>] do_vfs_ioctl+0x2cd/0x4a0
[<ffffffff8120ae54>] SyS_ioctl+0x74/0x80
[<ffffffff815e936e>] entry_SYSCALL_64_fastpath+0x12/0x6d
[<ffffffffffffffff>] 0xffffffffffffffff

Since md_unregister_thread(&cinfo->recv_thread) is blocked by
recv_daemon -> process_recvd_msg -> process_metadata_update.
To resolve the issue, MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD is
also need to be set before unregister thread.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
---
 drivers/md/md-cluster.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 0aad477d1b20..5e2c54be6f30 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -932,6 +932,7 @@ static int join(struct mddev *mddev, int nodes)
 
 	return 0;
 err:
+	set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
 	md_unregister_thread(&cinfo->recovery_thread);
 	md_unregister_thread(&cinfo->recv_thread);
 	lockres_free(cinfo->message_lockres);
@@ -987,6 +988,7 @@ static int leave(struct mddev *mddev)
 	if (cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector)
 		resync_bitmap(mddev);
 
+	set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
 	md_unregister_thread(&cinfo->recovery_thread);
 	md_unregister_thread(&cinfo->recv_thread);
 	lockres_free(cinfo->message_lockres);
-- 
2.6.2


^ permalink raw reply related

* [PATCH 09/14] md-cluster: set MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD in metadata_update_start
From: Guoqing Jiang @ 2017-02-24  3:15 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, neilb, Guoqing Jiang
In-Reply-To: <1487906124-20107-1-git-send-email-gqjiang@suse.com>

Since we have switched to sync way to handle METADATA_UPDATED
msg, then process_metadata_update can only continue with either
get the reconfig_mutex or MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD is
set.

However, there is a deadlock issue which happened as follows:

1. Node A sends METADATA_UPDATED msg (held Token lock).
2. Node B wants to do resync, and is blocked since it can't
   get Token lock, but MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD is
   not set since the callchain
   (md_do_sync -> sync_request
               -> resync_info_update
	       -> sendmsg
	       -> lock_comm -> lock_token)
   doesn't hold reconfig_mutex.
3. Node B trys to update sb (held reconfig_mutex), but stopped
   at wait_event() in metadata_update_start since we have set
   MD_CLUSTER_SEND_LOCK flag in lock_comm (step 2).
4. Then Node B receives METADATA_UPDATED msg from A, of course
   recv_daemon is blocked forever.

The followings are more detailed infos.
betalinux240:~ # cat /proc/mdstat
Personalities : [raid1]
md0 : active raid1 vdc[1] vdb[0]
2095104 blocks super 1.2 [2/2] [UU]
[>....................]  resync =  4.6% (98112/2095104) finish=28.8min speed=1154K/sec
bitmap: 1/1 pages [4KB], 65536KB chunk

unused devices: <none>
betalinux240:~ # ps aux|grep md|grep D
root      4393  0.0  0.0      0     0 ?        D    11:42   0:00 [md0_raid1]
root      4402  0.0  0.0      0     0 ?        D    11:42   0:00 [md0_cluster_rec]
root      4407  0.0  0.0      0     0 ?        D    11:42   0:00 [md0_resync]
betalinux240:~ # cat /proc/4407/stack
[<ffffffffa05eb531>] dlm_lock_sync+0x81/0xc0 [md_cluster]
[<ffffffffa05eba23>] lock_token+0x23/0xa0 [md_cluster]
[<ffffffffa05ebad2>] lock_comm+0x32/0x90 [md_cluster]
[<ffffffffa05ebb45>] sendmsg+0x15/0x30 [md_cluster]
[<ffffffffa05ebd5a>] resync_info_update+0x8a/0xc0 [md_cluster]
[<ffffffffa06b6ae7>] sync_request+0xa57/0xaf0 [raid1]
[<ffffffffa069825d>] md_do_sync+0x90d/0xe70 [md_mod]
[<ffffffffa0694c50>] md_thread+0x130/0x150 [md_mod]
[<ffffffff810995ed>] kthread+0xbd/0xe0
[<ffffffff815e96bf>] ret_from_fork+0x3f/0x70
[<ffffffff81099530>] kthread+0x0/0xe0
betalinux240:~ # cat /proc/4402/stack
[<ffffffffa05ecf77>] recv_daemon+0x1d7/0x570 [md_cluster]
[<ffffffffa0694c50>] md_thread+0x130/0x150 [md_mod]
[<ffffffff810995ed>] kthread+0xbd/0xe0
[<ffffffff815e96bf>] ret_from_fork+0x3f/0x70
[<ffffffff81099530>] kthread+0x0/0xe0
betalinux240:~ # cat /proc/4393/stack
[<ffffffffa05ec0e5>] metadata_update_start+0xa5/0xb0 [md_cluster]
[<ffffffffa069913e>] md_update_sb.part.50+0x8e/0x850 [md_mod]
[<ffffffffa069ab1d>] md_check_recovery+0x21d/0x4e0 [md_mod]
[<ffffffffa06b9322>] raid1d+0x42/0x7d0 [raid1]
[<ffffffffa0694c50>] md_thread+0x130/0x150 [md_mod]
[<ffffffff810995ed>] kthread+0xbd/0xe0
[<ffffffff815e96bf>] ret_from_fork+0x3f/0x70
[<ffffffff81099530>] kthread+0x0/0xe0

Since metadata_update_start always calls lock_token with
reconfig_mutex, we need to set MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD
here as well, and lock_token don't need to set it twice unless
lock_token is invoked from lock_comm.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
---
 drivers/md/md-cluster.c | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 5e2c54be6f30..bcc269312b71 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -654,7 +654,7 @@ static void recv_daemon(struct md_thread *thread)
  */
 static int lock_token(struct md_cluster_info *cinfo, bool mddev_locked)
 {
-	int error;
+	int error, set_bit = 0;
 	struct mddev *mddev = cinfo->mddev;
 
 	/*
@@ -663,14 +663,16 @@ static int lock_token(struct md_cluster_info *cinfo, bool mddev_locked)
 	 * since another node already got EX on Token and waitting the EX of Ack),
 	 * so let resync wake up thread in case flag is set.
 	 */
-	if (mddev_locked) {
+	if (mddev_locked && !test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
+				      &cinfo->state)) {
 		error = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
 					      &cinfo->state);
 		WARN_ON_ONCE(error);
 		md_wakeup_thread(mddev->thread);
+		set_bit = 1;
 	}
 	error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
-	if (mddev_locked)
+	if (set_bit)
 		clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
 
 	if (error)
@@ -1023,16 +1025,30 @@ static int slot_number(struct mddev *mddev)
 static int metadata_update_start(struct mddev *mddev)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
+	int ret;
+
+	/*
+	 * metadata_update_start is always called with the protection of
+	 * reconfig_mutex, so set WAITING_FOR_TOKEN here.
+	 */
+	ret = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
+				    &cinfo->state);
+	WARN_ON_ONCE(ret);
+	md_wakeup_thread(mddev->thread);
 
 	wait_event(cinfo->wait,
 		   !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state) ||
 		   test_and_clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state));
 
 	/* If token is already locked, return 0 */
-	if (cinfo->token_lockres->mode == DLM_LOCK_EX)
+	if (cinfo->token_lockres->mode == DLM_LOCK_EX) {
+		clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
 		return 0;
+	}
 
-	return lock_token(cinfo, 1);
+	ret = lock_token(cinfo, 1);
+	clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
+	return ret;
 }
 
 static int metadata_update_finish(struct mddev *mddev)
-- 
2.6.2


^ permalink raw reply related

* [PATCH 10/14] md-cluster: add CHANGE_CAPACITY message type
From: Guoqing Jiang @ 2017-02-24  3:15 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, neilb, Guoqing Jiang
In-Reply-To: <1487906124-20107-1-git-send-email-gqjiang@suse.com>

The msg type CHANGE_CAPACITY is introduced to support
resize clustered raid in later patch, and it is sent
after all the nodes have the same sync_size, receiver
node just need to set new capacity once received this
msg.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
---
 drivers/md/md-cluster.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index bcc269312b71..f1b9a7a3ddd2 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -103,6 +103,7 @@ enum msg_type {
 	REMOVE,
 	RE_ADD,
 	BITMAP_NEEDS_SYNC,
+	CHANGE_CAPACITY,
 };
 
 struct cluster_msg {
@@ -578,6 +579,9 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
 	case METADATA_UPDATED:
 		process_metadata_update(mddev, msg);
 		break;
+	case CHANGE_CAPACITY:
+		set_capacity(mddev->gendisk, mddev->array_sectors);
+		break;
 	case RESYNCING:
 		process_suspend_info(mddev, le32_to_cpu(msg->slot),
 				     le64_to_cpu(msg->low),
-- 
2.6.2


^ permalink raw reply related

* [PATCH 11/14] md-cluster: introduce cluster_check_sync_size
From: Guoqing Jiang @ 2017-02-24  3:15 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, neilb, Guoqing Jiang
In-Reply-To: <1487906124-20107-1-git-send-email-gqjiang@suse.com>

Support resize is a little complex for clustered
raid, since we need to ensure all the nodes share
the same knowledge about the size of raid.

We achieve the goal by check the sync_size which
is in each node's bitmap, we can only change the
capacity after cluster_check_sync_size returns 0.

Also, get_bitmap_from_slot is added to get a slot's
bitmap. And we exported some funcs since they are
used in cluster_check_sync_size().

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
---
 drivers/md/bitmap.c     | 25 ++++++++++++++++++++-
 drivers/md/bitmap.h     |  2 ++
 drivers/md/md-cluster.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 9fb2ccac958a..67a7d399f501 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -471,6 +471,7 @@ void bitmap_update_sb(struct bitmap *bitmap)
 	kunmap_atomic(sb);
 	write_page(bitmap, bitmap->storage.sb_page, 1);
 }
+EXPORT_SYMBOL(bitmap_update_sb);
 
 /* print out the bitmap file superblock */
 void bitmap_print_sb(struct bitmap *bitmap)
@@ -1727,7 +1728,7 @@ void bitmap_flush(struct mddev *mddev)
 /*
  * free memory that was allocated
  */
-static void bitmap_free(struct bitmap *bitmap)
+void bitmap_free(struct bitmap *bitmap)
 {
 	unsigned long k, pages;
 	struct bitmap_page *bp;
@@ -1761,6 +1762,7 @@ static void bitmap_free(struct bitmap *bitmap)
 	kfree(bp);
 	kfree(bitmap);
 }
+EXPORT_SYMBOL(bitmap_free);
 
 void bitmap_destroy(struct mddev *mddev)
 {
@@ -1920,6 +1922,27 @@ int bitmap_load(struct mddev *mddev)
 }
 EXPORT_SYMBOL_GPL(bitmap_load);
 
+struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot)
+{
+	int rv = 0;
+	struct bitmap *bitmap;
+
+	bitmap = bitmap_create(mddev, slot);
+	if (IS_ERR(bitmap)) {
+		rv = PTR_ERR(bitmap);
+		return ERR_PTR(rv);
+	}
+
+	rv = bitmap_init_from_disk(bitmap, 0);
+	if (rv) {
+		bitmap_free(bitmap);
+		return ERR_PTR(rv);
+	}
+
+	return bitmap;
+}
+EXPORT_SYMBOL(get_bitmap_from_slot);
+
 /* Loads the bitmap associated with slot and copies the resync information
  * to our bitmap
  */
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index 5b6dd63dda91..9f761097aab2 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -267,8 +267,10 @@ void bitmap_daemon_work(struct mddev *mddev);
 
 int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
 		  int chunksize, int init);
+struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot);
 int bitmap_copy_from_slot(struct mddev *mddev, int slot,
 				sector_t *lo, sector_t *hi, bool clear_bits);
+void bitmap_free(struct bitmap *bitmap);
 #endif
 
 #endif
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index f1b9a7a3ddd2..d3c024e6bfcf 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -1089,6 +1089,64 @@ static void metadata_update_cancel(struct mddev *mddev)
 	unlock_comm(cinfo);
 }
 
+/*
+ * retun 0 if all the bitmaps have the same sync_size
+ */
+int cluster_check_sync_size(struct mddev *mddev)
+{
+	int i, rv;
+	bitmap_super_t *sb;
+	unsigned long my_sync_size, sync_size = 0;
+	int node_num = mddev->bitmap_info.nodes;
+	int current_slot = md_cluster_ops->slot_number(mddev);
+	struct bitmap *bitmap = mddev->bitmap;
+	char str[64];
+	struct dlm_lock_resource *bm_lockres;
+
+	sb = kmap_atomic(bitmap->storage.sb_page);
+	my_sync_size = sb->sync_size;
+	kunmap_atomic(sb);
+
+	for (i = 0; i < node_num; i++) {
+		if (i == current_slot)
+			continue;
+
+		bitmap = get_bitmap_from_slot(mddev, i);
+		if (IS_ERR(bitmap)) {
+			pr_err("can't get bitmap from slot %d\n", i);
+			return -1;
+		}
+
+		/*
+		 * If we can hold the bitmap lock of one node then
+		 * the slot is not occupied, update the sb.
+		 */
+		snprintf(str, 64, "bitmap%04d", i);
+		bm_lockres = lockres_init(mddev, str, NULL, 1);
+		if (!bm_lockres) {
+			pr_err("md-cluster: Cannot initialize %s\n", str);
+		}
+		bm_lockres->flags |= DLM_LKF_NOQUEUE;
+		rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
+		if (!rv)
+			bitmap_update_sb(bitmap);
+		lockres_free(bm_lockres);
+
+		sb = kmap_atomic(bitmap->storage.sb_page);
+		if (sync_size == 0)
+			sync_size = sb->sync_size;
+		else if (sync_size != sb->sync_size) {
+			kunmap_atomic(sb);
+			bitmap_free(bitmap);
+			return -1;
+		}
+		kunmap_atomic(sb);
+		bitmap_free(bitmap);
+	}
+
+	return (my_sync_size == sync_size) ? 0 : -1;
+}
+
 static int resync_start(struct mddev *mddev)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
-- 
2.6.2


^ permalink raw reply related

* [PATCH 12/14] md/bitmap: replace redundant codes with get_bitmap_from_slot
From: Guoqing Jiang @ 2017-02-24  3:15 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, neilb, Guoqing Jiang
In-Reply-To: <1487906124-20107-1-git-send-email-gqjiang@suse.com>

Since get_bitmap_from_slot is introduced in previous
commit, we can use it in bitmap_copy_from_slot to
remove redundant code.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
---
 drivers/md/bitmap.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 67a7d399f501..b6fa55a3cff8 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1952,14 +1952,13 @@ int bitmap_copy_from_slot(struct mddev *mddev, int slot,
 	int rv = 0, i, j;
 	sector_t block, lo = 0, hi = 0;
 	struct bitmap_counts *counts;
-	struct bitmap *bitmap = bitmap_create(mddev, slot);
-
-	if (IS_ERR(bitmap))
-		return PTR_ERR(bitmap);
+	struct bitmap *bitmap;
 
-	rv = bitmap_init_from_disk(bitmap, 0);
-	if (rv)
-		goto err;
+	bitmap = get_bitmap_from_slot(mddev, slot);
+	if (IS_ERR(bitmap)) {
+		pr_err("%s can't get bitmap from slot %d\n", __func__, slot);
+		return -1;
+	}
 
 	counts = &bitmap->counts;
 	for (j = 0; j < counts->chunks; j++) {
@@ -1986,8 +1985,7 @@ int bitmap_copy_from_slot(struct mddev *mddev, int slot,
 	bitmap_unplug(mddev->bitmap);
 	*low = lo;
 	*high = hi;
-err:
-	bitmap_free(bitmap);
+
 	return rv;
 }
 EXPORT_SYMBOL_GPL(bitmap_copy_from_slot);
-- 
2.6.2


^ permalink raw reply related

* [PATCH 13/14] md: move funcs from pers->resize to update_size
From: Guoqing Jiang @ 2017-02-24  3:15 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, neilb, Guoqing Jiang
In-Reply-To: <1487906124-20107-1-git-send-email-gqjiang@suse.com>

raid1_resize and raid5_resize should also check the
mddev->queue if run underneath dm-raid.

And both set_capacity and revalidate_disk are used in
pers->resize such as raid1, raid10 and raid5. So
move them from personality file to common code.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
---
 drivers/md/md.c     | 8 ++++++--
 drivers/md/raid1.c  | 2 --
 drivers/md/raid10.c | 4 ----
 drivers/md/raid5.c  | 2 --
 4 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 7bcc757386e1..975c9dd60c05 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6534,8 +6534,12 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
 			return -ENOSPC;
 	}
 	rv = mddev->pers->resize(mddev, num_sectors);
-	if (!rv)
-		revalidate_disk(mddev->gendisk);
+	if (!rv) {
+		if (mddev->queue) {
+			set_capacity(mddev->gendisk, mddev->array_sectors);
+			revalidate_disk(mddev->gendisk);
+		}
+	}
 	return rv;
 }
 
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d4e8796b4569..9688f81ea7e9 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -3246,8 +3246,6 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors)
 			return ret;
 	}
 	md_set_array_sectors(mddev, newsize);
-	set_capacity(mddev->gendisk, mddev->array_sectors);
-	revalidate_disk(mddev->gendisk);
 	if (sectors > mddev->dev_sectors &&
 	    mddev->recovery_cp > mddev->dev_sectors) {
 		mddev->recovery_cp = mddev->dev_sectors;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index ade7d69234d5..59533cbea446 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3943,10 +3943,6 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors)
 			return ret;
 	}
 	md_set_array_sectors(mddev, size);
-	if (mddev->queue) {
-		set_capacity(mddev->gendisk, mddev->array_sectors);
-		revalidate_disk(mddev->gendisk);
-	}
 	if (sectors > mddev->dev_sectors &&
 	    mddev->recovery_cp > oldsize) {
 		mddev->recovery_cp = oldsize;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7b7722bb2e8d..df63cd7830ce 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7603,8 +7603,6 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
 			return ret;
 	}
 	md_set_array_sectors(mddev, newsize);
-	set_capacity(mddev->gendisk, mddev->array_sectors);
-	revalidate_disk(mddev->gendisk);
 	if (sectors > mddev->dev_sectors &&
 	    mddev->recovery_cp > mddev->dev_sectors) {
 		mddev->recovery_cp = mddev->dev_sectors;
-- 
2.6.2


^ permalink raw reply related

* [PATCH 14/14] md-cluster: add the support for resize
From: Guoqing Jiang @ 2017-02-24  3:15 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, neilb, Guoqing Jiang
In-Reply-To: <1487906124-20107-1-git-send-email-gqjiang@suse.com>

To update size for cluster raid, we need to make
sure all nodes can perform the change successfully.
However, it is possible that some of them can't do
it due to failure (bitmap_resize could fail). So
we need to consider the issue before we set the
capacity unconditionally, and we use below steps
to perform sanity check.

1. A change the size, then broadcast METADATA_UPDATED
   msg.
2. B and C receive METADATA_UPDATED change the size
   excepts call set_capacity, sync_size is not update
   if the change failed. Also call bitmap_update_sb
   to sync sb to disk.
3. A checks other node's sync_size, if sync_size has
   been updated in all nodes, then send CHANGE_CAPACITY
   msg otherwise send msg to revert previous change.
4. B and C call set_capacity if receive CHANGE_CAPACITY
   msg, otherwise pers->resize will be called to restore
   the old value.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
---
 Documentation/md/md-cluster.txt |  2 +-
 drivers/md/md-cluster.c         | 75 +++++++++++++++++++++++++++++++++++++++++
 drivers/md/md-cluster.h         |  1 +
 drivers/md/md.c                 | 21 +++++++++---
 4 files changed, 93 insertions(+), 6 deletions(-)

diff --git a/Documentation/md/md-cluster.txt b/Documentation/md/md-cluster.txt
index 38883276d31c..2663d49dd8a0 100644
--- a/Documentation/md/md-cluster.txt
+++ b/Documentation/md/md-cluster.txt
@@ -321,4 +321,4 @@ The algorithm is:
 
 There are somethings which are not supported by cluster MD yet.
 
-- update size and change array_sectors.
+- change array_sectors.
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index d3c024e6bfcf..75da83187c31 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -1147,6 +1147,80 @@ int cluster_check_sync_size(struct mddev *mddev)
 	return (my_sync_size == sync_size) ? 0 : -1;
 }
 
+/*
+ * Update the size for cluster raid is a little more complex, we perform it
+ * by the steps:
+ * 1. hold token lock and update superblock in initiator node.
+ * 2. send METADATA_UPDATED msg to other nodes.
+ * 3. The initiator node continues to check each bitmap's sync_size, if all
+ *    bitmaps have the same value of sync_size, then we can set capacity and
+ *    let other nodes to perform it. If one node can't update sync_size
+ *    accordingly, we need to revert to previous value.
+ */
+static void update_size(struct mddev *mddev, sector_t old_dev_sectors)
+{
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+	struct cluster_msg cmsg;
+	struct md_rdev *rdev;
+	int ret = 0;
+	int raid_slot = -1;
+
+	md_update_sb(mddev, 1);
+	lock_comm(cinfo, 1);
+
+	memset(&cmsg, 0, sizeof(cmsg));
+	cmsg.type = cpu_to_le32(METADATA_UPDATED);
+	rdev_for_each(rdev, mddev)
+		if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags)) {
+			raid_slot = rdev->desc_nr;
+			break;
+		}
+	if (raid_slot >= 0) {
+		cmsg.raid_slot = cpu_to_le32(raid_slot);
+		/*
+		 * We can only change capiticy after all the nodes can do it,
+		 * so need to wait after other nodes already received the msg
+		 * and handled the change
+		 */
+		ret = __sendmsg(cinfo, &cmsg);
+		if (ret) {
+			pr_err("%s:%d: failed to send METADATA_UPDATED msg\n",
+			       __func__, __LINE__);
+			unlock_comm(cinfo);
+			return;
+		}
+	} else {
+		pr_err("md-cluster: No good device id found to send\n");
+		unlock_comm(cinfo);
+		return;
+	}
+
+	/*
+	 * check the sync_size from other node's bitmap, if sync_size
+	 * have already updated in other nodes as expected, send an
+	 * empty metadata msg to permit the change of capacity
+	 */
+	if (cluster_check_sync_size(mddev) == 0) {
+		memset(&cmsg, 0, sizeof(cmsg));
+		cmsg.type = cpu_to_le32(CHANGE_CAPACITY);
+		ret = __sendmsg(cinfo, &cmsg);
+		if (ret)
+			pr_err("%s:%d: failed to send CHANGE_CAPACITY msg\n",
+			       __func__, __LINE__);
+		set_capacity(mddev->gendisk, mddev->array_sectors);
+	} else {
+		/* revert to previous sectors */
+		ret = mddev->pers->resize(mddev, old_dev_sectors);
+		if (!ret)
+			revalidate_disk(mddev->gendisk);
+		ret = __sendmsg(cinfo, &cmsg);
+		if (ret)
+			pr_err("%s:%d: failed to send METADATA_UPDATED msg\n",
+			       __func__, __LINE__);
+	}
+	unlock_comm(cinfo);
+}
+
 static int resync_start(struct mddev *mddev)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
@@ -1392,6 +1466,7 @@ static struct md_cluster_operations cluster_ops = {
 	.gather_bitmaps = gather_bitmaps,
 	.lock_all_bitmaps = lock_all_bitmaps,
 	.unlock_all_bitmaps = unlock_all_bitmaps,
+	.update_size = update_size,
 };
 
 static int __init cluster_init(void)
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
index 8f26a5e80810..e939c14222ba 100644
--- a/drivers/md/md-cluster.h
+++ b/drivers/md/md-cluster.h
@@ -25,6 +25,7 @@ struct md_cluster_operations {
 	int (*gather_bitmaps)(struct md_rdev *rdev);
 	int (*lock_all_bitmaps)(struct mddev *mddev);
 	void (*unlock_all_bitmaps)(struct mddev *mddev);
+	void (*update_size)(struct mddev *mddev, sector_t old_dev_sectors);
 };
 
 #endif /* _MD_CLUSTER_H */
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 975c9dd60c05..2bc64059ff57 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6503,10 +6503,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
 	struct md_rdev *rdev;
 	int rv;
 	int fit = (num_sectors == 0);
-
-	/* cluster raid doesn't support update size */
-	if (mddev_is_clustered(mddev))
-		return -EINVAL;
+	sector_t old_dev_sectors = mddev->dev_sectors;
 
 	if (mddev->pers->resize == NULL)
 		return -EINVAL;
@@ -6535,7 +6532,9 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
 	}
 	rv = mddev->pers->resize(mddev, num_sectors);
 	if (!rv) {
-		if (mddev->queue) {
+		if (mddev_is_clustered(mddev))
+			md_cluster_ops->update_size(mddev, old_dev_sectors);
+		else if (mddev->queue) {
 			set_capacity(mddev->gendisk, mddev->array_sectors);
 			revalidate_disk(mddev->gendisk);
 		}
@@ -8753,6 +8752,18 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
 	int role, ret;
 	char b[BDEVNAME_SIZE];
 
+	/*
+	 * If size is changed in another node then we need to
+	 * do resize as well.
+	 */
+	if (mddev->dev_sectors != le64_to_cpu(sb->size)) {
+		ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
+		if (ret)
+			pr_info("md-cluster: resize failed\n");
+		else
+			bitmap_update_sb(mddev->bitmap);
+	}
+
 	/* Check for change of roles in the active devices */
 	rdev_for_each(rdev2, mddev) {
 		if (test_bit(Faulty, &rdev2->flags))
-- 
2.6.2


^ permalink raw reply related

* Re: [PATCH V3 1/2] RAID1: a new I/O barrier implementation to remove resync window
From: 王金浦 @ 2017-02-24 10:19 UTC (permalink / raw)
  To: Coly Li
  Cc: Shaohua Li, NeilBrown, linux-raid, Shaohua Li, Johannes Thumshirn,
	Guoqing Jiang
In-Reply-To: <b6134a31-6cf6-3ea4-a3e5-0073c38c3a51@suse.de>

Hi Coly, Hi Shaohua,


>
> Hi Shaohua,
>
> I try to catch up with you, let me try to follow your mind by the
> split-in-while-loop condition (this is my new I/O barrier patch). I
> assume the original BIO is a write bio, and original bio is split and
> handled in a while loop in raid1_make_request().

It's still possible for read bio. We hit a deadlock in the past.
See https://patchwork.kernel.org/patch/9498949/

Also:
http://www.spinics.net/lists/raid/msg52792.html

Regards,
Jack

>
>> 1. in level1, set current->bio_list, split bio to bio1 and bio2
>
> This is done in level1 raid1_make_request().
>
>> 2. remap bio1 to level2 disk, and queue bio1-level2 in current->bio_list
>
> Remap is done by raid1_write_request(), and bio1_level may be added into
> one of the two list:
> - plug->pending:
>   bios in plug->pending may be handled in raid1_unplug(), or in
> flush_pending_writes() of raid1d().
>   If current task is about to be scheduled, raid1_unplug() will merge
> plug->pending's bios to conf->pending_bio_list. And
> conf->pending_bio_list will be handled in raid1d.
>   If raid1_unplug() is triggered by blk_finish_plug(), it is also
> handled in raid1d.
>
> - conf->pending_bio_list:
>   bios in this list is handled in raid1d by calling flush_pending_writes().
>
>
> So generic_make_request() to handle bio1_level2 can only be called in
> context of raid1d thread, bio1_level2 is added into raid1d's
> bio_list_on_stack, not caller of level1 generic_make_request().
>
>> 3. queue bio2 in current->bio_list
>
> Same, bio2_level2 is in level1 raid1d's bio_list_on_stack.
> Then back to level1 generic_make_request()
>
>> 4. generic_make_request then pops bio1-level2
>
> At this moment, bio1_level2 and bio2_level2 are in either plug->pending
> or conf->pending_bio_list, bio_list_pop() returns NULL, and level1
> generic_make_request() returns to its caller.
>
> If before bio_list_pop() called, kernel thread raid1d wakes up and
> iterates conf->pending_bio_list in flush_pending_writes() or iterate
> plug->pending in raid1_unplug() by blk_finish_plug(), that happens in
> level1 raid1d's stack, bios will not show up in level1
> generic_make_reques(), bio_list_pop() still returns NULL.
>
>> 5. remap bio1-level2 to level3 disk, and queue bio1-level2-level3 in current->bio_list
>
> bio2_level2 is at head of conf->pending_bio_list or plug->pending, so
> bio2_level2 is handled firstly.
>
> level1 raid1 calls level2 generic_make_request(), then level2
> raid1_make_request() is called, then level raid1_write_request().
> bio2_level2 is remapped to bio2_level3, added into plug->pending (level1
> raid1d's context) or conf->pending_bio_list (level2 raid1's conf), it
> will be handled by level2 raid1d, when level2 raid1d wakes up.
> Then returns back to level1 raid1, bio1_level2
> is handled by level2 generic_make_request() and added into level2
> plug->pending or conf->pending_bio_list. In this case neither
> bio2_level2 nor bio1_level is added into any bio_list_on_stack.
>
> Then level1 raid1d handles all bios in level1 conf->pending_bio_list,
> and sleeps.
>
> Then level2 raid1d wakes up, and handle bio2_level3 and bio1_level3, by
> iterate level2 plug->pending or conf->pending_bio_list, and calling
> level3 generic_make_request().
>
> In level3 generic_make_request(), because it is level2 raid1d context,
> not level1 raid1d context, bio2_level3 is send into
> q->make_request_fn(), and finally added into level3 plug->pending or
> conf->pending_bio_list, then back to level3 generic_make_reqeust().
>
> Now level2 raid1d's current->bio_list is empty, so level3
> generic_make_request() returns to level2 raid1d and continue to iterate
> and send bio1_level3 into level3 generic_make_request().
>
> After all bios are added into level3 plug->pending or
> conf->pending_bio_list, level2 raid1d sleeps.
>
> Now level3 raid1d wakes up, continue to iterate level3 plug->pending or
> conf->pending_bio_list by calling generic_make_request() to underlying
> devices (which might be a read device).
>
> On the above whole patch, each lower level generic_make_request() is
> called in context of the lower level raid1d. No recursive call happens
> in normal code path.
>
> In raid1 code, recursive call of generic_make_request() only happens for
> READ bio, but if array is not frozen, no barrier is required, it doesn't
> hurt.
>
>
>> 6. generic_make_request then pops bio2, but bio1 hasn't finished yet, deadlock
>
> As my understand to the code, it won't happen neither.
>
>>
>> The problem is because we add new bio to current->bio_list tail.
>
> New bios are added into other context's current->bio_list, which are
> different lists. If what I understand is correct, a dead lock won't
> happen in this way.
>
> If my understanding is correct, suddenly I come to realize why raid1
> bios are handled indirectly in another kernel thread.
>
> (Just for your information, when I write to this location, another run
> of testing finished, no deadlock. This time I reduce I/O barrier bucket
> unit size to 512KB, and set blocksize to 33MB in fio job file. It is
> really slow (130MB/s), but no deadlock observed)
>
>
> The stacked raid1 devices are really really confused, if I am wrong, any
> hint is warmly welcome.
>
>>
>>> =============== P.S ==============
>>> When I run the stacked raid1 testing, I feel I see something behavior
>>> suspiciously, it is resync.
>>>
>>> The second time when I rebuild all the raid1 devices by "mdadm -C
>>> /dev/mdXX -l 1 -n 2 /dev/xxx /dev/xxx", I see the top level raid1 device
>>> /dev/md40 already accomplished 50%+ resync. I don't think it could be
>>> that fast...
>>
>> no idea, is this reproducible?
>
> It can be stably reproduced. I need to check whether bitmap is cleaned
> when create a stacked raid1. This is a little off topic in this thread,
> once I have some idea, I will send out another topic. Hope it is just so
> fast.
>
> Coly
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [BUG] non-metadata arrays cannot use more than 27 component devices
From: ian_bruce @ 2017-02-24 12:08 UTC (permalink / raw)
  To: linux-raid

When assembling non-metadata arrays ("mdadm --build"), the in-kernel
superblock apparently defaults to the MD-RAID v0.90 type. This imposes a
maximum of 27 component block devices, presumably as well as limits on
device size.

mdadm does not allow you to override this default, by specifying the
v1.2 superblock. It is not clear whether mdadm tells the kernel to use
the v0.90 superblock, or the kernel assumes this by itself. One or other
of them should be fixed; there does not appear to be any reason why the
v1.2 superblock should not be the default in this case.

details are here:

https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=855871

This problem is easy to reproduce, even with simple hardware. You can
use /dev/loop devices as the array components, as explained in the link.

-- Ian Bruce

^ permalink raw reply

* Re: [BUG] non-metadata arrays cannot use more than 27 component devices
From: Phil Turmel @ 2017-02-24 15:20 UTC (permalink / raw)
  To: ian_bruce, linux-raid
In-Reply-To: <20170224040816.41f2f372.ian_bruce@mail.ru>

On 02/24/2017 07:08 AM, ian_bruce@mail.ru wrote:
> When assembling non-metadata arrays ("mdadm --build"), the in-kernel 
> superblock apparently defaults to the MD-RAID v0.90 type. This
> imposes a maximum of 27 component block devices, presumably as well
> as limits on device size.
> 
> mdadm does not allow you to override this default, by specifying the 
> v1.2 superblock. It is not clear whether mdadm tells the kernel to
> use the v0.90 superblock, or the kernel assumes this by itself. One
> or other of them should be fixed; there does not appear to be any
> reason why the v1.2 superblock should not be the default in this
> case.
> 
> details are here:
> 
> https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=855871
> 
> This problem is easy to reproduce, even with simple hardware. You
> can use /dev/loop devices as the array components, as explained in
> the link.

Considering the existence of --build is strictly to support arrays that
predate MD raid, it seems a bit of a stretch to claim this as a bug
instead of a feature request.

When you implement this feature, you might want to consider extending
modern MD raid's support for external metadata to use RAM and then
you could do whatever you please with the array events.

See "man mdmon" for a summary of external metadata events.

Phil

^ permalink raw reply

* [PATCH v1 00/14] md: cleanup on direct access to bvec table
From: Ming Lei @ 2017-02-24 15:42 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig
  Cc: Ming Lei

In MD's resync I/O path, there are lots of direct access to bio's
bvec table. This patchset kills almost all, and the conversion
is quite straightforward. One root cause of direct access to bvec
table is that resync I/O uses the bio's bvec to manage pages.
In V1, as suggested by Shaohua, a new approach is used to manage
these pages for resync I/O, turns out code becomes more clean
and readable.

Once direct access to bvec table in MD is cleaned up, we may make
multipage bvec moving on.

V1:
	- allocate page array to manage resync pages

Thanks,
Ming

Ming Lei (14):
  block: introduce bio_segments_all()
  block: introduce bio_remove_last_page()
  md: raid1/raid10: use bio_remove_last_page()
  md: move two macros into md.h
  md: prepare for managing resync I/O pages in clean way
  md: raid1: simplify r1buf_pool_free()
  md: raid1: don't use bio's vec table to manage resync pages
  md: raid1: retrieve page from pre-allocated resync page array
  md: raid1: use bio helper in process_checks()
  md: raid1: use bio_segments_all()
  md: raid10: refactor code of read reshape's .bi_end_io
  md: raid10: don't use bio's vec table to manage resync pages
  md: raid10: retrieve page from preallocated resync page array
  md: raid10: avoid direct access to bvec table in
    handle_reshape_read_error

 block/bio.c         |  23 +++++++
 drivers/md/md.h     |  66 +++++++++++++++++++
 drivers/md/raid1.c  | 125 +++++++++++++++++++++--------------
 drivers/md/raid10.c | 187 +++++++++++++++++++++++++++++++---------------------
 include/linux/bio.h |   8 +++
 5 files changed, 285 insertions(+), 124 deletions(-)

-- 
2.7.4

^ permalink raw reply

* [PATCH v1 01/14] block: introduce bio_segments_all()
From: Ming Lei @ 2017-02-24 15:42 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig
  Cc: Ming Lei
In-Reply-To: <1487950971-1131-1-git-send-email-tom.leiming@gmail.com>

So that we can replace the direct access to .bi_vcnt.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 include/linux/bio.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 8e521194f6fc..3364b3ed90e7 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -293,6 +293,13 @@ static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv)
 		bv->bv_len = iter.bi_bvec_done;
 }
 
+static inline unsigned bio_segments_all(struct bio *bio)
+{
+	WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
+
+	return bio->bi_vcnt;
+}
+
 enum bip_flags {
 	BIP_BLOCK_INTEGRITY	= 1 << 0, /* block layer owns integrity data */
 	BIP_MAPPED_INTEGRITY	= 1 << 1, /* ref tag has been remapped */
-- 
2.7.4

^ permalink raw reply related

* [PATCH v1 02/14] block: introduce bio_remove_last_page()
From: Ming Lei @ 2017-02-24 15:42 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig
  Cc: Ming Lei
In-Reply-To: <1487950971-1131-1-git-send-email-tom.leiming@gmail.com>

MD need this helper to remove the last added page, so introduce
it.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 block/bio.c         | 23 +++++++++++++++++++++++
 include/linux/bio.h |  1 +
 2 files changed, 24 insertions(+)

diff --git a/block/bio.c b/block/bio.c
index 5eec5e08417f..0ce7ffcd7939 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -837,6 +837,29 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page
 EXPORT_SYMBOL(bio_add_pc_page);
 
 /**
+ *	bio_remove_last_page	-	remove the last added page
+ *	@bio: destination bio
+ *
+ *	Attempt to remove the last added page from the bio_vec maplist.
+ */
+void bio_remove_last_page(struct bio *bio)
+{
+	/*
+	 * cloned bio must not modify vec list
+	 */
+	if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
+		return;
+
+	if (bio->bi_vcnt > 0) {
+		struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
+
+		bio->bi_iter.bi_size -= bv->bv_len;
+		bio->bi_vcnt--;
+	}
+}
+EXPORT_SYMBOL(bio_remove_last_page);
+
+/**
  *	bio_add_page	-	attempt to add page to bio
  *	@bio: destination bio
  *	@page: page to add
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 3364b3ed90e7..32aeb493d1fe 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -443,6 +443,7 @@ extern void bio_init(struct bio *bio, struct bio_vec *table,
 extern void bio_reset(struct bio *);
 void bio_chain(struct bio *, struct bio *);
 
+extern void bio_remove_last_page(struct bio *bio);
 extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int);
 extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
 			   unsigned int, unsigned int);
-- 
2.7.4

^ permalink raw reply related

* [PATCH v1 03/14] md: raid1/raid10: use bio_remove_last_page()
From: Ming Lei @ 2017-02-24 15:42 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig
  Cc: Ming Lei
In-Reply-To: <1487950971-1131-1-git-send-email-tom.leiming@gmail.com>

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid1.c  | 3 +--
 drivers/md/raid10.c | 6 ++----
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 0628c07dd16d..2a0bf5b430c9 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2912,8 +2912,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 						if (bio->bi_end_io==NULL)
 							continue;
 						/* remove last page from this bio */
-						bio->bi_vcnt--;
-						bio->bi_iter.bi_size -= len;
+						bio_remove_last_page(bio);
 						bio_clear_flag(bio, BIO_SEG_VALID);
 					}
 					goto bio_full;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 33f6a535dc1f..125d74dba27e 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3446,8 +3446,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 			     bio2 && bio2 != bio;
 			     bio2 = bio2->bi_next) {
 				/* remove last page from this bio */
-				bio2->bi_vcnt--;
-				bio2->bi_iter.bi_size -= len;
+				bio_remove_last_page(bio2);
 				bio_clear_flag(bio2, BIO_SEG_VALID);
 			}
 			goto bio_full;
@@ -4537,8 +4536,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 			     bio2 && bio2 != bio;
 			     bio2 = bio2->bi_next) {
 				/* Remove last page from this bio */
-				bio2->bi_vcnt--;
-				bio2->bi_iter.bi_size -= len;
+				bio_remove_last_page(bio2);
 				bio_clear_flag(bio2, BIO_SEG_VALID);
 			}
 			goto bio_full;
-- 
2.7.4

^ permalink raw reply related

* [PATCH v1 04/14] md: move two macros into md.h
From: Ming Lei @ 2017-02-24 15:42 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig
  Cc: Ming Lei
In-Reply-To: <1487950971-1131-1-git-send-email-tom.leiming@gmail.com>

Both raid1 and raid10 share common resync
block size and page count, so move them into md.h.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/md.h     | 5 +++++
 drivers/md/raid1.c  | 2 --
 drivers/md/raid10.c | 3 ---
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/md/md.h b/drivers/md/md.h
index b8859cbf84b6..1d63239a1be4 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -715,4 +715,9 @@ static inline void mddev_check_writesame(struct mddev *mddev, struct bio *bio)
 	    !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)
 		mddev->queue->limits.max_write_same_sectors = 0;
 }
+
+/* Maximum size of each resync request */
+#define RESYNC_BLOCK_SIZE (64*1024)
+#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
+
 #endif /* _MD_MD_H */
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 2a0bf5b430c9..2013e5870761 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -91,10 +91,8 @@ static void r1bio_pool_free(void *r1_bio, void *data)
 	kfree(r1_bio);
 }
 
-#define RESYNC_BLOCK_SIZE (64*1024)
 #define RESYNC_DEPTH 32
 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
-#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
 #define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
 #define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
 #define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 125d74dba27e..227dd6ad7716 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -125,9 +125,6 @@ static void r10bio_pool_free(void *r10_bio, void *data)
 	kfree(r10_bio);
 }
 
-/* Maximum size of each resync request */
-#define RESYNC_BLOCK_SIZE (64*1024)
-#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
 /* amount of memory to reserve for resync requests */
 #define RESYNC_WINDOW (1024*1024)
 /* maximum number of concurrent requests, memory permitting */
-- 
2.7.4

^ permalink raw reply related

* [PATCH v1 05/14] md: prepare for managing resync I/O pages in clean way
From: Ming Lei @ 2017-02-24 15:42 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig
  Cc: Ming Lei
In-Reply-To: <1487950971-1131-1-git-send-email-tom.leiming@gmail.com>

Now resync I/O use bio's bec table to manage pages,
this way is very hacky, and may not work any more
once multipage bvec is introduced.

So introduce helpers and new data structure for
managing resync I/O pages more cleanly.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/md.h | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/drivers/md/md.h b/drivers/md/md.h
index 1d63239a1be4..df18ae05838d 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -720,4 +720,65 @@ static inline void mddev_check_writesame(struct mddev *mddev, struct bio *bio)
 #define RESYNC_BLOCK_SIZE (64*1024)
 #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
 
+/* for managing resync I/O pages */
+struct resync_pages {
+	unsigned	idx;	/* for get/put page from the pool */
+	void		*raid_bio;
+	struct page	*pages[RESYNC_PAGES];
+};
+
+static inline int resync_alloc_pages(struct resync_pages *rp,
+				     gfp_t gfp_flags)
+{
+	int i;
+
+	for (i = 0; i < RESYNC_PAGES; i++) {
+		rp->pages[i] = alloc_page(gfp_flags);
+		if (!rp->pages[i])
+			goto out_free;
+	}
+
+	return 0;
+
+ out_free:
+	while (--i >= 0)
+		__free_page(rp->pages[i]);
+	return -ENOMEM;
+}
+
+static inline void resync_free_pages(struct resync_pages *rp)
+{
+	int i;
+
+	for (i = 0; i < RESYNC_PAGES; i++)
+		__free_page(rp->pages[i]);
+}
+
+static inline void resync_get_all_pages(struct resync_pages *rp)
+{
+	int i;
+
+	for (i = 0; i < RESYNC_PAGES; i++)
+		get_page(rp->pages[i]);
+}
+
+static inline void resync_store_page(struct resync_pages *rp, struct page *page)
+{
+	if (WARN_ON(!rp->idx))
+		return;
+	rp->pages[--rp->idx] = page;
+}
+
+static inline struct page *resync_fetch_page(struct resync_pages *rp)
+{
+	if (WARN_ON_ONCE(rp->idx >= RESYNC_PAGES))
+		return NULL;
+	return rp->pages[rp->idx++];
+}
+
+static inline bool resync_page_available(struct resync_pages *rp)
+{
+	return rp->idx < RESYNC_PAGES;
+}
+
 #endif /* _MD_MD_H */
-- 
2.7.4

^ permalink raw reply related

* [PATCH v1 06/14] md: raid1: simplify r1buf_pool_free()
From: Ming Lei @ 2017-02-24 15:42 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig
  Cc: Ming Lei
In-Reply-To: <1487950971-1131-1-git-send-email-tom.leiming@gmail.com>

This patch gets each page's reference of each bio for resync,
then r1buf_pool_free() gets simplified a lot.

The same policy has been taken in raid10's buf pool allocation/free
too.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid1.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 2013e5870761..2de0bd69d8da 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -139,9 +139,12 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 	/* If not user-requests, copy the page pointers to all bios */
 	if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
 		for (i=0; i<RESYNC_PAGES ; i++)
-			for (j=1; j<pi->raid_disks; j++)
-				r1_bio->bios[j]->bi_io_vec[i].bv_page =
+			for (j=1; j<pi->raid_disks; j++) {
+				struct page *page =
 					r1_bio->bios[0]->bi_io_vec[i].bv_page;
+				get_page(page);
+				r1_bio->bios[j]->bi_io_vec[i].bv_page = page;
+			}
 	}
 
 	r1_bio->master_bio = NULL;
@@ -166,12 +169,8 @@ static void r1buf_pool_free(void *__r1_bio, void *data)
 	struct r1bio *r1bio = __r1_bio;
 
 	for (i = 0; i < RESYNC_PAGES; i++)
-		for (j = pi->raid_disks; j-- ;) {
-			if (j == 0 ||
-			    r1bio->bios[j]->bi_io_vec[i].bv_page !=
-			    r1bio->bios[0]->bi_io_vec[i].bv_page)
-				safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
-		}
+		for (j = pi->raid_disks; j-- ;)
+			safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
 	for (i=0 ; i < pi->raid_disks; i++)
 		bio_put(r1bio->bios[i]);
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH v1 07/14] md: raid1: don't use bio's vec table to manage resync pages
From: Ming Lei @ 2017-02-24 15:42 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig
  Cc: Ming Lei
In-Reply-To: <1487950971-1131-1-git-send-email-tom.leiming@gmail.com>

Now we allocate one page array for managing resync pages, instead
of using bio's vec table to do that, and the old way is very hacky
and won't work any more if multipage bvec is enabled.

The introduced cost is that we need to allocate (128 + 16) * raid_disks
bytes per r1_bio, and it is fine because the inflight r1_bio for
resync shouldn't be much, as pointed by Shaohua.

Also the bio_reset() in raid1_sync_request() is removed because
all bios are freshly new now and not necessary to reset any more.

This patch can be thought as a cleanup too

Suggested-by: Shaohua Li <shli@kernel.org>
Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid1.c | 86 +++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 56 insertions(+), 30 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 2de0bd69d8da..4a208220ff0f 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -77,6 +77,16 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
 #define raid1_log(md, fmt, args...)				\
 	do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0)
 
+static inline struct resync_pages *get_resync_pages(struct bio *bio)
+{
+	return bio->bi_private;
+}
+
+static inline struct r1bio *get_resync_r1bio(struct bio *bio)
+{
+	return get_resync_pages(bio)->raid_bio;
+}
+
 static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
 {
 	struct pool_info *pi = data;
@@ -104,12 +114,18 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 	struct r1bio *r1_bio;
 	struct bio *bio;
 	int need_pages;
-	int i, j;
+	int j;
+	struct resync_pages *rps;
 
 	r1_bio = r1bio_pool_alloc(gfp_flags, pi);
 	if (!r1_bio)
 		return NULL;
 
+	rps = kmalloc(sizeof(struct resync_pages) * pi->raid_disks,
+		      gfp_flags);
+	if (!rps)
+		goto out_free_r1bio;
+
 	/*
 	 * Allocate bios : 1 for reading, n-1 for writing
 	 */
@@ -129,22 +145,22 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 		need_pages = pi->raid_disks;
 	else
 		need_pages = 1;
-	for (j = 0; j < need_pages; j++) {
+	for (j = 0; j < pi->raid_disks; j++) {
+		struct resync_pages *rp = &rps[j];
+
 		bio = r1_bio->bios[j];
-		bio->bi_vcnt = RESYNC_PAGES;
-
-		if (bio_alloc_pages(bio, gfp_flags))
-			goto out_free_pages;
-	}
-	/* If not user-requests, copy the page pointers to all bios */
-	if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
-		for (i=0; i<RESYNC_PAGES ; i++)
-			for (j=1; j<pi->raid_disks; j++) {
-				struct page *page =
-					r1_bio->bios[0]->bi_io_vec[i].bv_page;
-				get_page(page);
-				r1_bio->bios[j]->bi_io_vec[i].bv_page = page;
-			}
+
+		if (j < need_pages) {
+			if (resync_alloc_pages(rp, gfp_flags))
+				goto out_free_pages;
+		} else {
+			memcpy(rp, &rps[0], sizeof(*rp));
+			resync_get_all_pages(rp);
+		}
+
+		rp->idx = 0;
+		rp->raid_bio = r1_bio;
+		bio->bi_private = rp;
 	}
 
 	r1_bio->master_bio = NULL;
@@ -153,11 +169,14 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 
 out_free_pages:
 	while (--j >= 0)
-		bio_free_pages(r1_bio->bios[j]);
+		resync_free_pages(&rps[j]);
 
 out_free_bio:
 	while (++j < pi->raid_disks)
 		bio_put(r1_bio->bios[j]);
+	kfree(rps);
+
+out_free_r1bio:
 	r1bio_pool_free(r1_bio, data);
 	return NULL;
 }
@@ -165,14 +184,18 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 static void r1buf_pool_free(void *__r1_bio, void *data)
 {
 	struct pool_info *pi = data;
-	int i,j;
+	int i;
 	struct r1bio *r1bio = __r1_bio;
+	struct resync_pages *rp = NULL;
 
-	for (i = 0; i < RESYNC_PAGES; i++)
-		for (j = pi->raid_disks; j-- ;)
-			safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
-	for (i=0 ; i < pi->raid_disks; i++)
+	for (i = pi->raid_disks; i--; ) {
+		rp = get_resync_pages(r1bio->bios[i]);
+		resync_free_pages(rp);
 		bio_put(r1bio->bios[i]);
+	}
+
+	/* resync pages array stored in the 1st bio's .bi_private */
+	kfree(rp);
 
 	r1bio_pool_free(r1bio, data);
 }
@@ -1849,7 +1872,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 
 static void end_sync_read(struct bio *bio)
 {
-	struct r1bio *r1_bio = bio->bi_private;
+	struct r1bio *r1_bio = get_resync_r1bio(bio);
 
 	update_head_pos(r1_bio->read_disk, r1_bio);
 
@@ -1868,7 +1891,7 @@ static void end_sync_read(struct bio *bio)
 static void end_sync_write(struct bio *bio)
 {
 	int uptodate = !bio->bi_error;
-	struct r1bio *r1_bio = bio->bi_private;
+	struct r1bio *r1_bio = get_resync_r1bio(bio);
 	struct mddev *mddev = r1_bio->mddev;
 	struct r1conf *conf = mddev->private;
 	sector_t first_bad;
@@ -2085,6 +2108,7 @@ static void process_checks(struct r1bio *r1_bio)
 		int size;
 		int error;
 		struct bio *b = r1_bio->bios[i];
+		struct resync_pages *rp = get_resync_pages(b);
 		if (b->bi_end_io != end_sync_read)
 			continue;
 		/* fixup the bio for reuse, but preserve errno */
@@ -2097,7 +2121,8 @@ static void process_checks(struct r1bio *r1_bio)
 			conf->mirrors[i].rdev->data_offset;
 		b->bi_bdev = conf->mirrors[i].rdev->bdev;
 		b->bi_end_io = end_sync_read;
-		b->bi_private = r1_bio;
+		rp->raid_bio = r1_bio;
+		b->bi_private = rp;
 
 		size = b->bi_iter.bi_size;
 		for (j = 0; j < vcnt ; j++) {
@@ -2755,7 +2780,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 	for (i = 0; i < conf->raid_disks * 2; i++) {
 		struct md_rdev *rdev;
 		bio = r1_bio->bios[i];
-		bio_reset(bio);
 
 		rdev = rcu_dereference(conf->mirrors[i].rdev);
 		if (rdev == NULL ||
@@ -2811,7 +2835,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 			atomic_inc(&rdev->nr_pending);
 			bio->bi_iter.bi_sector = sector_nr + rdev->data_offset;
 			bio->bi_bdev = rdev->bdev;
-			bio->bi_private = r1_bio;
 			if (test_bit(FailFast, &rdev->flags))
 				bio->bi_opf |= MD_FAILFAST;
 		}
@@ -2897,12 +2920,15 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 		}
 
 		for (i = 0 ; i < conf->raid_disks * 2; i++) {
+			struct resync_pages *rp;
+
 			bio = r1_bio->bios[i];
+			rp = get_resync_pages(bio);
 			if (bio->bi_end_io) {
-				page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
+				page = resync_fetch_page(rp);
 				if (bio_add_page(bio, page, len, 0) == 0) {
 					/* stop here */
-					bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
+					resync_store_page(rp, page);
 					while (i > 0) {
 						i--;
 						bio = r1_bio->bios[i];
@@ -2919,7 +2945,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 		nr_sectors += len>>9;
 		sector_nr += len>>9;
 		sync_blocks -= (len>>9);
-	} while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES);
+	} while (resync_page_available(r1_bio->bios[disk]->bi_private));
  bio_full:
 	r1_bio->sectors = nr_sectors;
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH v1 08/14] md: raid1: retrieve page from pre-allocated resync page array
From: Ming Lei @ 2017-02-24 15:42 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig
  Cc: Ming Lei
In-Reply-To: <1487950971-1131-1-git-send-email-tom.leiming@gmail.com>

Now one page array is allocated for each resync bio, and we can
retrieve page from this table directly.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid1.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 4a208220ff0f..9371caace379 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1970,6 +1970,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
 	struct mddev *mddev = r1_bio->mddev;
 	struct r1conf *conf = mddev->private;
 	struct bio *bio = r1_bio->bios[r1_bio->read_disk];
+	struct page **pages = get_resync_pages(bio)->pages;
 	sector_t sect = r1_bio->sector;
 	int sectors = r1_bio->sectors;
 	int idx = 0;
@@ -2003,7 +2004,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
 				 */
 				rdev = conf->mirrors[d].rdev;
 				if (sync_page_io(rdev, sect, s<<9,
-						 bio->bi_io_vec[idx].bv_page,
+						 pages[idx],
 						 REQ_OP_READ, 0, false)) {
 					success = 1;
 					break;
@@ -2058,7 +2059,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
 				continue;
 			rdev = conf->mirrors[d].rdev;
 			if (r1_sync_page_io(rdev, sect, s,
-					    bio->bi_io_vec[idx].bv_page,
+					    pages[idx],
 					    WRITE) == 0) {
 				r1_bio->bios[d]->bi_end_io = NULL;
 				rdev_dec_pending(rdev, mddev);
@@ -2073,7 +2074,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
 				continue;
 			rdev = conf->mirrors[d].rdev;
 			if (r1_sync_page_io(rdev, sect, s,
-					    bio->bi_io_vec[idx].bv_page,
+					    pages[idx],
 					    READ) != 0)
 				atomic_add(s, &rdev->corrected_errors);
 		}
@@ -2149,6 +2150,8 @@ static void process_checks(struct r1bio *r1_bio)
 		struct bio *pbio = r1_bio->bios[primary];
 		struct bio *sbio = r1_bio->bios[i];
 		int error = sbio->bi_error;
+		struct page **ppages = get_resync_pages(pbio)->pages;
+		struct page **spages = get_resync_pages(sbio)->pages;
 
 		if (sbio->bi_end_io != end_sync_read)
 			continue;
@@ -2157,11 +2160,8 @@ static void process_checks(struct r1bio *r1_bio)
 
 		if (!error) {
 			for (j = vcnt; j-- ; ) {
-				struct page *p, *s;
-				p = pbio->bi_io_vec[j].bv_page;
-				s = sbio->bi_io_vec[j].bv_page;
-				if (memcmp(page_address(p),
-					   page_address(s),
+				if (memcmp(page_address(ppages[j]),
+					   page_address(spages[j]),
 					   sbio->bi_io_vec[j].bv_len))
 					break;
 			}
-- 
2.7.4

^ permalink raw reply related

* [PATCH v1 09/14] md: raid1: use bio helper in process_checks()
From: Ming Lei @ 2017-02-24 15:42 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig
  Cc: Ming Lei
In-Reply-To: <1487950971-1131-1-git-send-email-tom.leiming@gmail.com>

Avoid to direct access to bvec table.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid1.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 9371caace379..7363bf56f3b4 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2108,6 +2108,7 @@ static void process_checks(struct r1bio *r1_bio)
 		int j;
 		int size;
 		int error;
+		struct bio_vec *bi;
 		struct bio *b = r1_bio->bios[i];
 		struct resync_pages *rp = get_resync_pages(b);
 		if (b->bi_end_io != end_sync_read)
@@ -2126,9 +2127,7 @@ static void process_checks(struct r1bio *r1_bio)
 		b->bi_private = rp;
 
 		size = b->bi_iter.bi_size;
-		for (j = 0; j < vcnt ; j++) {
-			struct bio_vec *bi;
-			bi = &b->bi_io_vec[j];
+		bio_for_each_segment_all(bi, b, j) {
 			bi->bv_offset = 0;
 			if (size > PAGE_SIZE)
 				bi->bv_len = PAGE_SIZE;
@@ -2152,17 +2151,22 @@ static void process_checks(struct r1bio *r1_bio)
 		int error = sbio->bi_error;
 		struct page **ppages = get_resync_pages(pbio)->pages;
 		struct page **spages = get_resync_pages(sbio)->pages;
+		struct bio_vec *bi;
+		int page_len[RESYNC_PAGES];
 
 		if (sbio->bi_end_io != end_sync_read)
 			continue;
 		/* Now we can 'fixup' the error value */
 		sbio->bi_error = 0;
 
+		bio_for_each_segment_all(bi, sbio, j)
+			page_len[j] = bi->bv_len;
+
 		if (!error) {
 			for (j = vcnt; j-- ; ) {
 				if (memcmp(page_address(ppages[j]),
 					   page_address(spages[j]),
-					   sbio->bi_io_vec[j].bv_len))
+					   page_len[j]))
 					break;
 			}
 		} else
-- 
2.7.4

^ permalink raw reply related

* [PATCH v1 10/14] md: raid1: use bio_segments_all()
From: Ming Lei @ 2017-02-24 15:42 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-kernel, linux-raid, linux-block,
	Christoph Hellwig
  Cc: Ming Lei
In-Reply-To: <1487950971-1131-1-git-send-email-tom.leiming@gmail.com>

Use this helper, instead of direct access to .bi_vcnt.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid1.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 7363bf56f3b4..391da975e092 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1091,7 +1091,8 @@ static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio)
 {
 	int i;
 	struct bio_vec *bvec;
-	struct bio_vec *bvecs = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec),
+	unsigned vcnt = bio_segments_all(bio);
+	struct bio_vec *bvecs = kzalloc(vcnt * sizeof(struct bio_vec),
 					GFP_NOIO);
 	if (unlikely(!bvecs))
 		return;
@@ -1107,12 +1108,12 @@ static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio)
 		kunmap(bvec->bv_page);
 	}
 	r1_bio->behind_bvecs = bvecs;
-	r1_bio->behind_page_count = bio->bi_vcnt;
+	r1_bio->behind_page_count = vcnt;
 	set_bit(R1BIO_BehindIO, &r1_bio->state);
 	return;
 
 do_sync_io:
-	for (i = 0; i < bio->bi_vcnt; i++)
+	for (i = 0; i < vcnt; i++)
 		if (bvecs[i].bv_page)
 			put_page(bvecs[i].bv_page);
 	kfree(bvecs);
-- 
2.7.4

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox