Linux RAID subsystem development

Linux RAID subsystem development
 help / color / mirror / Atom feed

* [PATCH V2 08/10] md-cluster: convert the completion to wait queue
From: Guoqing Jiang @ 2016-08-12  5:42 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, Guoqing Jiang
In-Reply-To: <1470980563-26062-1-git-send-email-gqjiang@suse.com>

Previously, we used completion to sync between require dlm lock
and sync_ast, however we will have to expose completion.wait
and completion.done in dlm_lock_sync_interruptible (introduced
later), it is not a common usage for completion, so convert
related things to wait queue.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
---
 drivers/md/md-cluster.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 8972413..03a51e7 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -25,7 +25,8 @@ struct dlm_lock_resource {
 	struct dlm_lksb lksb;
 	char *name; /* lock name. */
 	uint32_t flags; /* flags to pass to dlm_lock() */
-	struct completion completion; /* completion for synchronized locking */
+	wait_queue_head_t sync_locking; /* wait queue for synchronized locking */
+	bool sync_locking_done;
 	void (*bast)(void *arg, int mode); /* blocking AST function pointer*/
 	struct mddev *mddev; /* pointing back to mddev. */
 	int mode;
@@ -118,7 +119,8 @@ static void sync_ast(void *arg)
 	struct dlm_lock_resource *res;
 
 	res = arg;
-	complete(&res->completion);
+	res->sync_locking_done = true;
+	wake_up(&res->sync_locking);
 }
 
 static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
@@ -130,7 +132,8 @@ static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
 			0, sync_ast, res, res->bast);
 	if (ret)
 		return ret;
-	wait_for_completion(&res->completion);
+	wait_event(res->sync_locking, res->sync_locking_done);
+	res->sync_locking_done = false;
 	if (res->lksb.sb_status == 0)
 		res->mode = mode;
 	return res->lksb.sb_status;
@@ -151,7 +154,8 @@ static struct dlm_lock_resource *lockres_init(struct mddev *mddev,
 	res = kzalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL);
 	if (!res)
 		return NULL;
-	init_completion(&res->completion);
+	init_waitqueue_head(&res->sync_locking);
+	res->sync_locking_done = false;
 	res->ls = cinfo->lockspace;
 	res->mddev = mddev;
 	res->mode = DLM_LOCK_IV;
@@ -205,7 +209,7 @@ static void lockres_free(struct dlm_lock_resource *res)
 	if (unlikely(ret != 0))
 		pr_err("failed to unlock %s return %d\n", res->name, ret);
 	else
-		wait_for_completion(&res->completion);
+		wait_event(res->sync_locking, res->sync_locking_done);
 
 	kfree(res->name);
 	kfree(res->lksb.sb_lvbptr);
-- 
2.6.2


^ permalink raw reply related

* [PATCH V2 07/10] md: remove obsolete ret in md_start_sync
From: Guoqing Jiang @ 2016-08-12  5:42 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, Guoqing Jiang
In-Reply-To: <1470980563-26062-1-git-send-email-gqjiang@suse.com>

The ret is not needed anymore since we have already
move resync_start into md_do_sync in commit 41a9a0d.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
---
 drivers/md/md.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index f95e5ed..cceaaf4 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8283,16 +8283,14 @@ no_add:
 static void md_start_sync(struct work_struct *ws)
 {
 	struct mddev *mddev = container_of(ws, struct mddev, del_work);
-	int ret = 0;
 
 	mddev->sync_thread = md_register_thread(md_do_sync,
 						mddev,
 						"resync");
 	if (!mddev->sync_thread) {
-		if (!(mddev_is_clustered(mddev) && ret == -EAGAIN))
-			printk(KERN_ERR "%s: could not start resync"
-			       " thread...\n",
-			       mdname(mddev));
+		printk(KERN_ERR "%s: could not start resync"
+		       " thread...\n",
+		       mdname(mddev));
 		/* leave the spares where they are, it shouldn't hurt */
 		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
 		clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
-- 
2.6.2


^ permalink raw reply related

* [PATCH V2 06/10] md-cluster: protect md_find_rdev_nr_rcu with rcu lock
From: Guoqing Jiang @ 2016-08-12  5:42 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, Guoqing Jiang
In-Reply-To: <1470980563-26062-1-git-send-email-gqjiang@suse.com>

We need to use rcu_read_lock/unlock to avoid potential
race.

Reported-by: Shaohua Li <shli@fb.com>
Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
---
 drivers/md/md-cluster.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index ea2699e..8972413 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -486,9 +486,10 @@ static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg
 
 static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
 {
-	struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev,
-						   le32_to_cpu(msg->raid_slot));
+	struct md_rdev *rdev;
 
+	rcu_read_lock();
+	rdev = md_find_rdev_nr_rcu(mddev, le32_to_cpu(msg->raid_slot));
 	if (rdev) {
 		set_bit(ClusterRemove, &rdev->flags);
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -497,18 +498,21 @@ static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
 	else
 		pr_warn("%s: %d Could not find disk(%d) to REMOVE\n",
 			__func__, __LINE__, le32_to_cpu(msg->raid_slot));
+	rcu_read_unlock();
 }
 
 static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg)
 {
-	struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev,
-						   le32_to_cpu(msg->raid_slot));
+	struct md_rdev *rdev;
 
+	rcu_read_lock();
+	rdev = md_find_rdev_nr_rcu(mddev, le32_to_cpu(msg->raid_slot));
 	if (rdev && test_bit(Faulty, &rdev->flags))
 		clear_bit(Faulty, &rdev->flags);
 	else
 		pr_warn("%s: %d Could not find disk(%d) which is faulty",
 			__func__, __LINE__, le32_to_cpu(msg->raid_slot));
+	rcu_read_unlock();
 }
 
 static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
-- 
2.6.2


^ permalink raw reply related

* [PATCH V2 05/10] md-cluster: clean related infos of cluster
From: Guoqing Jiang @ 2016-08-12  5:42 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, Guoqing Jiang
In-Reply-To: <1470980563-26062-1-git-send-email-gqjiang@suse.com>

cluster_info and bitmap_info.nodes also need to be
cleared when array is stopped.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
---
 drivers/md/md.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index da5741c..f95e5ed 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5457,12 +5457,14 @@ static void md_clean(struct mddev *mddev)
 	mddev->degraded = 0;
 	mddev->safemode = 0;
 	mddev->private = NULL;
+	mddev->cluster_info = NULL;
 	mddev->bitmap_info.offset = 0;
 	mddev->bitmap_info.default_offset = 0;
 	mddev->bitmap_info.default_space = 0;
 	mddev->bitmap_info.chunksize = 0;
 	mddev->bitmap_info.daemon_sleep = 0;
 	mddev->bitmap_info.max_write_behind = 0;
+	mddev->bitmap_info.nodes = 0;
 }
 
 static void __md_stop_writes(struct mddev *mddev)
-- 
2.6.2


^ permalink raw reply related

* [PATCH V2 04/10] md: changes for MD_STILL_CLOSED flag
From: Guoqing Jiang @ 2016-08-12  5:42 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, Guoqing Jiang
In-Reply-To: <1470980563-26062-1-git-send-email-gqjiang@suse.com>

When stop clustered raid while it is pending on resync,
MD_STILL_CLOSED flag could be cleared since udev rule
is triggered to open the mddev. So obviously array can't
be stopped soon and returns EBUSY.

	mdadm -Ss          md-raid-arrays.rules
  set MD_STILL_CLOSED          md_open()
	... ... ...          clear MD_STILL_CLOSED
	do_md_stop

We make below changes to resolve this issue:

1. rename MD_STILL_CLOSED to MD_CLOSING since it is set
   when stop array and it means we are stopping array.
2. let md_open returns early if CLOSING is set, so no
   other threads will open array if one thread is trying
   to close it.
3. no need to clear CLOSING bit in md_open because 1 has
   ensure the bit is cleared, then we also don't need to
   test CLOSING bit in do_md_stop.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
---
 drivers/md/md.c | 14 ++++++++------
 drivers/md/md.h |  5 ++---
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index d6ae9bc..da5741c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5576,8 +5576,7 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
 	mutex_lock(&mddev->open_mutex);
 	if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
 	    mddev->sync_thread ||
-	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
-	    (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
+	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
 		printk("md: %s still in use.\n",mdname(mddev));
 		if (did_freeze) {
 			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
@@ -5639,8 +5638,7 @@ static int do_md_stop(struct mddev *mddev, int mode,
 	if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
 	    mddev->sysfs_active ||
 	    mddev->sync_thread ||
-	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
-	    (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
+	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
 		printk("md: %s still in use.\n",mdname(mddev));
 		mutex_unlock(&mddev->open_mutex);
 		if (did_freeze) {
@@ -6825,7 +6823,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 			err = -EBUSY;
 			goto out;
 		}
-		set_bit(MD_STILL_CLOSED, &mddev->flags);
+		set_bit(MD_CLOSING, &mddev->flags);
 		mutex_unlock(&mddev->open_mutex);
 		sync_blockdev(bdev);
 	}
@@ -7074,9 +7072,13 @@ static int md_open(struct block_device *bdev, fmode_t mode)
 	if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
 		goto out;
 
+	if (test_bit(MD_CLOSING, &mddev->flags)) {
+		mutex_unlock(&mddev->open_mutex);
+		return -ENODEV;
+	}
+
 	err = 0;
 	atomic_inc(&mddev->openers);
-	clear_bit(MD_STILL_CLOSED, &mddev->flags);
 	mutex_unlock(&mddev->open_mutex);
 
 	check_disk_change(bdev);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 20c6675..2b20417 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -201,9 +201,8 @@ struct mddev {
 #define MD_CHANGE_PENDING 2	/* switch from 'clean' to 'active' in progress */
 #define MD_UPDATE_SB_FLAGS (1 | 2 | 4)	/* If these are set, md_update_sb needed */
 #define MD_ARRAY_FIRST_USE 3    /* First use of array, needs initialization */
-#define MD_STILL_CLOSED	4	/* If set, then array has not been opened since
-				 * md_ioctl checked on it.
-				 */
+#define MD_CLOSING	4	/* If set, we are closing the array, do not open
+				 * it then */
 #define MD_JOURNAL_CLEAN 5	/* A raid with journal is already clean */
 #define MD_HAS_JOURNAL	6	/* The raid array has journal feature set */
 #define MD_RELOAD_SB	7	/* Reload the superblock because another node
-- 
2.6.2


^ permalink raw reply related

* [PATCH V2 03/10] md-cluster: remove some unnecessary dlm_unlock_sync
From: Guoqing Jiang @ 2016-08-12  5:42 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, Guoqing Jiang
In-Reply-To: <1470980563-26062-1-git-send-email-gqjiang@suse.com>

Since DLM_LKF_FORCEUNLOCK is used in lockres_free,
we don't need to call dlm_unlock_sync before free
lock resource.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
---
 drivers/md/md-cluster.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 0a0605f..ea2699e 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -281,7 +281,7 @@ static void recover_bitmaps(struct md_thread *thread)
 		ret = bitmap_copy_from_slot(mddev, slot, &lo, &hi, true);
 		if (ret) {
 			pr_err("md-cluster: Could not copy data from bitmap %d\n", slot);
-			goto dlm_unlock;
+			goto clear_bit;
 		}
 		if (hi > 0) {
 			if (lo < mddev->recovery_cp)
@@ -293,8 +293,6 @@ static void recover_bitmaps(struct md_thread *thread)
 			    md_wakeup_thread(mddev->thread);
 			}
 		}
-dlm_unlock:
-		dlm_unlock_sync(bm_lockres);
 clear_bit:
 		lockres_free(bm_lockres);
 		clear_bit(slot, &cinfo->recovery_map);
@@ -763,7 +761,6 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots)
 			md_check_recovery(mddev);
 		}
 
-		dlm_unlock_sync(bm_lockres);
 		lockres_free(bm_lockres);
 	}
 out:
@@ -1173,7 +1170,6 @@ static void unlock_all_bitmaps(struct mddev *mddev)
 	if (cinfo->other_bitmap_lockres) {
 		for (i = 0; i < mddev->bitmap_info.nodes - 1; i++) {
 			if (cinfo->other_bitmap_lockres[i]) {
-				dlm_unlock_sync(cinfo->other_bitmap_lockres[i]);
 				lockres_free(cinfo->other_bitmap_lockres[i]);
 			}
 		}
-- 
2.6.2


^ permalink raw reply related

* [PATCH V2 02/10] md-cluster: use FORCEUNLOCK in lockres_free
From: Guoqing Jiang @ 2016-08-12  5:42 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, Guoqing Jiang
In-Reply-To: <1470980563-26062-1-git-send-email-gqjiang@suse.com>

For dlm_unlock, we need to pass flag to dlm_unlock as the
third parameter instead of set res->flags.

Also, DLM_LKF_FORCEUNLOCK is more suitable for dlm_unlock
since it works even the lock is on waiting or convert queue.

Acked-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
---
 drivers/md/md-cluster.c | 23 ++++++++---------------
 1 file changed, 8 insertions(+), 15 deletions(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 41573f1..0a0605f 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -194,25 +194,18 @@ out_err:
 
 static void lockres_free(struct dlm_lock_resource *res)
 {
-	int ret;
+	int ret = 0;
 
 	if (!res)
 		return;
 
-	/* cancel a lock request or a conversion request that is blocked */
-	res->flags |= DLM_LKF_CANCEL;
-retry:
-	ret = dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res);
-	if (unlikely(ret != 0)) {
-		pr_info("%s: failed to unlock %s return %d\n", __func__, res->name, ret);
-
-		/* if a lock conversion is cancelled, then the lock is put
-		 * back to grant queue, need to ensure it is unlocked */
-		if (ret == -DLM_ECANCEL)
-			goto retry;
-	}
-	res->flags &= ~DLM_LKF_CANCEL;
-	wait_for_completion(&res->completion);
+	/* use FORCEUNLOCK flag, so we can unlock even the lock is on the
+	 * waiting or convert queue */
+	ret = dlm_unlock(res->ls, res->lksb.sb_lkid, DLM_LKF_FORCEUNLOCK, &res->lksb, res);
+	if (unlikely(ret != 0))
+		pr_err("failed to unlock %s return %d\n", res->name, ret);
+	else
+		wait_for_completion(&res->completion);
 
 	kfree(res->name);
 	kfree(res->lksb.sb_lvbptr);
-- 
2.6.2


^ permalink raw reply related

* [PATCH V2 01/10] md-cluster: call md_kick_rdev_from_array once ack failed
From: Guoqing Jiang @ 2016-08-12  5:42 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, Guoqing Jiang
In-Reply-To: <1470980563-26062-1-git-send-email-gqjiang@suse.com>

The new_disk_ack could return failure if WAITING_FOR_NEWDISK
is not set, so we need to kick the dev from array in case
failure happened.

And we missed to check err before call new_disk_ack othwise
we could kick a rdev which isn't in array, thanks for the
reminder from Shaohua.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
---
 drivers/md/md.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 2c3ab6f..d6ae9bc 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6101,9 +6101,13 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
 			export_rdev(rdev);
 
 		if (mddev_is_clustered(mddev)) {
-			if (info->state & (1 << MD_DISK_CANDIDATE))
-				md_cluster_ops->new_disk_ack(mddev, (err == 0));
-			else {
+			if (info->state & (1 << MD_DISK_CANDIDATE)) {
+				if (!err) {
+					err = md_cluster_ops->new_disk_ack(mddev, (err == 0));
+					if (err)
+						md_kick_rdev_from_array(rdev);
+				}
+			} else {
 				if (err)
 					md_cluster_ops->add_new_disk_cancel(mddev);
 				else
-- 
2.6.2


^ permalink raw reply related

* [PATCH V2 00/10] The latest changes for md-cluster
From: Guoqing Jiang @ 2016-08-12  5:42 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, Guoqing Jiang

This version addresses comments from Shaohua, and the 7th
patch is added as well.

Thanks,
Guoqing

Guoqing Jiang (10):
  md-cluster: call md_kick_rdev_from_array once ack failed
  md-cluster: use FORCEUNLOCK in lockres_free
  md-cluster: remove some unnecessary dlm_unlock_sync
  md: changes for MD_STILL_CLOSED flag
  md-cluster: clean related infos of cluster
  md-cluster: protect md_find_rdev_nr_rcu with rcu lock
  md: remove obsolete ret in md_start_sync
  md-cluster: convert the completion to wait queue
  md-cluster: introduce dlm_lock_sync_interruptible to fix tasks hang
  md-cluster: make resync lock also could be interruptted

 drivers/md/md-cluster.c | 93 +++++++++++++++++++++++++++++++++----------------
 drivers/md/md.c         | 34 ++++++++++--------
 drivers/md/md.h         |  5 ++-
 3 files changed, 85 insertions(+), 47 deletions(-)

-- 
2.6.2


^ permalink raw reply

* Re: Can't mount Old RHEL 6 Raid with new install of CentOS 7, now can't mount with original RHEL 6
From: Chris Murphy @ 2016-08-12  2:38 UTC (permalink / raw)
  To: John Dawson; +Cc: Linux-RAID
In-Reply-To: <42cee2e2d85a5b0e39efc34117e597a2@celticblues.com>

On Thu, Aug 11, 2016 at 6:19 AM, John Dawson <linux@celticblues.com> wrote:
> I have a machine which had a drive with RHEL 6.X installed with a raid
> device setup on separate disk(s).  Installed a new hard disk in machine and
> installed CentOS 7.  CentOS 7 wouldn't mount the raid.  Put the old drive
> back in and now RHEL 6.X won't mount the raid.  Is the raid permanently
> hosed?  Can I get the data on it back? How? Thx.

RHEL comes with a support contract so you should contact Red Hat about
that part.

Also, not anywhere near enough information has been provided, almost
like you think what you're experiencing is a widely known problem with
a known solution. But it isn't. So you should provide mdadm -E
information for each member block device, whether or not the array
assembles manually, if not what error do you get in user and kernel
space, and what command you're using to mount the array that you say
fails, and what the error message is.

Also include mdadm version on both systems because few people will
have any idea what mdadm version is on the particular installation of
RHEL and CentOS you're using, as these things aren't standardized at
all across distros.

-- 
Chris Murphy

^ permalink raw reply

* Re: Unable to convert raid1 to raid5
From: NeilBrown @ 2016-08-12  1:32 UTC (permalink / raw)
  To: Wols Lists, Glenn Enright; +Cc: mdraid
In-Reply-To: <57A68815.40601@youngman.org.uk>

[-- Attachment #1: Type: text/plain, Size: 2496 bytes --]

On Sun, Aug 07 2016, Wols Lists wrote:

> On 07/08/16 01:32, Glenn Enright wrote:
>> On 7/08/2016 12:01 pm, "Wols Lists" <antlists@youngman.org.uk
>> <mailto:antlists@youngman.org.uk>> wrote:
>>>
>>> On 05/08/16 21:16, Wols Lists wrote:
>>> > In my testing of xosview, I've been mucking about with a vm and raid.
>>> > xosview is looking quite promising (I've got a few comments about it,
>>> > but never mind).
>>> >
>>> > BUT. In mucking about with raid 1, I increased my raid devices to three.
>>> > I now just can NOT convert the array to raid 5! I've been mucking around
>>> > with all sorts of things trying to get it to work, but finally two error
>>> > messages make things clear.
>>> >
>>> Following up to myself - suddenly thought "I know what's wrong". So I
>>> stopped the array, and of course couldn't access it, it was no longer
>>> there. So I assembled but didn't run it, and it worked fine.
>>>
>>> Simples, once you realise what's wrong - you can ADD devices to a
>>> running array, but you can't REMOVE them.
>>>
>>> Cheers,
>>> Wol
>>>
>
>> 
>> You can remove em if you mark em as failed first. Eg
>> 
>> Mdadm /dev/mdx --fail /dev/sdc1 --remove /dev/sdc1
>> 
>> Best, Glenn
>> 
> Except - if you read my original post - I was trying to TOTALLY remove
> the device!
>
> mdadm --grow -raid-devices=2
>
> THAT was the problem - I had a 3-device mirror, and you can't convert
> that to raid5! Even if you've --fail --remove'd the third device!
>
> In other words, "--grow --raid-devices=more" will work on a running
> device, "--grow --raid-devices=less" will only work on an array that is
> built but not running.

I don't believe this is correct, and I could reproduce your results in
quick tests.

If the array is not running, then you cannot reshape it at all.

You can reduce the number of devices in a RAID1 at any time as long as
the number of active devices is not greater than the number of devices
requested.

/dev/md0 has 3 working devices:

# mdadm -G /dev/md0 -n2
mdadm: failed to set raid disks
# mdadm /dev/md0 -f /dev/loop0
mdadm: set /dev/loop0 faulty in /dev/md0
# mdadm -G /dev/md0 -n2
raid_disks for /dev/md0 set to 2


>
> I now have the problem that my "--grow --level=5" has fallen foul of the
> "reshape stuck at zero" problem, and I can now neither run the array,
> nor get the reshape working ... :-(

We really need to fix this ... if only I knew how to reproduce it.

NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 818 bytes --]

^ permalink raw reply

* [PATCH] md: do not count journal as spare in GET_ARRAY_INFO
From: Song Liu @ 2016-08-12  0:14 UTC (permalink / raw)
  To: linux-raid; +Cc: yizhan, Song Liu, Shaohua Li

GET_ARRAY_INFO counts journal as spare (spare_disks), which is not
accurate. This patch fixes this.

Reported-by: Yi Zhang <yizhan@redhat.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/md.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 2c3ab6f..df595a8 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5851,6 +5851,9 @@ static int get_array_info(struct mddev *mddev, void __user *arg)
 			working++;
 			if (test_bit(In_sync, &rdev->flags))
 				insync++;
+			else if (test_bit(Journal, &rdev->flags))
+				/* TODO: add journal count to md_u.h */
+				;
 			else
 				spare++;
 		}
-- 
2.8.0.rc2


^ permalink raw reply related

* [PATCH] mdadm: put journal device in right place of --detail
From: Song Liu @ 2016-08-12  0:14 UTC (permalink / raw)
  To: linux-raid; +Cc: yizhan, Song Liu, Shaohua Li

When there is failed HDDs, journal device showed in wrong place
of --detail:

    Number   Major   Minor   RaidDevice State
       4       8       24        -      journal   /dev/sdb8
       1       8       18        1      active sync   /dev/sdb2
       2       8       19        2      active sync   /dev/sdb3
       3       8       21        3      active sync   /dev/sdb5

       0       8       17        -      faulty   /dev/sdb1

This patch fixed the output as:

    Number   Major   Minor   RaidDevice State
       -       0        0        0      removed
       1       8       18        1      active sync   /dev/sdb2
       2       8       19        2      active sync   /dev/sdb3
       3       8       21        3      active sync   /dev/sdb5

       0       8       17        -      faulty   /dev/sdb1
       4       8       24        -      journal   /dev/sdb8

Reported-by: Yi Zhang <yizhan@redhat.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 Detail.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Detail.c b/Detail.c
index 7a984c8..925e479 100644
--- a/Detail.c
+++ b/Detail.c
@@ -323,7 +323,8 @@ int Detail(char *dev, struct context *c)
 		if (disk.major == 0 && disk.minor == 0)
 			continue;
 		if (disk.raid_disk >= 0 && disk.raid_disk < array.raid_disks
-		    && disks[disk.raid_disk*2].state == (1<<MD_DISK_REMOVED))
+		    && disks[disk.raid_disk*2].state == (1<<MD_DISK_REMOVED)
+		    && ((disk.state & (1<<MD_DISK_JOURNAL)) == 0))
 			disks[disk.raid_disk*2] = disk;
 		else if (disk.raid_disk >= 0 && disk.raid_disk < array.raid_disks
 			 && disks[disk.raid_disk*2+1].state == (1<<MD_DISK_REMOVED)
-- 
2.8.0.rc2


^ permalink raw reply related

* [PATCH] mdadm: add man page for --add-journal
From: Song Liu @ 2016-08-12  0:10 UTC (permalink / raw)
  To: linux-raid; +Cc: yizhan, Song Liu, Shaohua Li

Add the following to man page:

--add-journal
      Recreate journal for RAID-4/5/6 array that losts journal
      devices. In current implementation, this command cannot
      add journal to an array that had failed journal.  To
      avoid  interrupting  on-going  write  opertions,
      --add-journal only works for array in Read-Only state.

Reported-by: Yi Zhang <yizhan@redhat.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 mdadm.8.in | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/mdadm.8.in b/mdadm.8.in
index 1a04bd1..a335c53 100644
--- a/mdadm.8.in
+++ b/mdadm.8.in
@@ -1444,6 +1444,14 @@ number. The receiving node must acknowledge this message
 with \-\-cluster\-confirm. Valid arguments are <slot>:<devicename> in case
 the device is found or <slot>:missing in case the device is not found.
 
+.TP
+.BR \-\-add-journal
+Recreate journal for RAID-4/5/6 array that losts journal devices. In current
+implementation, this command cannot add journal to an array that had failed
+journal. To avoid interrupting on-going write opertions,
+.B \-\-add-journal
+only works for array in Read-Only state.
+
 .P
 Each of these options requires that the first device listed is the array
 to be acted upon, and the remainder are component devices to be added,
-- 
2.8.0.rc2


^ permalink raw reply related

* Re: [PATCH 1/3] MD: hold mddev lock for .quiesce in md_do_sync
From: NeilBrown @ 2016-08-12  0:04 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, Shaohua Li
In-Reply-To: <20160806041428.GB9281@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 3748 bytes --]

On Sat, Aug 06 2016, Shaohua Li wrote:

> On Thu, Aug 04, 2016 at 01:16:49PM +1000, Neil Brown wrote:
>> On Wed, Aug 03 2016, NeilBrown wrote:
>> 
>> > [ Unknown signature status ]
>> > On Sun, Jul 31 2016, shli@kernel.org wrote:
>> >
>> >> From: Shaohua Li <shli@fb.com>
>> >>
>> >> .quiesce is called with mddev lock hold at most places. There are few
>> >> exceptions. Calling .quesce without the lock hold could create races. For
>> >> example, the .quesce of raid1 can't be recursively. The purpose of the patches
>> >> is to fix a race in raid5-cache. The raid5-cache .quesce will write md
>> >> superblock and should be called with mddev lock hold.
>> >>
>> >> Cc: NeilBrown <neilb@suse.com>
>> >> Signed-off-by: Shaohua Li <shli@fb.com>
>> >
>> > Acked-by: NeilBrown <neilb@suse.com>
>> >
>> > This should be safe but I'm not sure I really like it.
>> > The raid1 quiesce could be changed so that it can be called recursively.
>> > The raid5-cache situation would be harder to get right and maybe this is
>> > the best solution... It's just that 'quiesce' should be a fairly
>> > light-weight operation, just waiting for pending requests to flush.  It
>> > shouldn't really *need* a lock.
>> 
>> Actually, the more I think about this, the less I like it.
>> 
>> I would much rather make .quiesce lighter weight so that no locking was
>> needed.
>> 
>> For r5l_quiesce, that probable means removed the "r5l_do_reclaim()".
>> Stopping and restarting the reclaim thread seems reasonable, but calling
>> r5l_do_reclaim() should not be needed.  It should be done periodically
>> by the thread, and at 'stop' time, but otherwise isn't needed.
>> You would need to hold some mutex while calling md_register_thread, but
>> that could be probably be log->io_mutex, or maybe even some other new
>> mutex
>
> We will have the same deadlock issue with just stopping/restarting the reclaim
> thread. As stopping the thread will wait for the thread, which probably is
> doing r5l_do_reclaim and writting the superblock. Since we are writting the
> superblock, we must hold the reconfig_mutex.

When you say "writing the superblock" you presumably mean "blocked in
r5l_write_super_and_discard_space(), waiting for  MD_CHANGE_PENDING to
be cleared" ??
With a bit of care you could wait for MD_CHANGE_PENDING to clear, or for
->quiesce to be set, and then exit gracefully.

>
> Letting raid5_quiesce call r5l_do_reclaim gives us a clean log. Just
> stop/restart the reclaim thread can't guarantee this, as it's possible some
> space aren't reclaimed yet. A clean log will simplify a lot of things, for
> example we change the layout of the array. The log doesn't need to remember
> which part is for the old layout and which part is the new layout.

I really think you are putting too much functionality into quiesce.
When we change the shape of the array, we do much more than just
quiesce it.  We also call check_reshape and start_reshape etc.
They are called with reconfig_mutex held and it would be perfectly
appropriate to finish of the r5l_do_reclaim() work in there.

>
> I think we can add a new parameter for .quiesce to indicate if reconfig_mutex
> is hold. raid5_quiesce can check the parameter and hold reconfig_mutex if
> necessary.

Adding a new parameter because it happens to be convenient in one case
is not necessarily a good idea.  It is often a sign that the interface
isn't well designed, or isn't well understood, or is being used poorly.

I really really don't think ->quiesce() should care about whether
reconfig_mutex is held.  All it should do is drain all IO and stop new
IO so that other threads can do unusually things in race-free ways.

NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 818 bytes --]

^ permalink raw reply

* Re: [PATCH v2] block: make sure big bio is splitted into at most 256 bvecs
From: Christoph Hellwig @ 2016-08-11 14:02 UTC (permalink / raw)
  To: Eric Wheeler
  Cc: Ming Lei, Jens Axboe, linux-kernel, linux-block, linux-bcache,
	linux-raid, kent.overstreet, Christoph Hellwig, Sebastian Roesner,
	4.3+, Shaohua Li
In-Reply-To: <alpine.LRH.2.11.1608102132120.10662@mail.ewheeler.net>

Please just fix bcache to not submit bios larger than BIO_MAX_PAGES for
now, until we can support such callers in general and enable common
used code to do so.

^ permalink raw reply

* Can't mount Old RHEL 6 Raid with new install of CentOS 7, now can't mount with original RHEL 6
From: John Dawson @ 2016-08-11 12:19 UTC (permalink / raw)
  To: linux-raid

I have a machine which had a drive with RHEL 6.X installed with a raid 
device setup on separate disk(s).  Installed a new hard disk in machine 
and installed CentOS 7.  CentOS 7 wouldn't mount the raid.  Put the old 
drive back in and now RHEL 6.X won't mount the raid.  Is the raid 
permanently hosed?  Can I get the data on it back? How? Thx.

JD

^ permalink raw reply

* Re: [PATCH] NULL pointer in raid1_make_request passed to bio_trim when adding md as bcache caching dev
From: Eric Wheeler @ 2016-08-11  6:52 UTC (permalink / raw)
  To: Shaohua Li
  Cc: Ming Lei, linux-bcache, linux-raid, Ming Lin, Vlad-Cosmin Miu,
	rjones, Kent Overstreet, jmoyer, axboe, Sebastian Roesner,
	linux-block
In-Reply-To: <alpine.LRH.2.11.1608102035180.10662@mail.ewheeler.net>

On Wed, 10 Aug 2016, Eric Wheeler wrote:

> On Sun, 10 Apr 2016 20:01:30 +0200, Sebastian Roesner wrote:
> > Am 01.04.2016 um 20:14 schrieb Eric Wheeler:
> > > On Mon, 28 Mar 2016, Shaohua Li wrote:
> > > > On Sat, Mar 26, 2016 at 05:46:16PM +0100, Sebastian Roesner wrote:
> > > > > Hello Ming, Eric,
> > > > > 
> > > > > Am 26.03.2016 um 16:40 schrieb Ming Lei:
> > > > > I was able to reproduce it on a non-productive system, but only after
> > > > > copying the bcache superblocks/partition starts from the original system,
> > > > > with new created ones it worked fine.
> > > >
> > > > 320 bvecs exceeds what bio-clone_set can handle. Could you please try below patch?
> > > >
> > > Hey Sebastian,
> > > 
> > > Have you had a chance to test the patch below from Shaohua Li?
> > 
> > I just tried the patch and it did not crash anymore.
>  
> It doesn't look like this made it into v4.8-rc1 and it is known to fix at 
> least Sebastian's issue.
> 
> commit 92761dad7ff6e1bf25de247e0064dd398e797599
> Author: Shaohua Li <shli@fb.com>
> Date:   Mon Mar 28 10:54:35 2016 -0700
> 
>     block: don't make BLK_DEF_MAX_SECTORS too big
>     
>     bio_alloc_bioset() allocates bvecs from bvec_slabs which can only allocate
>     maximum 256 bvec (eg, 1M for 4k pages). We can't bump BLK_DEF_MAX_SECTORS to
>     exceed this value otherwise bio_alloc_bioset will fail.
>     
>     This fixes commit 30e2bc08b2bb7c069. We probably should make the bvec_slabs
>     hold bigger bvecs if bigger bio size is required.
>     
>     Signed-off-by: Shaohua Li <shli@fb.com>
> 
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 7e5d7e0..da64325 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -1153,7 +1153,11 @@ extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
>  enum blk_default_limits {
>  	BLK_MAX_SEGMENTS	= 128,
>  	BLK_SAFE_MAX_SECTORS	= 255,
> -	BLK_DEF_MAX_SECTORS	= 2560,
> +	/*
> +	 * if you change this, please also change bvec_alloc and BIO_MAX_PAGES.
> +	 * Otherwise bio_alloc_bioset will break.
> +	 */
> +	BLK_DEF_MAX_SECTORS	= BIO_MAX_SECTORS,


It seems that I spoke too soon, as BIO_MAX_SECTORS was removed for v4.8.

What should BLK_DEF_MAX_SECTORS be defined as?  

Something like this?
	-	BLK_DEF_MAX_SECTORS     = 2560,
	+	BLK_DEF_MAX_SECTORS = (BIO_MAX_PAGES << (PAGE_SHIFT-9)

Or was this issue patched by Shaohua Li solved in some other way?

It appears that BLK_DEF_MAX_SECTORS is still used:

$ git grep BLK_DEF_MAX_SECTORS
block/blk-settings.c:   max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS);
drivers/block/aoe/aoeblk.c:     blk_queue_max_hw_sectors(q, BLK_DEF_MAX_SECTORS);
drivers/scsi/sd.c:              rw_max = BLK_DEF_MAX_SECTORS;


--
Eric Wheeler


>  	BLK_MAX_SEGMENT_SIZE	= 65536,
>  	BLK_SEG_BOUNDARY_MASK	= 0xFFFFFFFFUL,
>  };
> --
> To unsubscribe from this list: send the line "unsubscribe linux-bcache" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-bcache" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply

* Re: [PATCH v2] block: make sure big bio is splitted into at most 256 bvecs
From: Eric Wheeler @ 2016-08-11  6:33 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, linux-kernel, linux-block, linux-bcache, linux-raid,
	kent.overstreet, Christoph Hellwig, Sebastian Roesner, 4.3+,
	Shaohua Li
In-Reply-To: <1459914212-9330-1-git-send-email-ming.lei@canonical.com>

On Fri, 10 Jun 2016, Christoph Hellwig wrote: 
> On Wed, 6 Apr 2016, Ming Lei wrote:
> >
> > After arbitrary bio size is supported, the incoming bio may
> > be very big. We have to split the bio into small bios so that
> > each holds at most BIO_MAX_PAGES bvecs for safety reason, such
> > as bio_clone().
> > 
> > This patch fixes the following kernel crash:
> > [  172.664813]  [<ffffffffa007f3be>] ? raid1_make_request+0x2e8/0xad7 [raid1]
> > [  172.664846]  [<ffffffff811f07da>] ? blk_queue_split+0x377/0x3d4
> > [  172.664880]  [<ffffffffa005fb5f>] ? md_make_request+0xf6/0x1e9 [md_mod]
> > [  172.664912]  [<ffffffff811eb860>] ? generic_make_request+0xb5/0x155
> > [  172.664947]  [<ffffffffa0445c89>] ? prio_io+0x85/0x95 [bcache]
>
> The fixup to allow bio_clone support a larger size is the same one as to 
> allow everyone else submitting larger bios:  increase BIO_MAX_PAGES and 
> create the required mempools to back that new larger size.  Or just go 
> for multipage biovecs..

Hi Christoph, Ming, everyone:

I'm hoping you can help me get this off of a list of stability fixes 
related to changes around Linux 4.3.  Ming's patch [1] is known to fix an 
issue when a bio with bi_vcnt > BIO_MAX_PAGES is passed to 
generic_make_request and later hits bio_clone.  (Note that bi_vcnt can't 
be trusted since immutable biovecs and needs to be re-counted unless you 
own the bio, which Ming's patch does.)

The diffstat, 22 lines of which are commentary, 
seems relatively minor and would land in stable for v4.3+:
 block/blk-merge.c | 35 ++++++++++++++++++++++++++++++++---

I'm not sure I understood Christoph's suggestion; BIO_MAX_PAGES is a 
static #define and we don't know what the the bi_vcnt from an arbitrary 
driver might be.  Wouldn't increasing BIO_MAX_PAGES just push the problem 
further out into the future when bi_vcnt might again exceed BIO_MAX_PAGES?  

Perhaps you could elaborate if I have misunderstood. Are you suggesting 
that no driver should call generic_make_request when bi_vcnt > BIO_MAX_PAGES?


--
Eric Wheeler

[1] https://patchwork.kernel.org/patch/9169483/
    Pasted below:



After arbitrary bio size is supported, the incoming bio may
be very big. We have to split the bio into small bios so that
each holds at most BIO_MAX_PAGES bvecs for safety reason, such
as bio_clone().

This patch fixes the following kernel crash:

> [  172.660142] BUG: unable to handle kernel NULL pointer dereference at 0000000000000028
> [  172.660229] IP: [<ffffffff811e53b4>] bio_trim+0xf/0x2a
> [  172.660289] PGD 7faf3e067 PUD 7f9279067 PMD 0
> [  172.660399] Oops: 0000 [#1] SMP
> [...]
> [  172.664780] Call Trace:
> [  172.664813]  [<ffffffffa007f3be>] ? raid1_make_request+0x2e8/0xad7 [raid1]
> [  172.664846]  [<ffffffff811f07da>] ? blk_queue_split+0x377/0x3d4
> [  172.664880]  [<ffffffffa005fb5f>] ? md_make_request+0xf6/0x1e9 [md_mod]
> [  172.664912]  [<ffffffff811eb860>] ? generic_make_request+0xb5/0x155
> [  172.664947]  [<ffffffffa0445c89>] ? prio_io+0x85/0x95 [bcache]
> [  172.664981]  [<ffffffffa0448252>] ? register_cache_set+0x355/0x8d0 [bcache]
> [  172.665016]  [<ffffffffa04497d3>] ? register_bcache+0x1006/0x1174 [bcache]

The issue can be reproduced by the following steps:
	- create one raid1 over two virtio-blk
	- build bcache device over the above raid1 and another cache device
	and bucket size is set as 2Mbytes
	- set cache mode as writeback
	- run random write over ext4 on the bcache device

Fixes: 54efd50(block: make generic_make_request handle arbitrarily sized bios)
Reported-by: Sebastian Roesner <sroesner-kernelorg@roesner-online.de>
Reported-by: Eric Wheeler <bcache@lists.ewheeler.net>
Cc: stable@vger.kernel.org (4.3+)
Cc: Shaohua Li <shli@fb.com>
Acked-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Ming Lei <ming.lei@canonical.com>
---
V2:
	- don't mark as REQ_NOMERGE in case the bio is splitted
	for reaching the limit of bvecs count
V1:
        - Kent pointed out that using max io size can't cover
        the case of non-full bvecs/pages
 block/blk-merge.c | 35 ++++++++++++++++++++++++++++++++---
 1 file changed, 32 insertions(+), 3 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index c265348..839529b 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -85,7 +85,8 @@  static inline unsigned get_max_io_size(struct request_queue *q,
 static struct bio *blk_bio_segment_split(struct request_queue *q,
 					 struct bio *bio,
 					 struct bio_set *bs,
-					 unsigned *segs)
+					 unsigned *segs,
+					 bool *no_merge)
 {
 	struct bio_vec bv, bvprv, *bvprvp = NULL;
 	struct bvec_iter iter;
@@ -94,9 +95,34 @@  static struct bio *blk_bio_segment_split(struct request_queue *q,
 	bool do_split = true;
 	struct bio *new = NULL;
 	const unsigned max_sectors = get_max_io_size(q, bio);
+	unsigned bvecs = 0;
+
+	*no_merge = true;
 
 	bio_for_each_segment(bv, bio, iter) {
 		/*
+		 * With arbitrary bio size, the incoming bio may be very
+		 * big. We have to split the bio into small bios so that
+		 * each holds at most BIO_MAX_PAGES bvecs because
+		 * bio_clone() can fail to allocate big bvecs.
+		 *
+		 * It should have been better to apply the limit per
+		 * request queue in which bio_clone() is involved,
+		 * instead of globally. The biggest blocker is
+		 * bio_clone() in bio bounce.
+		 *
+		 * If bio is splitted by this reason, we should allow
+		 * to continue bios merging.
+		 *
+		 * TODO: deal with bio bounce's bio_clone() gracefully
+		 * and convert the global limit into per-queue limit.
+		 */
+		if (bvecs++ >= BIO_MAX_PAGES) {
+			*no_merge = false;
+			goto split;
+		}
+
+		/*
 		 * If the queue doesn't support SG gaps and adding this
 		 * offset would create a gap, disallow it.
 		 */
@@ -171,13 +197,15 @@  void blk_queue_split(struct request_queue *q, struct bio **bio,
 {
 	struct bio *split, *res;
 	unsigned nsegs;
+	bool no_merge_for_split = true;
 
 	if (bio_op(*bio) == REQ_OP_DISCARD)
 		split = blk_bio_discard_split(q, *bio, bs, &nsegs);
 	else if (bio_op(*bio) == REQ_OP_WRITE_SAME)
 		split = blk_bio_write_same_split(q, *bio, bs, &nsegs);
 	else
-		split = blk_bio_segment_split(q, *bio, q->bio_split, &nsegs);
+		split = blk_bio_segment_split(q, *bio, q->bio_split, &nsegs,
+				&no_merge_for_split);
 
 	/* physical segments can be figured out during splitting */
 	res = split ? split : *bio;
@@ -186,7 +214,8 @@  void blk_queue_split(struct request_queue *q, struct bio **bio,
 
 	if (split) {
 		/* there isn't chance to merge the splitted bio */
-		split->bi_rw |= REQ_NOMERGE;
+		if (no_merge_for_split)
+			split->bi_rw |= REQ_NOMERGE;
 
 		bio_chain(split, *bio);
 		trace_block_split(q, split, (*bio)->bi_iter.bi_sector);

^ permalink raw reply related

* Re: [PATCH] NULL pointer in raid1_make_request passed to bio_trim when adding md as bcache caching dev
From: kbuild test robot @ 2016-08-11  4:36 UTC (permalink / raw)
  To: Eric Wheeler
  Cc: kbuild-all, Shaohua Li, Ming Lei, linux-bcache, linux-raid,
	Ming Lin, Vlad-Cosmin Miu, rjones, Kent Overstreet, jmoyer, axboe,
	Sebastian Roesner, linux-block
In-Reply-To: <alpine.LRH.2.11.1608102035180.10662@mail.ewheeler.net>

[-- Attachment #1: Type: text/plain, Size: 1649 bytes --]

Hi Eric,

[auto build test ERROR on linus/master]
[also build test ERROR on v4.8-rc1 next-20160811]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Eric-Wheeler/NULL-pointer-in-raid1_make_request-passed-to-bio_trim-when-adding-md-as-bcache-caching-dev/20160811-115046
config: i386-randconfig-s0-201632 (attached as .config)
compiler: gcc-6 (Debian 6.1.1-9) 6.1.1 20160705
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

Note: the linux-review/Eric-Wheeler/NULL-pointer-in-raid1_make_request-passed-to-bio_trim-when-adding-md-as-bcache-caching-dev/20160811-115046 HEAD 84e331216ae8d3359dd0e7bf2cf4e11e30ca73e5 builds fine.
      It only hurts bisectibility.

All errors (new ones prefixed by >>):

   In file included from fs/btrfs/super.c:19:0:
>> include/linux/blkdev.h:1179:24: error: 'BIO_MAX_SECTORS' undeclared here (not in a function)
     BLK_DEF_MAX_SECTORS = BIO_MAX_SECTORS,
                           ^~~~~~~~~~~~~~~

vim +/BIO_MAX_SECTORS +1179 include/linux/blkdev.h

  1173		BLK_MAX_SEGMENTS	= 128,
  1174		BLK_SAFE_MAX_SECTORS	= 255,
  1175		/*
  1176		 * if you change this, please also change bvec_alloc and BIO_MAX_PAGES.
  1177		 * Otherwise bio_alloc_bioset will break.
  1178		 */
> 1179		BLK_DEF_MAX_SECTORS	= BIO_MAX_SECTORS,
  1180		BLK_MAX_SEGMENT_SIZE	= 65536,
  1181		BLK_SEG_BOUNDARY_MASK	= 0xFFFFFFFFUL,
  1182	};

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/octet-stream, Size: 34099 bytes --]

^ permalink raw reply

* Re: [PATCH v2 1/1] block: fix blk_queue_split() resource exhaustion
From: Eric Wheeler @ 2016-08-11  4:16 UTC (permalink / raw)
  To: Lars Ellenberg
  Cc: Mike Snitzer, Eric Wheeler, NeilBrown, Jens Axboe, linux-block,
	Martin K. Petersen, Peter Zijlstra, Jiri Kosina, Ming Lei,
	Kirill A. Shutemov, linux-kernel, linux-raid, Takashi Iwai,
	linux-bcache, Zheng Liu, Kent Overstreet, Keith Busch, dm-devel,
	Shaohua Li, Ingo Molnar, Alasdair Kergon, Roland Kammerer
In-Reply-To: <20160719090024.GB20868@soda.linbit>

On Tue, 19 Jul 2016, Lars Ellenberg wrote:
> On Tue, Jul 12, 2016 at 10:32:33PM -0400, Mike Snitzer wrote:
> > On Tue, Jul 12 2016 at 10:18pm -0400,
> > Eric Wheeler <bcache@lists.ewheeler.net> wrote:
> > 
> > > On Tue, 12 Jul 2016, NeilBrown wrote:
> > > 
> > > > On Tue, Jul 12 2016, Lars Ellenberg wrote:
> > > > ....
> > > > >
> > > > > Instead, I suggest to distinguish between recursive calls to
> > > > > generic_make_request(), and pushing back the remainder part in
> > > > > blk_queue_split(), by pointing current->bio_lists to a
> > > > > 	struct recursion_to_iteration_bio_lists {
> > > > > 		struct bio_list recursion;
> > > > > 		struct bio_list queue;
> > > > > 	}
> > > > >
> > > > > By providing each q->make_request_fn() with an empty "recursion"
> > > > > bio_list, then merging any recursively submitted bios to the
> > > > > head of the "queue" list, we can make the recursion-to-iteration
> > > > > logic in generic_make_request() process deepest level bios first,
> > > > > and "sibling" bios of the same level in "natural" order.
> > > > >
> > > > > Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
> > > > > Signed-off-by: Roland Kammerer <roland.kammerer@linbit.com>
> > > > 
> > > > Reviewed-by: NeilBrown <neilb@suse.com>
> > > > 
> > > > Thanks again for doing this - I think this is a very significant
> > > > improvement and could allow other simplifications.
> > > 
> > > Thank you Lars for all of this work!  
> > > 
> > > It seems like there have been many 4.3+ blockdev stacking issues and this 
> > > will certainly address some of those (maybe all of them?).  (I think we 
> > > hit this while trying drbd in 4.4 so we dropped back to 4.1 without 
> > > issue.)  It would be great to hear 4.4.y stable pick this up if 
> > > compatible.
> > > 
> > > 
> > > Do you believe that this patch would solve any of the proposals by others 
> > > since 4.3 related to bio splitting/large bios?  I've been collecting a 
> > > list, none of which appear have landed yet as of 4.7-rc7 (but correct me 
> > > if I'm wrong):

[... cut unrelated A,B ... ]

> > > C.  [1/3] block: flush queued bios when process blocks to avoid deadlock
> > > 	by Mikulas Patocka: https://patchwork.kernel.org/patch/9204125/
> > > 		(was https://patchwork.kernel.org/patch/7398411/)
> 
> As it stands now, this is yet an other issue, but related.
> 
> From the link above:
> 
> | ** Here is the dm-snapshot deadlock that was observed:
> | 
> | 1) Process A sends one-page read bio to the dm-snapshot target. The bio
> | spans snapshot chunk boundary and so it is split to two bios by device
> | mapper.
> | 
> | 2) Device mapper creates the first sub-bio and sends it to the snapshot
> | driver.
> | 
> | 3) The function snapshot_map calls track_chunk (that allocates a
> | structure
> | dm_snap_tracked_chunk and adds it to tracked_chunk_hash) and then remaps
> | the bio to the underlying device and exits with DM_MAPIO_REMAPPED.
> | 
> | 4) The remapped bio is submitted with generic_make_request, but it isn't
> | issued - it is added to current->bio_list instead.
> | 
> | 5) Meanwhile, process B (dm's kcopyd) executes pending_complete for the
> | chunk affected be the first remapped bio, it takes down_write(&s->lock)
> | and then loops in __check_for_conflicting_io, waiting for
> | dm_snap_tracked_chunk created in step 3) to be released.
> | 
> | 6) Process A continues, it creates a second sub-bio for the rest of the
> | original bio.
> 
> Aha.
> Here is the relation.
> If "A" had only ever processed "just the chunk it can handle now",
> and "pushed back" the rest of the incoming bio,
> it could rely on all deeper level bios to have been submitted already.
> 
> But this does not look like it easily fits into the current DM model.
> 
> | 7) snapshot_map is called for this new bio, it waits on
> | down_write(&s->lock) that is held by Process B (in step 5).
> 
> There is an other suggestion:
> Use down_trylock (or down_timeout),
> and if it fails, push back the currently to-be-processed bio.
> We can introduce a new bio helper for that.
> Kind of what blk_queue_split() does with my patch applied.
> 
> Or even better, ignore the down_trylock suggestion,
> simply not iterate over all pieces first,
> but process one piece, and return back the the
> iteration in generic_make_request.
> 
> A bit of conflict here may be that DM has all its own
> split and clone and queue magic, and wants to process
> "all of the bio" before returning back to generic_make_request().
> 
> To change that, __split_and_process_bio() and all its helpers
> would need to learn to "push back" (pieces of) the bio they are
> currently working on, and not push back via "DM_ENDIO_REQUEUE",
> but by bio_list_add_head(&current->bio_lists->queue, piece_to_be_done_later).
> 
> Then, after they processed each piece,
> *return* all the way up to the top-level generic_make_request(),
> where the recursion-to-iteration logic would then
> make sure that all deeper level bios, submitted via
> recursive calls to generic_make_request() will be processed, before the
> next, pushed back, piece of the "original incoming" bio.
> 
> And *not* do their own iteration over all pieces first.
> 
> Probably not as easy as dropping the while loop,
> using bio_advance, and pushing that "advanced" bio back to
> current->...queue?
> 
> static void __split_and_process_bio(struct mapped_device *md,
> 				    struct dm_table *map, struct bio *bio)
> ...
> 		ci.bio = bio;
> 		ci.sector_count = bio_sectors(bio);
> 		while (ci.sector_count && !error)
> 			error = __split_and_process_non_flush(&ci);
> ...
> 		error = __split_and_process_non_flush(&ci);
> 		if (ci.sector_count)
> 			bio_advance()
> 			bio_list_add_head(&current->bio_lists->queue, )
> ...
> 
> Something like that, maybe?
> Just a thought.

Hello all,

Has anyone been able to make progress with resolution to this issue?  

Might the suggestions from Lars help solve BZ# 119841?
	https://bugzilla.kernel.org/show_bug.cgi?id=119841

--
Eric Wheeler

^ permalink raw reply

* [PATCH] NULL pointer in raid1_make_request passed to bio_trim when adding md as bcache caching dev
From: Eric Wheeler @ 2016-08-11  3:44 UTC (permalink / raw)
  To: Shaohua Li, Ming Lei, linux-bcache, linux-raid, Ming Lin,
	Vlad-Cosmin Miu, rjones, Kent Overstreet, jmoyer, axboe,
	Sebastian Roesner, linux-block

On Sun, 10 Apr 2016 20:01:30 +0200, Sebastian Roesner wrote:
> Am 01.04.2016 um 20:14 schrieb Eric Wheeler:
> > On Mon, 28 Mar 2016, Shaohua Li wrote:
> > > On Sat, Mar 26, 2016 at 05:46:16PM +0100, Sebastian Roesner wrote:
> > > > Hello Ming, Eric,
> > > > 
> > > > Am 26.03.2016 um 16:40 schrieb Ming Lei:
> > > > I was able to reproduce it on a non-productive system, but only after
> > > > copying the bcache superblocks/partition starts from the original system,
> > > > with new created ones it worked fine.
> > >
> > > 320 bvecs exceeds what bio-clone_set can handle. Could you please try below patch?
> > >
> > Hey Sebastian,
> > 
> > Have you had a chance to test the patch below from Shaohua Li?
> 
> I just tried the patch and it did not crash anymore.
 
Hello all,

It doesn't look like this made it into v4.8-rc1 and it is known to fix at 
least Sebastian's issue.

Can this get into v4.8?

--
Eric Wheeler


commit 92761dad7ff6e1bf25de247e0064dd398e797599
Author: Shaohua Li <shli@fb.com>
Date:   Mon Mar 28 10:54:35 2016 -0700

    block: don't make BLK_DEF_MAX_SECTORS too big
    
    bio_alloc_bioset() allocates bvecs from bvec_slabs which can only allocate
    maximum 256 bvec (eg, 1M for 4k pages). We can't bump BLK_DEF_MAX_SECTORS to
    exceed this value otherwise bio_alloc_bioset will fail.
    
    This fixes commit 30e2bc08b2bb7c069. We probably should make the bvec_slabs
    hold bigger bvecs if bigger bio size is required.
    
    Signed-off-by: Shaohua Li <shli@fb.com>

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 7e5d7e0..da64325 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1153,7 +1153,11 @@ extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
 enum blk_default_limits {
 	BLK_MAX_SEGMENTS	= 128,
 	BLK_SAFE_MAX_SECTORS	= 255,
-	BLK_DEF_MAX_SECTORS	= 2560,
+	/*
+	 * if you change this, please also change bvec_alloc and BIO_MAX_PAGES.
+	 * Otherwise bio_alloc_bioset will break.
+	 */
+	BLK_DEF_MAX_SECTORS	= BIO_MAX_SECTORS,
 	BLK_MAX_SEGMENT_SIZE	= 65536,
 	BLK_SEG_BOUNDARY_MASK	= 0xFFFFFFFFUL,
 };
--
To unsubscribe from this list: send the line "unsubscribe linux-bcache" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* Re: [PATCH] Incremental: don't try to load_container() for a subarray
From: Jes Sorensen @ 2016-08-09 14:58 UTC (permalink / raw)
  To: Artur Paszkiewicz; +Cc: linux-raid
In-Reply-To: <20160809080551.2226-1-artur.paszkiewicz@intel.com>

Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
> mdadm -IRs would exit with a non-zero status because of this.
>
> Reported-by: Xiao Ni <xni@redhat.com>
> Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
> ---
>  Incremental.c | 6 +++++-
>  1 file changed, 5 insertions(+), 1 deletion(-)

Applied!

Thanks,
Jes

> diff --git a/Incremental.c b/Incremental.c
> index ba97b00..cc01d41 100644
> --- a/Incremental.c
> +++ b/Incremental.c
> @@ -1347,8 +1347,12 @@ restart:
>  
>  		if (devnm && strcmp(devnm, me->devnm) != 0)
>  			continue;
> -		if (devnm && me->metadata[0] == '/') {
> +		if (me->metadata[0] == '/') {
>  			char *sl;
> +
> +			if (!devnm)
> +				continue;
> +
>  			/* member array, need to work on container */
>  			strncpy(container, me->metadata+1, 32);
>  			container[31] = 0;

^ permalink raw reply

* [PATCH] Incremental: don't try to load_container() for a subarray
From: Artur Paszkiewicz @ 2016-08-09  8:05 UTC (permalink / raw)
  To: jes.sorensen; +Cc: linux-raid, Artur Paszkiewicz

mdadm -IRs would exit with a non-zero status because of this.

Reported-by: Xiao Ni <xni@redhat.com>
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 Incremental.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/Incremental.c b/Incremental.c
index ba97b00..cc01d41 100644
--- a/Incremental.c
+++ b/Incremental.c
@@ -1347,8 +1347,12 @@ restart:
 
 		if (devnm && strcmp(devnm, me->devnm) != 0)
 			continue;
-		if (devnm && me->metadata[0] == '/') {
+		if (me->metadata[0] == '/') {
 			char *sl;
+
+			if (!devnm)
+				continue;
+
 			/* member array, need to work on container */
 			strncpy(container, me->metadata+1, 32);
 			container[31] = 0;
-- 
2.9.2


^ permalink raw reply related

* Re: The subarray is loaded container by load_container
From: Artur Paszkiewicz @ 2016-08-09  8:05 UTC (permalink / raw)
  To: Jes Sorensen
  Cc: Xiao Ni, linux-raid, tomasz.majchrzak, aleksey.obitotskiy,
	pawel.baldysiak
In-Reply-To: <wrfjshuk1irn.fsf@redhat.com>

On 08/04/2016 08:38 PM, Jes Sorensen wrote:
> Do you have a version of this patch you would like me to apply to the
> official mdadm tree?

Hi Jes,

Yes I do, I'll send the patch in a moment.

Artur

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox