linux-raid.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2] md/raid5: avoid deadlock when raid5 array has unack badblocks during md_stop_writes.
@ 2013-09-02  5:09 ycbzzjlby
  2013-09-12  4:44 ` NeilBrown
  0 siblings, 1 reply; 3+ messages in thread
From: ycbzzjlby @ 2013-09-02  5:09 UTC (permalink / raw)
  To: neilb; +Cc: linux-raid, linux-kernel, Bian Yu

From: Bian Yu <bianyu@kedacom.com>

When raid5 hits a fresh badblock, this badblock will be flagged as an unack
badblock until md_update_sb is called.
But md_stop/reboot/md_set_readonly will prevent raid5d from calling
md_update_sb in md_check_recovery, so the badblock will always be unack;
the raid5d thread therefore enters an infinite loop and can never unregister
sync_thread, which causes a deadlock.

To solve this, before md_stop_writes calls md_unregister_thread, set
MD_STOPPING_WRITES in mddev->flags. In raid5.c, analyse_stripe checks the
MD_STOPPING_WRITES bit in mddev->flags; if it is set, don't block the rdev
to wait for md_update_sb, so the raid5d thread can be stopped.

I can reproduce it in the following way:
While the raid5 array is recovering and hits a fresh badblock, shut down the array at once.

[  480.850203]       Not tainted 3.11.0-next-20130906+ #4
[  480.852344] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  480.854380]  [<ffffffff8153b324>] md_do_sync+0x7e4/0xe60
[  480.854386]  [<ffffffff81747fcb>] ? _raw_spin_unlock_irq+0x2b/0x40
[  480.854395]  [<ffffffff81536fa0>] ? md_unregister_thread+0x90/0x90
[  480.854400]  [<ffffffff810a716d>] ? trace_hardirqs_on+0xd/0x10
[  480.854405]  [<ffffffff815370bf>] md_thread+0x11f/0x170
[  480.854410]  [<ffffffff81536fa0>] ? md_unregister_thread+0x90/0x90
[  480.854415]  [<ffffffff8106d956>] kthread+0xd6/0xe0
[  480.854423]  [<ffffffff8106d880>] ? __init_kthread_worker+0x70/0x70
[  480.854428]  [<ffffffff8174ff2c>] ret_from_fork+0x7c/0xb0
[  480.854432]  [<ffffffff8106d880>] ? __init_kthread_worker+0x70/0x70
[  480.854435] no locks held by md0_resync/3246.
[  480.854437] INFO: task mdadm:3257 blocked for more than 120 seconds.
[  480.854438]       Not tainted 3.11.0-next-20130906+ #4
[  480.854439] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  480.854442] mdadm           D 0000000000000000  5024  3257   3209 0x00000080
[  480.854445]  ffff880138c37b18 0000000000000046 00000000ffffffff ffff880037d3b120
[  480.854447]  ffff88013a038720 ffff88013a038000 0000000000013500 ffff880138c37fd8
[  480.854449]  ffff880138c36010 0000000000013500 0000000000013500 ffff880138c37fd8
[  480.854449] Call Trace:
[  480.854452]  [<ffffffff81745fa4>] schedule+0x24/0x70
[  480.854453]  [<ffffffff81742725>] schedule_timeout+0x175/0x200
[  480.854455]  [<ffffffff810a6d30>] ? mark_held_locks+0x80/0x130
[  480.854457]  [<ffffffff81747fcb>] ? _raw_spin_unlock_irq+0x2b/0x40
[  480.854459]  [<ffffffff810a709d>] ? trace_hardirqs_on_caller+0xfd/0x1c0
[  480.854461]  [<ffffffff810a716d>] ? trace_hardirqs_on+0xd/0x10
[  480.854463]  [<ffffffff81746600>] wait_for_completion+0xa0/0x110
[  480.854465]  [<ffffffff8107d160>] ? try_to_wake_up+0x300/0x300
[  480.854467]  [<ffffffff8106ddbc>] kthread_stop+0x4c/0xe0
[  480.854468]  [<ffffffff81536f5e>] md_unregister_thread+0x4e/0x90
[  480.854470]  [<ffffffff8153965d>] md_reap_sync_thread+0x1d/0x140
[  480.854472]  [<ffffffff815397ab>] __md_stop_writes+0x2b/0x80
[  480.854473]  [<ffffffff8153c911>] do_md_stop+0x91/0x4d0
[  480.854475]  [<ffffffff8153e8a7>] ? md_ioctl+0xf7/0x15c0
[  480.854477]  [<ffffffff810a716d>] ? trace_hardirqs_on+0xd/0x10
[  480.854479]  [<ffffffff8153f6a9>] md_ioctl+0xef9/0x15c0
[  480.854481]  [<ffffffff8113145d>] ? handle_mm_fault+0x17d/0x920

Signed-off-by: Bian Yu <bianyu@kedacom.com>
---
 drivers/md/md.c    |    2 ++
 drivers/md/md.h    |    4 ++++
 drivers/md/raid5.c |    3 ++-
 3 files changed, 8 insertions(+), 1 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index adf4d7e..54ef71f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5278,6 +5278,7 @@ static void md_clean(struct mddev *mddev)
 static void __md_stop_writes(struct mddev *mddev)
 {
 	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+	set_bit(MD_STOPPING_WRITES, &mddev->flags);
 	if (mddev->sync_thread) {
 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 		md_reap_sync_thread(mddev);
@@ -5294,6 +5295,7 @@ static void __md_stop_writes(struct mddev *mddev)
 		mddev->in_sync = 1;
 		md_update_sb(mddev, 1);
 	}
+	clear_bit(MD_STOPPING_WRITES, &mddev->flags);
 }
 
 void md_stop_writes(struct mddev *mddev)
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 608050c..a24ae1d 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -214,6 +214,10 @@ struct mddev {
 #define MD_STILL_CLOSED	4	/* If set, then array has not been opened since
 				 * md_ioctl checked on it.
 				 */
+#define MD_STOPPING_WRITES 5	/* If set, raid5 shouldn't set unacknowledged
+				 * badblock blocked in analyse_stripe to avoid
+				 * infinite loop.
+				 */
 
 	int				suspended;
 	atomic_t			active_io;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f9972e2..ff1aecf 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3446,7 +3446,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 		if (rdev) {
 			is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
 					     &first_bad, &bad_sectors);
-			if (s->blocked_rdev == NULL
+			if (!test_bit(MD_STOPPING_WRITES, &conf->mddev->flags)
+			    && s->blocked_rdev == NULL
 			    && (test_bit(Blocked, &rdev->flags)
 				|| is_bad < 0)) {
 				if (is_bad < 0)
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH v2] md/raid5: avoid deadlock when raid5 array has unack badblocks during md_stop_writes.
  2013-09-02  5:09 [PATCH v2] md/raid5: avoid deadlock when raid5 array has unack badblocks during md_stop_writes ycbzzjlby
@ 2013-09-12  4:44 ` NeilBrown
  2013-09-12  6:36   ` ycbzzjlby
  0 siblings, 1 reply; 3+ messages in thread
From: NeilBrown @ 2013-09-12  4:44 UTC (permalink / raw)
  To: ycbzzjlby; +Cc: linux-raid, linux-kernel, Bian Yu

[-- Attachment #1: Type: text/plain, Size: 8823 bytes --]

On Mon,  2 Sep 2013 01:09:55 -0400 ycbzzjlby@gmail.com wrote:

> From: Bian Yu <bianyu@kedacom.com>
> 
> When raid5 hit a fresh badblock, this badblock will flagged as unack
> badblock until md_update_sb is called.
> But md_stop/reboot/md_set_readonly will avoid raid5d call md_update_sb
> in md_check_recovery, the badblock will always be unack, so raid5d
> thread enter a infinite loop and never can unregister sync_thread
> that cause deadlock.
> 
> To solve this, before md_stop_writes call md_unregister_thread, set
> MD_STOPPING_WRITES on mddev->flags. In raid5.c analyse_stripe judge
> MD_STOPPING_WRITES bit on mddev->flags, if setted don't block rdev
> to wait md_update_sb. so raid5d thread can be stopped.
> 
> I can reproduce it by using follow way:
> When raid5 array is recovering and hit a fresh badblock, then shutdown array at once.
> 
> [  480.850203]       Not tainted 3.11.0-next-20130906+ #4
> [  480.852344] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [  480.854380]  [<ffffffff8153b324>] md_do_sync+0x7e4/0xe60
> [  480.854386]  [<ffffffff81747fcb>] ? _raw_spin_unlock_irq+0x2b/0x40
> [  480.854395]  [<ffffffff81536fa0>] ? md_unregister_thread+0x90/0x90
> [  480.854400]  [<ffffffff810a716d>] ? trace_hardirqs_on+0xd/0x10
> [  480.854405]  [<ffffffff815370bf>] md_thread+0x11f/0x170
> [  480.854410]  [<ffffffff81536fa0>] ? md_unregister_thread+0x90/0x90
> [  480.854415]  [<ffffffff8106d956>] kthread+0xd6/0xe0
> [  480.854423]  [<ffffffff8106d880>] ? __init_kthread_worker+0x70/0x70
> [  480.854428]  [<ffffffff8174ff2c>] ret_from_fork+0x7c/0xb0
> [  480.854432]  [<ffffffff8106d880>] ? __init_kthread_worker+0x70/0x70
> [  480.854435] no locks held by md0_resync/3246.
> [  480.854437] INFO: task mdadm:3257 blocked for more than 120 seconds.
> [  480.854438]       Not tainted 3.11.0-next-20130906+ #4
> [  480.854439] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [  480.854442] mdadm           D 0000000000000000  5024  3257   3209 0x00000080
> [  480.854445]  ffff880138c37b18 0000000000000046 00000000ffffffff ffff880037d3b120
> [  480.854447]  ffff88013a038720 ffff88013a038000 0000000000013500 ffff880138c37fd8
> [  480.854449]  ffff880138c36010 0000000000013500 0000000000013500 ffff880138c37fd8
> [  480.854449] Call Trace:
> [  480.854452]  [<ffffffff81745fa4>] schedule+0x24/0x70
> [  480.854453]  [<ffffffff81742725>] schedule_timeout+0x175/0x200
> [  480.854455]  [<ffffffff810a6d30>] ? mark_held_locks+0x80/0x130
> [  480.854457]  [<ffffffff81747fcb>] ? _raw_spin_unlock_irq+0x2b/0x40
> [  480.854459]  [<ffffffff810a709d>] ? trace_hardirqs_on_caller+0xfd/0x1c0
> [  480.854461]  [<ffffffff810a716d>] ? trace_hardirqs_on+0xd/0x10
> [  480.854463]  [<ffffffff81746600>] wait_for_completion+0xa0/0x110
> [  480.854465]  [<ffffffff8107d160>] ? try_to_wake_up+0x300/0x300
> [  480.854467]  [<ffffffff8106ddbc>] kthread_stop+0x4c/0xe0
> [  480.854468]  [<ffffffff81536f5e>] md_unregister_thread+0x4e/0x90
> [  480.854470]  [<ffffffff8153965d>] md_reap_sync_thread+0x1d/0x140
> [  480.854472]  [<ffffffff815397ab>] __md_stop_writes+0x2b/0x80
> [  480.854473]  [<ffffffff8153c911>] do_md_stop+0x91/0x4d0
> [  480.854475]  [<ffffffff8153e8a7>] ? md_ioctl+0xf7/0x15c0
> [  480.854477]  [<ffffffff810a716d>] ? trace_hardirqs_on+0xd/0x10
> [  480.854479]  [<ffffffff8153f6a9>] md_ioctl+0xef9/0x15c0
> [  480.854481]  [<ffffffff8113145d>] ? handle_mm_fault+0x17d/0x920
> 
> Signed-off-by: Bian Yu <bianyu@kedacom.com>
> ---
>  drivers/md/md.c    |    2 ++
>  drivers/md/md.h    |    4 ++++
>  drivers/md/raid5.c |    3 ++-
>  3 files changed, 8 insertions(+), 1 deletions(-)
> 
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index adf4d7e..54ef71f 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -5278,6 +5278,7 @@ static void md_clean(struct mddev *mddev)
>  static void __md_stop_writes(struct mddev *mddev)
>  {
>  	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
> +	set_bit(MD_STOPPING_WRITES, &mddev->flags);
>  	if (mddev->sync_thread) {
>  		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
>  		md_reap_sync_thread(mddev);
> @@ -5294,6 +5295,7 @@ static void __md_stop_writes(struct mddev *mddev)
>  		mddev->in_sync = 1;
>  		md_update_sb(mddev, 1);
>  	}
> +	clear_bit(MD_STOPPING_WRITES, &mddev->flags);
>  }
>  
>  void md_stop_writes(struct mddev *mddev)
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index 608050c..a24ae1d 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -214,6 +214,10 @@ struct mddev {
>  #define MD_STILL_CLOSED	4	/* If set, then array has not been opened since
>  				 * md_ioctl checked on it.
>  				 */
> +#define MD_STOPPING_WRITES 5	/* If set, raid5 shouldn't set unacknowledged
> +				 * badblock blocked in analyse_stripe to avoid
> +				 * infinite loop.
> +				 */
>  
>  	int				suspended;
>  	atomic_t			active_io;
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index f9972e2..ff1aecf 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -3446,7 +3446,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
>  		if (rdev) {
>  			is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
>  					     &first_bad, &bad_sectors);
> -			if (s->blocked_rdev == NULL
> +			if (!test_bit(MD_STOPPING_WRITES, &conf->mddev->flags)
> +			    && s->blocked_rdev == NULL
>  			    && (test_bit(Blocked, &rdev->flags)
>  				|| is_bad < 0)) {
>  				if (is_bad < 0)


Thanks for including the extra details in the patch, but it wasn't only that
I didn't think it should be needed (I was wrong there).  I also don't like
the patch.  It isn't at all clear to me that it will do the right thing.
There is a reason that we block until the bad block list has been updated,
and just not blocking because it is inconvenient doesn't seem right.

I would rather change the sequencing so that the badblock list can be updated
at this point.
Something like that patch below, but I'm not 100% sure of it yet and I'm
about to go on leave so I'm not sure I'll be able to commit to it for a while.

If you could review and test I would appreciate it.

Thanks,
NeilBrown

diff --git a/drivers/md/md.c b/drivers/md/md.c
index adf4d7e..e4d78c0 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3170,6 +3170,9 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
 		return -EACCES;
 	rv = mddev ? mddev_lock(mddev): -EBUSY;
 	if (!rv) {
+		mutex_lock(&mddev->open_mutex);
+		clear_bit(MD_STILL_CLOSED, &mddev->flags);
+		mutex_unlock(&mddev->open_mutex);
 		if (rdev->mddev == NULL)
 			rv = -EBUSY;
 		else
@@ -4764,6 +4767,9 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
 		flush_workqueue(md_misc_wq);
 	rv = mddev_lock(mddev);
 	if (!rv) {
+		mutex_lock(&mddev->open_mutex);
+		clear_bit(MD_STILL_CLOSED, &mddev->flags);
+		mutex_unlock(&mddev->open_mutex);
 		rv = entry->store(mddev, page, length);
 		mddev_unlock(mddev);
 	}
@@ -5279,8 +5285,10 @@ static void __md_stop_writes(struct mddev *mddev)
 {
 	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 	if (mddev->sync_thread) {
+		mddev_unlock(&mddev);
 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 		md_reap_sync_thread(mddev);
+		mddev_lock(&mddev);
 	}
 
 	del_timer_sync(&mddev->safemode_timer);
@@ -5337,7 +5345,9 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
 		err = -EBUSY;
 		goto out;
 	}
-	if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) {
+	if (mddev->pers)
+		__md_stop_writes(mddev);
+	if (!test_bit(MD_STILL_CLOSED, &mddev->flags)) {
 		/* Someone opened the device since we flushed it
 		 * so page cache could be dirty and it is too late
 		 * to flush.  So abort
@@ -5346,8 +5356,6 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
 		return -EBUSY;
 	}
 	if (mddev->pers) {
-		__md_stop_writes(mddev);
-
 		err  = -ENXIO;
 		if (mddev->ro==1)
 			goto out;
@@ -5379,7 +5387,9 @@ static int do_md_stop(struct mddev * mddev, int mode,
 		mutex_unlock(&mddev->open_mutex);
 		return -EBUSY;
 	}
-	if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) {
+	if (mddev->pers)
+		__md_stop_writes(mddev);
+	if (!test_bit(MD_STILL_CLOSED, &mddev->flags)) {
 		/* Someone opened the device since we flushed it
 		 * so page cache could be dirty and it is too late
 		 * to flush.  So abort
@@ -5391,7 +5401,6 @@ static int do_md_stop(struct mddev * mddev, int mode,
 		if (mddev->ro)
 			set_disk_ro(disk, 0);
 
-		__md_stop_writes(mddev);
 		__md_stop(mddev);
 		mddev->queue->merge_bvec_fn = NULL;
 		mddev->queue->backing_dev_info.congested_fn = NULL;

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 828 bytes --]

^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: Re: [PATCH v2] md/raid5: avoid deadlock when raid5 array has unack badblocks during md_stop_writes.
  2013-09-12  4:44 ` NeilBrown
@ 2013-09-12  6:36   ` ycbzzjlby
  0 siblings, 0 replies; 3+ messages in thread
From: ycbzzjlby @ 2013-09-12  6:36 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid, Bian Yu

Hi,
  Thank you. I tried your patch, but I got a deadlock.
  I reproduced it with a script and some bad sectors on the hard disks:
	#!/bin/sh
	mdadm -CR /dev/md0 -b none -c 64 -l5 -n4 /dev/sd[cbde]
	usleep 1800000
	mdadm -S /dev/md0

<4>[   74.835547] ======================================================
<4>[   74.835548] [ INFO: possible circular locking dependency detected ]
<4>[   74.835550] 3.11.0-next-20130906+ #6 Not tainted
<4>[   74.835551] -------------------------------------------------------
<4>[   74.835552] mdadm/3085 is trying to acquire lock:
<4>[   74.835560]  (&mddev->reconfig_mutex){+.+.+.}, at: [<ffffffff81539751>] __md_stop_writes+0x41/0xa0
<4>[   74.835561] 
<4>[   74.835561] but task is already holding lock:
<4>[   74.835565]  (&mddev->open_mutex){+.+.+.}, at: [<ffffffff8153c6e7>] do_md_stop+0x37/0x4d0
<4>[   74.835565] 
<4>[   74.835565] which lock already depends on the new lock.
<4>[   74.835565] 
<4>[   74.835566] 
<4>[   74.835566] the existing dependency chain (in reverse order) is:
<4>[   74.835569] 
<4>[   74.835569] -> #1 (&mddev->open_mutex){+.+.+.}:
<4>[   74.835574]        [<ffffffff810a8cc7>] lock_acquire+0x97/0x110
<4>[   74.835578]        [<ffffffff81743377>] mutex_lock_nested+0x47/0x3a0
<4>[   74.835580]        [<ffffffff8153c6e7>] do_md_stop+0x37/0x4d0
<4>[   74.835583]        [<ffffffff8153f6f9>] md_ioctl+0xef9/0x15c0
<4>[   74.835586]        [<ffffffff812eeda3>] __blkdev_driver_ioctl+0x23/0x30
<4>[   74.835589]        [<ffffffff812ef420>] blkdev_ioctl+0x200/0x7b0
<4>[   74.835592]        [<ffffffff81195b37>] block_ioctl+0x37/0x40
<4>[   74.835597]        [<ffffffff8116f8d1>] do_vfs_ioctl+0x91/0x550
<4>[   74.835599]        [<ffffffff8116fe21>] SyS_ioctl+0x91/0xa0
<4>[   74.835602]        [<ffffffff81750012>] system_call_fastpath+0x16/0x1b
<4>[   74.835604] 
<4>[   74.835604] -> #0 (&mddev->reconfig_mutex){+.+.+.}:
<4>[   74.835607]        [<ffffffff810a8b30>] __lock_acquire+0x1740/0x1840
<4>[   74.835609]        [<ffffffff810a8cc7>] lock_acquire+0x97/0x110
<4>[   74.835612]        [<ffffffff81743e13>] mutex_lock_interruptible_nested+0x53/0x410
<4>[   74.835614]        [<ffffffff81539751>] __md_stop_writes+0x41/0xa0
<4>[   74.835626]        [<ffffffff8153c71d>] do_md_stop+0x6d/0x4d0
<4>[   74.835627]        [<ffffffff8153f6f9>] md_ioctl+0xef9/0x15c0
<4>[   74.835629]        [<ffffffff812eeda3>] __blkdev_driver_ioctl+0x23/0x30
<4>[   74.835631]        [<ffffffff812ef420>] blkdev_ioctl+0x200/0x7b0
<4>[   74.835633]        [<ffffffff81195b37>] block_ioctl+0x37/0x40
<4>[   74.835634]        [<ffffffff8116f8d1>] do_vfs_ioctl+0x91/0x550
<4>[   74.835636]        [<ffffffff8116fe21>] SyS_ioctl+0x91/0xa0
<4>[   74.835638]        [<ffffffff81750012>] system_call_fastpath+0x16/0x1b
<4>[   74.835638] 
<4>[   74.835638] other info that might help us debug this:
<4>[   74.835638] 
<4>[   74.835639]  Possible unsafe locking scenario:
<4>[   74.835639] 
<4>[   74.835639]        CPU0                    CPU1
<4>[   74.835640]        ----                    ----
<4>[   74.835641]   lock(&mddev->open_mutex);
<4>[   74.835642]                                lock(&mddev->reconfig_mutex);
<4>[   74.835644]                                lock(&mddev->open_mutex);
<4>[   74.835645]   lock(&mddev->reconfig_mutex);
<4>[   74.835645] 
<4>[   74.835645]  *** DEADLOCK ***
<4>[   74.835645] 
<4>[   74.835646] 1 lock held by mdadm/3085:
<4>[   74.835649]  #0:  (&mddev->open_mutex){+.+.+.}, at: [<ffffffff8153c6e7>] do_md_stop+0x37/0x4d0
<4>[   74.835650] 
<4>[   74.835650] stack backtrace:
<4>[   74.835652] CPU: 1 PID: 3085 Comm: mdadm Not tainted 3.11.0-next-20130906+ #6
<4>[   74.835653] Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./To be filled by O.E.M., BIOS 080015  02/27/2012
<4>[   74.835655]  ffffffff820ed440 ffff88013917ba58 ffffffff817425e6 0000000000000003
<4>[   74.835657]  ffffffff820ed5f0 ffff88013917baa8 ffffffff810a61d6 ffff88013917ba98
<4>[   74.835659]  ffff88013917bb08 ffff8800b7e4a0c0 ffff8800b7e4a7b8 ffff8800b7e4a0c0
<4>[   74.835659] Call Trace:
<4>[   74.835661]  [<ffffffff817425e6>] dump_stack+0x49/0x5b
<4>[   74.835663]  [<ffffffff810a61d6>] print_circular_bug+0x216/0x310
<4>[   74.835665]  [<ffffffff810a8b30>] __lock_acquire+0x1740/0x1840
<4>[   74.835668]  [<ffffffff8174806b>] ? _raw_spin_unlock_irqrestore+0x3b/0x70
<4>[   74.835669]  [<ffffffff81539751>] ? __md_stop_writes+0x41/0xa0
<4>[   74.835671]  [<ffffffff810a8cc7>] lock_acquire+0x97/0x110
<4>[   74.835673]  [<ffffffff81539751>] ? __md_stop_writes+0x41/0xa0
<4>[   74.835674]  [<ffffffff8174806b>] ? _raw_spin_unlock_irqrestore+0x3b/0x70
<4>[   74.835676]  [<ffffffff81743e13>] mutex_lock_interruptible_nested+0x53/0x410
<4>[   74.835678]  [<ffffffff81539751>] ? __md_stop_writes+0x41/0xa0
<4>[   74.835679]  [<ffffffff81539751>] ? __md_stop_writes+0x41/0xa0
<4>[   74.835681]  [<ffffffff81539751>] __md_stop_writes+0x41/0xa0
<4>[   74.835682]  [<ffffffff8153c71d>] do_md_stop+0x6d/0x4d0
<4>[   74.835684]  [<ffffffff8153e8f7>] ? md_ioctl+0xf7/0x15c0
<4>[   74.835686]  [<ffffffff8110a693>] ? filemap_fdatawait+0x23/0x30
<4>[   74.835688]  [<ffffffff8110aa6c>] ? filemap_write_and_wait+0x6c/0x80
<4>[   74.835689]  [<ffffffff8153f6f9>] md_ioctl+0xef9/0x15c0
<4>[   74.835691]  [<ffffffff8174806b>] ? _raw_spin_unlock_irqrestore+0x3b/0x70
<4>[   74.835693]  [<ffffffff810a709d>] ? trace_hardirqs_on_caller+0xfd/0x1c0
<4>[   74.835695]  [<ffffffff810a716d>] ? trace_hardirqs_on+0xd/0x10
<4>[   74.835696]  [<ffffffff812eeda3>] __blkdev_driver_ioctl+0x23/0x30
<4>[   74.835698]  [<ffffffff812ef420>] blkdev_ioctl+0x200/0x7b0
<4>[   74.835700]  [<ffffffff81195b37>] block_ioctl+0x37/0x40
<4>[   74.835702]  [<ffffffff8116f8d1>] do_vfs_ioctl+0x91/0x550
<4>[   74.835704]  [<ffffffff81070bb0>] ? update_rmtp+0x80/0x80
<4>[   74.835706]  [<ffffffff81750037>] ? sysret_check+0x1b/0x56
<4>[   74.835708]  [<ffffffff8116fe21>] SyS_ioctl+0x91/0xa0
<4>[   74.835710]  [<ffffffff8130db2e>] ? trace_hardirqs_on_thunk+0x3a/0x3f
<4>[   74.835712]  [<ffffffff81750012>] system_call_fastpath+0x16/0x1b
<7>[   74.835716] interrupting MD-thread pid 3076
<6>[   74.937009] md: export_rdev(sdb)


>On Mon,  2 Sep 2013 01:09:55 -0400 ycbzzjlby@gmail.com wrote:
>
>> From: Bian Yu <bianyu@kedacom.com>
>> 
>> When raid5 hit a fresh badblock, this badblock will flagged as unack
>> badblock until md_update_sb is called.
>> But md_stop/reboot/md_set_readonly will avoid raid5d call md_update_sb
>> in md_check_recovery, the badblock will always be unack, so raid5d
>> thread enter a infinite loop and never can unregister sync_thread
>> that cause deadlock.
>> 
>> To solve this, before md_stop_writes call md_unregister_thread, set
>> MD_STOPPING_WRITES on mddev->flags. In raid5.c analyse_stripe judge
>> MD_STOPPING_WRITES bit on mddev->flags, if setted don't block rdev
>> to wait md_update_sb. so raid5d thread can be stopped.
>> 
>> I can reproduce it by using follow way:
>> When raid5 array is recovering and hit a fresh badblock, then shutdown array at once.
>> 
>> [  480.850203]       Not tainted 3.11.0-next-20130906+ #4
>> [  480.852344] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>> [  480.854380]  [<ffffffff8153b324>] md_do_sync+0x7e4/0xe60
>> [  480.854386]  [<ffffffff81747fcb>] ? _raw_spin_unlock_irq+0x2b/0x40
>> [  480.854395]  [<ffffffff81536fa0>] ? md_unregister_thread+0x90/0x90
>> [  480.854400]  [<ffffffff810a716d>] ? trace_hardirqs_on+0xd/0x10
>> [  480.854405]  [<ffffffff815370bf>] md_thread+0x11f/0x170
>> [  480.854410]  [<ffffffff81536fa0>] ? md_unregister_thread+0x90/0x90
>> [  480.854415]  [<ffffffff8106d956>] kthread+0xd6/0xe0
>> [  480.854423]  [<ffffffff8106d880>] ? __init_kthread_worker+0x70/0x70
>> [  480.854428]  [<ffffffff8174ff2c>] ret_from_fork+0x7c/0xb0
>> [  480.854432]  [<ffffffff8106d880>] ? __init_kthread_worker+0x70/0x70
>> [  480.854435] no locks held by md0_resync/3246.
>> [  480.854437] INFO: task mdadm:3257 blocked for more than 120 seconds.
>> [  480.854438]       Not tainted 3.11.0-next-20130906+ #4
>> [  480.854439] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>> [  480.854442] mdadm           D 0000000000000000  5024  3257   3209 0x00000080
>> [  480.854445]  ffff880138c37b18 0000000000000046 00000000ffffffff ffff880037d3b120
>> [  480.854447]  ffff88013a038720 ffff88013a038000 0000000000013500 ffff880138c37fd8
>> [  480.854449]  ffff880138c36010 0000000000013500 0000000000013500 ffff880138c37fd8
>> [  480.854449] Call Trace:
>> [  480.854452]  [<ffffffff81745fa4>] schedule+0x24/0x70
>> [  480.854453]  [<ffffffff81742725>] schedule_timeout+0x175/0x200
>> [  480.854455]  [<ffffffff810a6d30>] ? mark_held_locks+0x80/0x130
>> [  480.854457]  [<ffffffff81747fcb>] ? _raw_spin_unlock_irq+0x2b/0x40
>> [  480.854459]  [<ffffffff810a709d>] ? trace_hardirqs_on_caller+0xfd/0x1c0
>> [  480.854461]  [<ffffffff810a716d>] ? trace_hardirqs_on+0xd/0x10
>> [  480.854463]  [<ffffffff81746600>] wait_for_completion+0xa0/0x110
>> [  480.854465]  [<ffffffff8107d160>] ? try_to_wake_up+0x300/0x300
>> [  480.854467]  [<ffffffff8106ddbc>] kthread_stop+0x4c/0xe0
>> [  480.854468]  [<ffffffff81536f5e>] md_unregister_thread+0x4e/0x90
>> [  480.854470]  [<ffffffff8153965d>] md_reap_sync_thread+0x1d/0x140
>> [  480.854472]  [<ffffffff815397ab>] __md_stop_writes+0x2b/0x80
>> [  480.854473]  [<ffffffff8153c911>] do_md_stop+0x91/0x4d0
>> [  480.854475]  [<ffffffff8153e8a7>] ? md_ioctl+0xf7/0x15c0
>> [  480.854477]  [<ffffffff810a716d>] ? trace_hardirqs_on+0xd/0x10
>> [  480.854479]  [<ffffffff8153f6a9>] md_ioctl+0xef9/0x15c0
>> [  480.854481]  [<ffffffff8113145d>] ? handle_mm_fault+0x17d/0x920
>> 
>> Signed-off-by: Bian Yu <bianyu@kedacom.com>
>> ---
>>  drivers/md/md.c    |    2 ++
>>  drivers/md/md.h    |    4 ++++
>>  drivers/md/raid5.c |    3 ++-
>>  3 files changed, 8 insertions(+), 1 deletions(-)
>> 
>> diff --git a/drivers/md/md.c b/drivers/md/md.c
>> index adf4d7e..54ef71f 100644
>> --- a/drivers/md/md.c
>> +++ b/drivers/md/md.c
>> @@ -5278,6 +5278,7 @@ static void md_clean(struct mddev *mddev)
>>  static void __md_stop_writes(struct mddev *mddev)
>>  {
>>  	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
>> +	set_bit(MD_STOPPING_WRITES, &mddev->flags);
>>  	if (mddev->sync_thread) {
>>  		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
>>  		md_reap_sync_thread(mddev);
>> @@ -5294,6 +5295,7 @@ static void __md_stop_writes(struct mddev *mddev)
>>  		mddev->in_sync = 1;
>>  		md_update_sb(mddev, 1);
>>  	}
>> +	clear_bit(MD_STOPPING_WRITES, &mddev->flags);
>>  }
>>  
>>  void md_stop_writes(struct mddev *mddev)
>> diff --git a/drivers/md/md.h b/drivers/md/md.h
>> index 608050c..a24ae1d 100644
>> --- a/drivers/md/md.h
>> +++ b/drivers/md/md.h
>> @@ -214,6 +214,10 @@ struct mddev {
>>  #define MD_STILL_CLOSED	4	/* If set, then array has not been opened since
>>  				 * md_ioctl checked on it.
>>  				 */
>> +#define MD_STOPPING_WRITES 5	/* If set, raid5 shouldn't set unacknowledged
>> +				 * badblock blocked in analyse_stripe to avoid
>> +				 * infinite loop.
>> +				 */
>>  
>>  	int				suspended;
>>  	atomic_t			active_io;
>> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>> index f9972e2..ff1aecf 100644
>> --- a/drivers/md/raid5.c
>> +++ b/drivers/md/raid5.c
>> @@ -3446,7 +3446,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
>>  		if (rdev) {
>>  			is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
>>  					     &first_bad, &bad_sectors);
>> -			if (s->blocked_rdev == NULL
>> +			if (!test_bit(MD_STOPPING_WRITES, &conf->mddev->flags)
>> +			    && s->blocked_rdev == NULL
>>  			    && (test_bit(Blocked, &rdev->flags)
>>  				|| is_bad < 0)) {
>>  				if (is_bad < 0)
>
>
>Thanks for including the extra details in the patch, but it wasn't only that
>I didn't think it should be needed (I was wrong there).  I also don't like
>the patch.  It isn't at all clear to me that it will do the right thing.
>There is a reason that we block until the bad block lists has been updated,
>and to just not block because it is inconvenient just doesn't seem right.
>
>I would rather change the sequencing so that the badblock list can be updated
>at this point.
>Something like that patch below, but I'm not 100% sure of it yet and I'm
>about to go on leave so I'm not sure I'll be able to commit to it for a while.
>
>If you could review and test I would appreciate it.
>
>Thanks,
>NeilBrown
>
>diff --git a/drivers/md/md.c b/drivers/md/md.c
>index adf4d7e..e4d78c0 100644
>--- a/drivers/md/md.c
>+++ b/drivers/md/md.c
>@@ -3170,6 +3170,9 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
> 		return -EACCES;
> 	rv = mddev ? mddev_lock(mddev): -EBUSY;
> 	if (!rv) {
>+		mutex_lock(&mddev->open_mutex);
>+		clear_bit(MD_STILL_CLOSED, &mddev->flags);
>+		mutex_unlock(&mddev->open_mutex);
> 		if (rdev->mddev == NULL)
> 			rv = -EBUSY;
> 		else
>@@ -4764,6 +4767,9 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
> 		flush_workqueue(md_misc_wq);
> 	rv = mddev_lock(mddev);
> 	if (!rv) {
>+		mutex_lock(&mddev->open_mutex);
>+		clear_bit(MD_STILL_CLOSED, &mddev->flags);
>+		mutex_unlock(&mddev->open_mutex);
> 		rv = entry->store(mddev, page, length);
> 		mddev_unlock(mddev);
> 	}
>@@ -5279,8 +5285,10 @@ static void __md_stop_writes(struct mddev *mddev)
> {
> 	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
> 	if (mddev->sync_thread) {
>+		mddev_unlock(&mddev);
> 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
> 		md_reap_sync_thread(mddev);
>+		mddev_lock(&mddev);
> 	}
> 
> 	del_timer_sync(&mddev->safemode_timer);
>@@ -5337,7 +5345,9 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
> 		err = -EBUSY;
> 		goto out;
> 	}
>-	if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) {
>+	if (mddev->pers)
>+		__md_stop_writes(mddev);
>+	if (!test_bit(MD_STILL_CLOSED, &mddev->flags)) {
> 		/* Someone opened the device since we flushed it
> 		 * so page cache could be dirty and it is too late
> 		 * to flush.  So abort
>@@ -5346,8 +5356,6 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
> 		return -EBUSY;
> 	}
> 	if (mddev->pers) {
>-		__md_stop_writes(mddev);
>-
> 		err  = -ENXIO;
> 		if (mddev->ro==1)
> 			goto out;
>@@ -5379,7 +5387,9 @@ static int do_md_stop(struct mddev * mddev, int mode,
> 		mutex_unlock(&mddev->open_mutex);
> 		return -EBUSY;
> 	}
>-	if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) {
>+	if (mddev->pers)
>+		__md_stop_writes(mddev);
>+	if (!test_bit(MD_STILL_CLOSED, &mddev->flags)) {
> 		/* Someone opened the device since we flushed it
> 		 * so page cache could be dirty and it is too late
> 		 * to flush.  So abort
>@@ -5391,7 +5401,6 @@ static int do_md_stop(struct mddev * mddev, int mode,
> 		if (mddev->ro)
> 			set_disk_ro(disk, 0);
> 
>-		__md_stop_writes(mddev);
> 		__md_stop(mddev);
> 		mddev->queue->merge_bvec_fn = NULL;
> 		mddev->queue->backing_dev_info.congested_fn = NULL;
>

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2013-09-12  6:36 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2013-09-02  5:09 [PATCH v2] md/raid5: avoid deadlock when raid5 array has unack badblocks during md_stop_writes ycbzzjlby
2013-09-12  4:44 ` NeilBrown
2013-09-12  6:36   ` ycbzzjlby

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).