From: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
To: Shaohua Li <shli@kernel.org>, NeilBrown <neilb@suse.com>
Cc: Dan Williams <dan.j.williams@intel.com>,
linux-raid <linux-raid@vger.kernel.org>
Subject: Re: raid5d hangs when stopping an array during reshape
Date: Thu, 25 Feb 2016 17:05:17 +0100
Message-ID: <56CF263D.9020803@intel.com>
In-Reply-To: <20160225011710.GA27642@kernel.org>
On 02/25/2016 02:17 AM, Shaohua Li wrote:
> On Thu, Feb 25, 2016 at 11:31:04AM +1100, Neil Brown wrote:
>> On Thu, Feb 25 2016, Shaohua Li wrote:
>>
>>>
>>> As for the bug: write requests run in raid5d, and mddev_suspend() waits for
>>> all IO, which in turn waits for those write requests. So this is a clear
>>> deadlock. I think we should delete the check_reshape() call in
>>> md_check_recovery(). If we change layout/disks/chunk_size, check_reshape()
>>> is already called. If we start an array, .run() already handles the new
>>> layout. There is no point in md_check_recovery() calling check_reshape()
>>> again.
>>
>> Are you sure?
>> Did you look at the commit which added that code?
>> commit b4c4c7b8095298ff4ce20b40bf180ada070812d0
>>
>> When there is an IO error, reshape (or resync or recovery) will abort
>> and then possibly be automatically restarted.
>
> thanks for pointing this out.
>> Without the check here a reshape might be attempted on an array which
>> has failed. Not sure if that would be harmful, but it would certainly
>> be pointless.
>>
>> But you are right that this is causing the problem.
>> Maybe we should keep track of the size of the 'scribble' arrays and only
>> call resize_chunks if the size needs to change? Similar to what
>> resize_stripes does.
>
> yep, this was my first solution, but I later thought check_reshape() was
> useless here and apparently missed the restart case. I'll go this way.
My idea was to replace mddev_suspend()/mddev_resume() in resize_chunks()
with an rw lock that would prevent collisions with raid_run_ops(), since
scribble is used only there. But because the parity operations are executed
asynchronously, this also needs to wait until all the submitted operations
have completed. It seems a bit like overkill, but I came up with this:
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a086014..3b7bbec 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -55,6 +55,7 @@
#include <linux/ratelimit.h>
#include <linux/nodemask.h>
#include <linux/flex_array.h>
+#include <linux/delay.h>
#include <trace/events/block.h>
#include "md.h"
@@ -1267,6 +1268,8 @@ static void ops_complete_compute(void *stripe_head_ref)
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
+ atomic_dec(&sh->raid_conf->scribble_count);
+
/* mark the computed target(s) as uptodate */
mark_target_uptodate(sh, sh->ops.target);
mark_target_uptodate(sh, sh->ops.target2);
@@ -1314,6 +1317,9 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
pr_debug("%s: stripe %llu block: %d\n",
__func__, (unsigned long long)sh->sector, target);
+
+ atomic_inc(&sh->raid_conf->scribble_count);
+
BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
for (i = disks; i--; )
@@ -1399,6 +1405,8 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
pr_debug("%s: stripe %llu block: %d\n",
__func__, (unsigned long long)sh->sector, target);
+ atomic_inc(&sh->raid_conf->scribble_count);
+
tgt = &sh->dev[target];
BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
dest = tgt->page;
@@ -1449,6 +1457,9 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
BUG_ON(sh->batch_head);
pr_debug("%s: stripe %llu block1: %d block2: %d\n",
__func__, (unsigned long long)sh->sector, target, target2);
+
+ atomic_inc(&sh->raid_conf->scribble_count);
+
BUG_ON(target < 0 || target2 < 0);
BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
@@ -1545,6 +1556,8 @@ static void ops_complete_prexor(void *stripe_head_ref)
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
+
+ atomic_dec(&sh->raid_conf->scribble_count);
}
static struct dma_async_tx_descriptor *
@@ -1563,6 +1576,8 @@ ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
+ atomic_inc(&sh->raid_conf->scribble_count);
+
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
/* Only process blocks that are known to be uptodate */
@@ -1588,6 +1603,8 @@ ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
+ atomic_inc(&sh->raid_conf->scribble_count);
+
count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN);
init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
@@ -1672,6 +1689,8 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
+ atomic_dec(&sh->raid_conf->scribble_count);
+
for (i = disks; i--; ) {
fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
@@ -1722,6 +1741,8 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
+ atomic_inc(&sh->raid_conf->scribble_count);
+
for (i = 0; i < sh->disks; i++) {
if (pd_idx == i)
continue;
@@ -1804,6 +1825,8 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
+ atomic_inc(&sh->raid_conf->scribble_count);
+
for (i = 0; i < sh->disks; i++) {
if (sh->pd_idx == i || sh->qd_idx == i)
continue;
@@ -1857,6 +1880,8 @@ static void ops_complete_check(void *stripe_head_ref)
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
+ atomic_dec(&sh->raid_conf->scribble_count);
+
sh->check_state = check_state_check_result;
set_bit(STRIPE_HANDLE, &sh->state);
raid5_release_stripe(sh);
@@ -1877,6 +1902,8 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
+ atomic_inc(&sh->raid_conf->scribble_count);
+
BUG_ON(sh->batch_head);
count = 0;
xor_dest = sh->dev[pd_idx].page;
@@ -1906,6 +1933,8 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu
pr_debug("%s: stripe %llu checkp: %d\n", __func__,
(unsigned long long)sh->sector, checkp);
+ atomic_inc(&sh->raid_conf->scribble_count);
+
BUG_ON(sh->batch_head);
count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL);
if (!checkp)
@@ -1927,6 +1956,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
struct raid5_percpu *percpu;
unsigned long cpu;
+ down_read(&conf->scribble_lock);
cpu = get_cpu();
percpu = per_cpu_ptr(conf->percpu, cpu);
if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
@@ -1985,6 +2015,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
wake_up(&sh->raid_conf->wait_for_overlap);
}
put_cpu();
+ up_read(&conf->scribble_lock);
}
static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp)
@@ -2089,7 +2120,10 @@ static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
unsigned long cpu;
int err = 0;
- mddev_suspend(conf->mddev);
+ down_write(&conf->scribble_lock);
+ /* wait for async operations using scribble to complete */
+ while (atomic_read(&conf->scribble_count))
+ udelay(10);
get_online_cpus();
for_each_present_cpu(cpu) {
struct raid5_percpu *percpu;
@@ -2109,7 +2143,8 @@ static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
}
}
put_online_cpus();
- mddev_resume(conf->mddev);
+ up_write(&conf->scribble_lock);
+
return err;
}
@@ -6501,6 +6536,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
spin_lock_init(&conf->device_lock);
seqcount_init(&conf->gen_lock);
mutex_init(&conf->cache_size_mutex);
+ init_rwsem(&conf->scribble_lock);
+ atomic_set(&conf->scribble_count, 0);
init_waitqueue_head(&conf->wait_for_quiescent);
for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) {
init_waitqueue_head(&conf->wait_for_stripe[i]);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index a415e1c..8361156 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -3,6 +3,7 @@
#include <linux/raid/xor.h>
#include <linux/dmaengine.h>
+#include <linux/rwsem.h>
/*
*
@@ -494,6 +495,9 @@ struct r5conf {
struct kmem_cache *slab_cache; /* for allocating stripes */
struct mutex cache_size_mutex; /* Protect changes to cache size */
+ struct rw_semaphore scribble_lock;
+ atomic_t scribble_count;
+
int seq_flush, seq_write;
int quiesce;
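
To summarize the scheme above in a few lines (an illustrative sketch of the
pattern only, not the patch itself):

    /* Reader side: raid_run_ops() takes the lock shared, and every async
     * operation that borrows the per-cpu scribble buffer bumps a counter
     * that its completion callback drops again. */
    down_read(&conf->scribble_lock);
    atomic_inc(&conf->scribble_count);    /* in the ops_run_*() helpers */
    /* ... submit async parity op; its ops_complete_*() callback does: */
    atomic_dec(&conf->scribble_count);
    up_read(&conf->scribble_lock);

    /* Writer side: resize_chunks() excludes new submissions, then spins
     * until the operations that were already in flight have drained. */
    down_write(&conf->scribble_lock);
    while (atomic_read(&conf->scribble_count))
            udelay(10);
    /* ... now safe to reallocate the scribble arrays ... */
    up_write(&conf->scribble_lock);

The udelay() busy-wait is the part that feels like overkill to me; sleeping
on a waitqueue until scribble_count reaches zero would avoid the spin, at
the cost of a wake-up in the completion path.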