From: NeilBrown <neilb@suse.com>
To: Shaohua Li <shli@kernel.org>
Cc: linux-raid@vger.kernel.org
Subject: Re: [md PATCH 3/5] md: move suspend_hi/lo handling into core md code
Date: Wed, 18 Oct 2017 18:40:55 +1100 [thread overview]
Message-ID: <87bml4j43c.fsf@notabene.neil.brown.name> (raw)
In-Reply-To: <20171018061620.alka3npi6pyd76py@kernel.org>
[-- Attachment #1: Type: text/plain, Size: 6272 bytes --]
On Tue, Oct 17 2017, Shaohua Li wrote:
> On Tue, Oct 17, 2017 at 01:46:43PM +1100, Neil Brown wrote:
>> responding to ->suspend_lo and ->suspend_hi is similar
>> to responding to ->suspended. It is best to wait in
>> the common core code without incrementing ->active_io.
>> This allows mddev_suspend()/mddev_resume() to work while
>> requests are waiting for suspend_lo/hi to change.
>> This is will be important after a subsequent patch
>> which uses mddev_suspend() to synchronize updating for
>> suspend_lo/hi.
>>
>> So move the code for testing suspend_lo/hi out of raid1.c
>> and raid5.c, and place it in md.c
>>
>> Signed-off-by: NeilBrown <neilb@suse.com>
>> ---
>> drivers/md/md.c | 29 +++++++++++++++++++++++------
>> drivers/md/raid1.c | 12 ++++--------
>> drivers/md/raid5.c | 22 ----------------------
>> 3 files changed, 27 insertions(+), 36 deletions(-)
>>
>> diff --git a/drivers/md/md.c b/drivers/md/md.c
>> index 308456842d3e..f8d0e41120cb 100644
>> --- a/drivers/md/md.c
>> +++ b/drivers/md/md.c
>> @@ -266,16 +266,31 @@ static DEFINE_SPINLOCK(all_mddevs_lock);
>> * call has finished, the bio has been linked into some internal structure
>> * and so is visible to ->quiesce(), so we don't need the refcount any more.
>> */
>> +static bool is_suspended(struct mddev *mddev, struct bio *bio)
>> +{
>> + if (mddev->suspended)
>> + return true;
>> + if (bio_data_dir(bio) != WRITE)
>> + return false;
>> + if (mddev->suspend_lo >= mddev->suspend_hi)
>> + return false;
>> + if (bio->bi_iter.bi_sector >= mddev->suspend_hi)
>> + return false;
>> + if (bio_end_sector(bio) < mddev->suspend_lo)
>> + return false;
>> + return true;
>> +}
>> +
>> void md_handle_request(struct mddev *mddev, struct bio *bio)
>> {
>> check_suspended:
>> rcu_read_lock();
>> - if (mddev->suspended) {
>> + if (is_suspended(mddev, bio)) {
>> DEFINE_WAIT(__wait);
>> for (;;) {
>> prepare_to_wait(&mddev->sb_wait, &__wait,
>> TASK_UNINTERRUPTIBLE);
>> - if (!mddev->suspended)
>> + if (!is_suspended(mddev, bio))
>> break;
>> rcu_read_unlock();
>> schedule();
>> @@ -4843,10 +4858,11 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
>> goto unlock;
>> old = mddev->suspend_lo;
>> mddev->suspend_lo = new;
>> - if (new >= old)
>> + if (new >= old) {
>> /* Shrinking suspended region */
>> + wake_up(&mddev->sb_wait);
>> mddev->pers->quiesce(mddev, 2);
>
> Do we still need the quiesce(mddev, 2)? sounds not to me.
No, I don't think we do - thanks for noticing that;.
>
>> - else {
>> + } else {
>> /* Expanding suspended region - need to wait */
>> mddev->pers->quiesce(mddev, 1);
>> mddev->pers->quiesce(mddev, 0);
>> @@ -4886,10 +4902,11 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
>> goto unlock;
>> old = mddev->suspend_hi;
>> mddev->suspend_hi = new;
>> - if (new <= old)
>> + if (new <= old) {
>> /* Shrinking suspended region */
>> + wake_up(&mddev->sb_wait);
>> mddev->pers->quiesce(mddev, 2);
>> - else {
>> + } else {
>> /* Expanding suspended region - need to wait */
>> mddev->pers->quiesce(mddev, 1);
>> mddev->pers->quiesce(mddev, 0);
>> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
>> index f3f3e40dc9d8..277a145b3ff5 100644
>> --- a/drivers/md/raid1.c
>> +++ b/drivers/md/raid1.c
>> @@ -1310,11 +1310,9 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
>> */
>>
>>
>> - if ((bio_end_sector(bio) > mddev->suspend_lo &&
>> - bio->bi_iter.bi_sector < mddev->suspend_hi) ||
>> - (mddev_is_clustered(mddev) &&
>> + if (mddev_is_clustered(mddev) &&
>> md_cluster_ops->area_resyncing(mddev, WRITE,
>> - bio->bi_iter.bi_sector, bio_end_sector(bio)))) {
>> + bio->bi_iter.bi_sector, bio_end_sector(bio))) {
>>
>> /*
>> * As the suspend_* range is controlled by userspace, we want
>> @@ -1325,12 +1323,10 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
>> sigset_t full, old;
>> prepare_to_wait(&conf->wait_barrier,
>> &w, TASK_INTERRUPTIBLE);
>> - if (bio_end_sector(bio) <= mddev->suspend_lo ||
>> - bio->bi_iter.bi_sector >= mddev->suspend_hi ||
>> - (mddev_is_clustered(mddev) &&
>> + if (mddev_is_clustered(mddev) &&
>> !md_cluster_ops->area_resyncing(mddev, WRITE,
>> bio->bi_iter.bi_sector,
>> - bio_end_sector(bio))))
>> + bio_end_sector(bio)))
>> break;
>> sigfillset(&full);
>> sigprocmask(SIG_BLOCK, &full, &old);
>> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>> index 928e24a07133..4f40ccd21cbb 100644
>> --- a/drivers/md/raid5.c
>> +++ b/drivers/md/raid5.c
>> @@ -5682,28 +5682,6 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
>> goto retry;
>> }
>>
>> - if (rw == WRITE &&
>> - logical_sector >= mddev->suspend_lo &&
>> - logical_sector < mddev->suspend_hi) {
>> - raid5_release_stripe(sh);
>> - /* As the suspend_* range is controlled by
>> - * userspace, we want an interruptible
>> - * wait.
>> - */
>> - prepare_to_wait(&conf->wait_for_overlap,
>> - &w, TASK_INTERRUPTIBLE);
>> - if (logical_sector >= mddev->suspend_lo &&
>> - logical_sector < mddev->suspend_hi) {
>> - sigset_t full, old;
>> - sigfillset(&full);
>> - sigprocmask(SIG_BLOCK, &full, &old);
>> - schedule();
>> - sigprocmask(SIG_SETMASK, &old, NULL);
>> - do_prepare = true;
>> - }
>> - goto retry;
>> - }
>> -
>> if (test_bit(STRIPE_EXPANDING, &sh->state) ||
>> !add_stripe_bio(sh, bi, dd_idx, rw, previous)) {
>> /* Stripe is busy expanding or
>
> I think we can remove the state == 2 branch in raid5_quiesce().
>
Yes.
> This leaves only raid1 and md-cluster needs the quiesce(2) calls. In the
> future, we probably should abstract that md-cluster logic to separate API and
> let raid1 use it for waitting. The quiesce(2) is a strange name for the wait
> purpose.
Agreed. I'll try to find a nice solution.
Thanks,
NeilBrown
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]
next prev parent reply other threads:[~2017-10-18 7:40 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2017-10-17 2:46 [md PATCH 0/5] Address deadlock associated with setting suspend_lo NeilBrown
2017-10-17 2:46 ` [md PATCH 4/5] md: use mddev_suspend/resume instead of ->quiesce() NeilBrown
2017-10-17 2:46 ` [md PATCH 5/5] md: allow metadata update while suspending NeilBrown
2017-10-17 2:46 ` [md PATCH 3/5] md: move suspend_hi/lo handling into core md code NeilBrown
2017-10-18 6:16 ` Shaohua Li
2017-10-18 7:40 ` NeilBrown [this message]
2017-10-19 1:49 ` [md PATCH 6/5] md: remove special meaning of ->quiesce(.., 2) NeilBrown
2017-10-17 2:46 ` [md PATCH 2/5] md: don't call bitmap_create() while array is quiesced NeilBrown
2017-10-17 2:46 ` [md PATCH 1/5] md: always hold reconfig_mutex when calling mddev_suspend() NeilBrown
2017-10-18 6:11 ` Shaohua Li
2017-10-18 7:35 ` NeilBrown
2017-10-19 1:17 ` [md PATCH 1/5 v2] " NeilBrown
2017-10-19 3:45 ` Shaohua Li
2017-10-19 6:29 ` NeilBrown
2017-10-20 4:37 ` Shaohua Li
2017-10-23 0:02 ` NeilBrown
2017-10-23 1:48 ` Shaohua Li
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=87bml4j43c.fsf@notabene.neil.brown.name \
--to=neilb@suse.com \
--cc=linux-raid@vger.kernel.org \
--cc=shli@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).