All of lore.kernel.org
 help / color / mirror / Atom feed
From: NeilBrown <neilb@suse.com>
To: Shaohua Li <shli@kernel.org>
Cc: linux-raid@vger.kernel.org, hch@lst.de
Subject: Re: [md PATCH 02/15] md/raid5: simplfy delaying of writes while metadata is updated.
Date: Thu, 16 Mar 2017 13:45:16 +1100	[thread overview]
Message-ID: <8760jadjnn.fsf@notabene.neil.brown.name> (raw)
In-Reply-To: <20170315230356.3zizpl44atdikrt7@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 5099 bytes --]

On Wed, Mar 15 2017, Shaohua Li wrote:

> On Wed, Mar 15, 2017 at 02:05:12PM +1100, Neil Brown wrote:
>> If a device fails during a write, we must ensure the failure is
>> recorded in the metadata before the completion of the write is
>> acknowledged.
>> 
>> Commit c3cce6cda162 ("md/raid5: ensure device failure recorded before
>> write request returns.")  added code for this, but it was
>> unnecessarily complicated.  We already had similar functionality for
>> handling updates to the bad-block-list, thanks to Commit de393cdea66c
>> ("md: make it easier to wait for bad blocks to be acknowledged.")
>> 
>> So revert most of the former commit, and instead avoid collecting
>> completed writes if MD_CHANGE_PENDING is set.  raid5d() will then flush
>> the metadata and retry the stripe_head.
>> As this change can leave a stripe_head ready for handling immediately
>> after handle_active_stripes() returns, we change raid5_do_work() to
>> pause when MD_CHANGE_PENDING is set, so that it doesn't spin.
>> 
>> We check MD_CHANGE_PENDING *after* analyse_stripe() as it could be set
>> asynchronously.  After analyse_stripe(), we have collected stable data
>> about the state of devices, which will be used to make decisions.
>> 
>> Signed-off-by: NeilBrown <neilb@suse.com>
>> ---
>>  drivers/md/raid5.c |   31 ++++++++-----------------------
>>  drivers/md/raid5.h |    3 ---
>>  2 files changed, 8 insertions(+), 26 deletions(-)
>> 
>> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>> index cc2d039b4aae..f990f74901d2 100644
>> --- a/drivers/md/raid5.c
>> +++ b/drivers/md/raid5.c
>> @@ -4690,7 +4690,8 @@ static void handle_stripe(struct stripe_head *sh)
>>  	if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
>>  		goto finish;
>>  
>> -	if (s.handle_bad_blocks) {
>> +	if (s.handle_bad_blocks ||
>> +	    test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
>>  		set_bit(STRIPE_HANDLE, &sh->state);
>>  		goto finish;
>>  	}
>> @@ -5020,15 +5021,8 @@ static void handle_stripe(struct stripe_head *sh)
>>  			md_wakeup_thread(conf->mddev->thread);
>>  	}
>>  
>> -	if (!bio_list_empty(&s.return_bi)) {
>> -		if (test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
>> -			spin_lock_irq(&conf->device_lock);
>> -			bio_list_merge(&conf->return_bi, &s.return_bi);
>> -			spin_unlock_irq(&conf->device_lock);
>> -			md_wakeup_thread(conf->mddev->thread);
>> -		} else
>> -			return_io(&s.return_bi);
>> -	}
>> +	if (!bio_list_empty(&s.return_bi))
>> +		return_io(&s.return_bi);
>>  
>>  	clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
>>  }
>> @@ -6225,6 +6219,7 @@ static void raid5_do_work(struct work_struct *work)
>>  	struct r5worker *worker = container_of(work, struct r5worker, work);
>>  	struct r5worker_group *group = worker->group;
>>  	struct r5conf *conf = group->conf;
>> +	struct mddev *mddev = conf->mddev;
>>  	int group_id = group - conf->worker_groups;
>>  	int handled;
>>  	struct blk_plug plug;
>> @@ -6245,6 +6240,9 @@ static void raid5_do_work(struct work_struct *work)
>>  		if (!batch_size && !released)
>>  			break;
>>  		handled += batch_size;
>> +		wait_event_lock_irq(mddev->sb_wait,
>> +				    !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags),
> MD_SB_CHANGE_PENDING?


Yes.  Thanks for catching.

NeilBrown

>
>> +				    conf->device_lock);
>>  	}
>>  	pr_debug("%d stripes handled\n", handled);
>>  
>> @@ -6272,18 +6270,6 @@ static void raid5d(struct md_thread *thread)
>>  
>>  	md_check_recovery(mddev);
>>  
>> -	if (!bio_list_empty(&conf->return_bi) &&
>> -	    !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
>> -		struct bio_list tmp = BIO_EMPTY_LIST;
>> -		spin_lock_irq(&conf->device_lock);
>> -		if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
>> -			bio_list_merge(&tmp, &conf->return_bi);
>> -			bio_list_init(&conf->return_bi);
>> -		}
>> -		spin_unlock_irq(&conf->device_lock);
>> -		return_io(&tmp);
>> -	}
>> -
>>  	blk_start_plug(&plug);
>>  	handled = 0;
>>  	spin_lock_irq(&conf->device_lock);
>> @@ -6935,7 +6921,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
>>  	INIT_LIST_HEAD(&conf->hold_list);
>>  	INIT_LIST_HEAD(&conf->delayed_list);
>>  	INIT_LIST_HEAD(&conf->bitmap_list);
>> -	bio_list_init(&conf->return_bi);
>>  	init_llist_head(&conf->released_stripes);
>>  	atomic_set(&conf->active_stripes, 0);
>>  	atomic_set(&conf->preread_active_stripes, 0);
>> diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
>> index ba5b7a3790af..13800dc9dd88 100644
>> --- a/drivers/md/raid5.h
>> +++ b/drivers/md/raid5.h
>> @@ -638,9 +638,6 @@ struct r5conf {
>>  	int			skip_copy; /* Don't copy data from bio to stripe cache */
>>  	struct list_head	*last_hold; /* detect hold_list promotions */
>>  
>> -	/* bios to have bi_end_io called after metadata is synced */
>> -	struct bio_list		return_bi;
>> -
>>  	atomic_t		reshape_stripes; /* stripes with pending writes for reshape */
>>  	/* unfortunately we need two cache names as we temporarily have
>>  	 * two caches.
>> 
>> 

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

  reply	other threads:[~2017-03-16  2:45 UTC|newest]

Thread overview: 31+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-03-15  3:05 [md PATCH 00/15 v2] remove all abuse of bi_phys_segments NeilBrown
2017-03-15  3:05 ` [md PATCH 03/15] md/raid5: call bio_endio() directly rather than queueing for later NeilBrown
2017-03-15  3:05 ` [md PATCH 01/15] md/raid5: use md_write_start to count stripes, not bios NeilBrown
2017-03-15  3:05 ` [md PATCH 04/15] block: trace completion of all bios NeilBrown
2017-03-15  3:05 ` [md PATCH 02/15] md/raid5: simplfy delaying of writes while metadata is updated NeilBrown
2017-03-15 23:03   ` Shaohua Li
2017-03-16  2:45     ` NeilBrown [this message]
2017-03-22  1:40   ` Fix bug in " NeilBrown
2017-03-22  2:29     ` REALLY " NeilBrown
2017-03-22  2:35       ` NeilBrown
2017-03-23  2:22         ` Shaohua Li
2017-03-15  3:05 ` [md PATCH 06/15] md/raid5: remove over-loading of ->bi_phys_segments NeilBrown
2017-03-15  3:05 ` [md PATCH 09/15] md/raid10: stop using bi_phys_segments NeilBrown
2017-03-15  3:05 ` [md PATCH 05/15] md/raid5: use bio_inc_remaining() instead of repurposing bi_phys_segments as a counter NeilBrown
2017-03-15  3:05 ` [md PATCH 08/15] md/raid1, raid10: move rXbio accounting closer to allocation NeilBrown
2017-03-15  3:05 ` [md PATCH 07/15] Revert "md/raid5: limit request size according to implementation limits" NeilBrown
2017-03-15  3:05 ` [md PATCH 14/15] percpu-refcount: support synchronous switch to atomic mode NeilBrown
2017-03-15  3:05 ` [md PATCH 13/15] md: close a race with setting mddev->in_sync NeilBrown
2017-03-15  3:05 ` [md PATCH 15/15] MD: use per-cpu counter for writes_pending NeilBrown
2017-03-16  1:05   ` Shaohua Li
2017-03-16  2:57     ` NeilBrown
2017-03-22  1:55   ` Improvement for " NeilBrown
2017-03-22  2:34     ` IMPROVEMENT for " NeilBrown
2017-03-15  3:05 ` [md PATCH 11/15] md/raid5: don't test ->writes_pending in raid5_remove_disk NeilBrown
2017-03-15  3:05 ` [md PATCH 10/15] md/raid1: stop using bi_phys_segment NeilBrown
2017-03-16  0:13   ` Shaohua Li
2017-03-16  2:49     ` NeilBrown
2017-03-16  3:36       ` Shaohua Li
2017-03-22  1:41   ` Fix bugs in " NeilBrown
2017-03-15  3:05 ` [md PATCH 12/15] md: factor out set_in_sync() NeilBrown
2017-03-16  1:12 ` [md PATCH 00/15 v2] remove all abuse of bi_phys_segments Shaohua Li

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=8760jadjnn.fsf@notabene.neil.brown.name \
    --to=neilb@suse.com \
    --cc=hch@lst.de \
    --cc=linux-raid@vger.kernel.org \
    --cc=shli@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.