linux-raid.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] md: use a separate bio_set for synchronous IO.
@ 2017-06-20 23:12 NeilBrown
  2017-06-21 17:49 ` Shaohua Li
  0 siblings, 1 reply; 2+ messages in thread
From: NeilBrown @ 2017-06-20 23:12 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid

[-- Attachment #1: Type: text/plain, Size: 4398 bytes --]


md devices allocate a bio_set and use it for two
distinct purposes.
mddev->bio_set is used to clone bios as part of sending
upper level requests down to lower level devices,
and it is also use for synchronous IO such as superblock
and bitmap updates, and for correcting read errors.

This multiple usage can lead to deadlocks.  It is likely
that cloned bios might be queued for write and to be
waiting for a metadata update before the write can be permitted.
If the cloning exhausted mddev->bio_set, the metadata update
may not be able to proceed.

This scenario has been seen during heavy testing, with lots of IO and
lots of memory pressure.

Address this by adding a new bio_set specifically for synchronous IO.
All synchronous IO goes directly to the underlying device and is not
queued at the md level, so request using entries from the new
mddev->sync_set will complete in a timely fashion.
Requests that use mddev->bio_set will sometimes need to wait
for synchronous IO, but will no longer risk deadlocking that iO.

Also: small simplification in mddev_put(): there is no need to
wait until the spinlock is released before calling bioset_free().

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md.c | 27 ++++++++++++++++++++-------
 drivers/md/md.h |  3 +++
 2 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 87edc342ccb3..9d0eb50c458e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -203,6 +203,14 @@ struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
 }
 EXPORT_SYMBOL_GPL(bio_alloc_mddev);
 
+static struct bio *md_bio_alloc_sync(struct mddev *mddev)
+{
+	if (!mddev->sync_set)
+		return bio_alloc(GFP_NOIO, 1);
+
+	return bio_alloc_bioset(GFP_NOIO, 1, mddev->sync_set);
+}
+
 /*
  * We have a system wide 'event count' that is incremented
  * on any 'interesting' event, and readers of /proc/mdstat
@@ -462,8 +470,6 @@ static void mddev_delayed_delete(struct work_struct *ws);
 
 static void mddev_put(struct mddev *mddev)
 {
-	struct bio_set *bs = NULL;
-
 	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
 		return;
 	if (!mddev->raid_disks && list_empty(&mddev->disks) &&
@@ -471,8 +477,12 @@ static void mddev_put(struct mddev *mddev)
 		/* Array is not configured at all, and not held active,
 		 * so destroy it */
 		list_del_init(&mddev->all_mddevs);
-		bs = mddev->bio_set;
+		if (mddev->bio_set)
+			bioset_free(mddev->bio_set);
+		if (mddev->sync_set)
+			bioset_free(mddev->sync_set);
 		mddev->bio_set = NULL;
+		mddev->sync_set = NULL;
 		if (mddev->gendisk) {
 			/* We did a probe so need to clean up.  Call
 			 * queue_work inside the spinlock so that
@@ -485,8 +495,6 @@ static void mddev_put(struct mddev *mddev)
 			kfree(mddev);
 	}
 	spin_unlock(&all_mddevs_lock);
-	if (bs)
-		bioset_free(bs);
 }
 
 static void md_safemode_timeout(unsigned long data);
@@ -751,7 +759,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
 	if (test_bit(Faulty, &rdev->flags))
 		return;
 
-	bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
+	bio = md_bio_alloc_sync(mddev);
 
 	atomic_inc(&rdev->nr_pending);
 
@@ -783,7 +791,7 @@ int md_super_wait(struct mddev *mddev)
 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
 		 struct page *page, int op, int op_flags, bool metadata_op)
 {
-	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
+	struct bio *bio = md_bio_alloc_sync(rdev->mddev);
 	int ret;
 
 	bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
@@ -5432,6 +5440,11 @@ int md_run(struct mddev *mddev)
 		if (!mddev->bio_set)
 			return -ENOMEM;
 	}
+	if (mddev->sync_set == NULL) {
+		mddev->sync_set = bioset_create(BIO_POOL_SIZE, 0);
+		if (!mddev->sync_set)
+			return -ENOMEM;
+	}
 
 	spin_lock(&pers_lock);
 	pers = find_pers(mddev->level, mddev->clevel);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 0fa1de42c42b..a1c7aa89bb67 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -444,6 +444,9 @@ struct mddev {
 	struct attribute_group		*to_remove;
 
 	struct bio_set			*bio_set;
+	struct bio_set			*sync_set; /* for sync operations like
+						   * metadata and bitmap writes
+						   */
 
 	/* Generic flush handling.
 	 * The last to finish preflush schedules a worker to submit
-- 
2.12.2


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply related	[flat|nested] 2+ messages in thread

* Re: [PATCH] md: use a separate bio_set for synchronous IO.
  2017-06-20 23:12 [PATCH] md: use a separate bio_set for synchronous IO NeilBrown
@ 2017-06-21 17:49 ` Shaohua Li
  0 siblings, 0 replies; 2+ messages in thread
From: Shaohua Li @ 2017-06-21 17:49 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid

On Wed, Jun 21, 2017 at 09:12:21AM +1000, Neil Brown wrote:
> 
> md devices allocate a bio_set and use it for two
> distinct purposes.
> mddev->bio_set is used to clone bios as part of sending
> upper level requests down to lower level devices,
> and it is also use for synchronous IO such as superblock
> and bitmap updates, and for correcting read errors.
> 
> This multiple usage can lead to deadlocks.  It is likely
> that cloned bios might be queued for write and to be
> waiting for a metadata update before the write can be permitted.
> If the cloning exhausted mddev->bio_set, the metadata update
> may not be able to proceed.
> 
> This scenario has been seen during heavy testing, with lots of IO and
> lots of memory pressure.
> 
> Address this by adding a new bio_set specifically for synchronous IO.
> All synchronous IO goes directly to the underlying device and is not
> queued at the md level, so request using entries from the new
> mddev->sync_set will complete in a timely fashion.
> Requests that use mddev->bio_set will sometimes need to wait
> for synchronous IO, but will no longer risk deadlocking that iO.
> 
> Also: small simplification in mddev_put(): there is no need to
> wait until the spinlock is released before calling bioset_free().

applied, thanks
 
> Signed-off-by: NeilBrown <neilb@suse.com>
> ---
>  drivers/md/md.c | 27 ++++++++++++++++++++-------
>  drivers/md/md.h |  3 +++
>  2 files changed, 23 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 87edc342ccb3..9d0eb50c458e 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -203,6 +203,14 @@ struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
>  }
>  EXPORT_SYMBOL_GPL(bio_alloc_mddev);
>  
> +static struct bio *md_bio_alloc_sync(struct mddev *mddev)
> +{
> +	if (!mddev->sync_set)
> +		return bio_alloc(GFP_NOIO, 1);
> +
> +	return bio_alloc_bioset(GFP_NOIO, 1, mddev->sync_set);
> +}
> +
>  /*
>   * We have a system wide 'event count' that is incremented
>   * on any 'interesting' event, and readers of /proc/mdstat
> @@ -462,8 +470,6 @@ static void mddev_delayed_delete(struct work_struct *ws);
>  
>  static void mddev_put(struct mddev *mddev)
>  {
> -	struct bio_set *bs = NULL;
> -
>  	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
>  		return;
>  	if (!mddev->raid_disks && list_empty(&mddev->disks) &&
> @@ -471,8 +477,12 @@ static void mddev_put(struct mddev *mddev)
>  		/* Array is not configured at all, and not held active,
>  		 * so destroy it */
>  		list_del_init(&mddev->all_mddevs);
> -		bs = mddev->bio_set;
> +		if (mddev->bio_set)
> +			bioset_free(mddev->bio_set);
> +		if (mddev->sync_set)
> +			bioset_free(mddev->sync_set);
>  		mddev->bio_set = NULL;
> +		mddev->sync_set = NULL;
>  		if (mddev->gendisk) {
>  			/* We did a probe so need to clean up.  Call
>  			 * queue_work inside the spinlock so that
> @@ -485,8 +495,6 @@ static void mddev_put(struct mddev *mddev)
>  			kfree(mddev);
>  	}
>  	spin_unlock(&all_mddevs_lock);
> -	if (bs)
> -		bioset_free(bs);
>  }
>  
>  static void md_safemode_timeout(unsigned long data);
> @@ -751,7 +759,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
>  	if (test_bit(Faulty, &rdev->flags))
>  		return;
>  
> -	bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
> +	bio = md_bio_alloc_sync(mddev);
>  
>  	atomic_inc(&rdev->nr_pending);
>  
> @@ -783,7 +791,7 @@ int md_super_wait(struct mddev *mddev)
>  int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
>  		 struct page *page, int op, int op_flags, bool metadata_op)
>  {
> -	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
> +	struct bio *bio = md_bio_alloc_sync(rdev->mddev);
>  	int ret;
>  
>  	bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
> @@ -5432,6 +5440,11 @@ int md_run(struct mddev *mddev)
>  		if (!mddev->bio_set)
>  			return -ENOMEM;
>  	}
> +	if (mddev->sync_set == NULL) {
> +		mddev->sync_set = bioset_create(BIO_POOL_SIZE, 0);
> +		if (!mddev->sync_set)
> +			return -ENOMEM;
> +	}
>  
>  	spin_lock(&pers_lock);
>  	pers = find_pers(mddev->level, mddev->clevel);
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index 0fa1de42c42b..a1c7aa89bb67 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -444,6 +444,9 @@ struct mddev {
>  	struct attribute_group		*to_remove;
>  
>  	struct bio_set			*bio_set;
> +	struct bio_set			*sync_set; /* for sync operations like
> +						   * metadata and bitmap writes
> +						   */
>  
>  	/* Generic flush handling.
>  	 * The last to finish preflush schedules a worker to submit
> -- 
> 2.12.2
> 



^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2017-06-21 17:49 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2017-06-20 23:12 [PATCH] md: use a separate bio_set for synchronous IO NeilBrown
2017-06-21 17:49 ` Shaohua Li

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).