linux-raid.vger.kernel.org archive mirror
* [RFC] using mempools for raid5-cache
@ 2015-12-02 16:10 Christoph Hellwig
  2015-12-02 16:10 ` [PATCH 1/2] raid5-cache: use a bio_set Christoph Hellwig
                   ` (3 more replies)
  0 siblings, 4 replies; 16+ messages in thread
From: Christoph Hellwig @ 2015-12-02 16:10 UTC (permalink / raw)
  To: shli, neilb; +Cc: linux-raid

Currently the raid5-cache code is heavily relying on GFP_NOFAIL allocations.

I've looked into replacing these with mempools and biosets, and for the
bio and the meta_page that's pretty trivial as they have short life times
and do make guaranteed progress.  I'm massively struggling with the iounit
allocation, though.  These can live on for a long time over log I/O, cache
flushing and last but not least RAID I/O, and every attempt at something
mempool-like results in reproducible deadlocks.  I wonder if we need to
figure out some more efficient data structure to communicate the completion
status that doesn't rely on these fairly long living allocations from
the I/O path.

FYI, my last attempt to use the bio frontpad is below, but a mempool showed
pretty similar results:

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 2e3f22a..d2438be 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -33,12 +33,12 @@
  */
 #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
 #define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
-
+	
 /*
  * We only need 2 bios per I/O unit to make progress, but ensure we
  * have a few more available to not get too tight.
  */
-#define R5L_POOL_SIZE	1024
+#define R5L_POOL_SIZE	16384
 
 struct r5l_log {
 	struct md_rdev *rdev;
@@ -75,7 +75,6 @@ struct r5l_log {
 	struct list_head finished_ios;	/* io_units which settle down in log disk */
 	struct bio flush_bio;
 
-	struct kmem_cache *io_kc;
 	struct bio_set *bs;
 	mempool_t *meta_pool;
 
@@ -120,6 +119,8 @@ struct r5l_io_unit {
 
 	int state;
 	bool need_split_bio;
+
+	struct bio bio;
 };
 
 /* r5l_io_unit state */
@@ -209,14 +210,13 @@ static void r5l_move_to_end_ios(struct r5l_log *log)
 
 static void r5l_log_endio(struct bio *bio)
 {
-	struct r5l_io_unit *io = bio->bi_private;
+	struct r5l_io_unit *io = container_of(bio, struct r5l_io_unit, bio);
 	struct r5l_log *log = io->log;
 	unsigned long flags;
 
 	if (bio->bi_error)
 		md_error(log->rdev->mddev, log->rdev);
 
-	bio_put(bio);
 	mempool_free(io->meta_page, log->meta_pool);
 
 	spin_lock_irqsave(&log->io_list_lock, flags);
@@ -284,11 +284,13 @@ static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
 
 static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
 {
+	struct bio *bio;
 	struct r5l_io_unit *io;
 	struct r5l_meta_block *block;
 
-	/* We can't handle memory allocate failure so far */
-	io = kmem_cache_zalloc(log->io_kc, GFP_NOIO | __GFP_NOFAIL);
+	bio = r5l_bio_alloc(log);
+
+	io = container_of(bio, struct r5l_io_unit, bio);
 	io->log = log;
 	INIT_LIST_HEAD(&io->log_sibling);
 	INIT_LIST_HEAD(&io->stripe_list);
@@ -306,7 +308,7 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
 	io->meta_offset = sizeof(struct r5l_meta_block);
 	io->seq = log->seq++;
 
-	io->current_bio = r5l_bio_alloc(log);
+	io->current_bio = bio;
 	io->current_bio->bi_end_io = r5l_log_endio;
 	io->current_bio->bi_private = io;
 	bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);
@@ -556,7 +558,7 @@ static bool r5l_complete_finished_ios(struct r5l_log *log)
 		log->next_cp_seq = io->seq;
 
 		list_del(&io->log_sibling);
-		kmem_cache_free(log->io_kc, io);
+		bio_put(&io->bio);
 
 		found = true;
 	}
@@ -1158,11 +1160,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 	INIT_LIST_HEAD(&log->finished_ios);
 	bio_init(&log->flush_bio);
 
-	log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
-	if (!log->io_kc)
-		goto io_kc;
-
-	log->bs = bioset_create(R5L_POOL_SIZE, 0);
+	log->bs = bioset_create(R5L_POOL_SIZE,
+			offsetof(struct r5l_io_unit, bio));
 	if (!log->bs)
 		goto io_bs;
 
@@ -1192,8 +1191,6 @@ reclaim_thread:
 out_mempool:
 	bioset_free(log->bs);
 io_bs:
-	kmem_cache_destroy(log->io_kc);
-io_kc:
 	kfree(log);
 	return -EINVAL;
 }
@@ -1203,6 +1200,5 @@ void r5l_exit_log(struct r5l_log *log)
 	md_unregister_thread(&log->reclaim_thread);
 	mempool_destroy(log->meta_pool);
 	bioset_free(log->bs);
-	kmem_cache_destroy(log->io_kc);
 	kfree(log);
 }


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 1/2] raid5-cache: use a bio_set
  2015-12-02 16:10 [RFC] using mempools for raid5-cache Christoph Hellwig
@ 2015-12-02 16:10 ` Christoph Hellwig
  2015-12-03  5:01   ` Shaohua Li
  2015-12-08 23:22   ` NeilBrown
  2015-12-02 16:10 ` [PATCH 2/2] raid5-cache: use a mempool for the metadata block Christoph Hellwig
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 16+ messages in thread
From: Christoph Hellwig @ 2015-12-02 16:10 UTC (permalink / raw)
  To: shli, neilb; +Cc: linux-raid

This allows us to make guaranteed forward progress.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/md/raid5-cache.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 668e973..ef59564 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -34,6 +34,12 @@
 #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
 #define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
 
+/*
+ * We only need 2 bios per I/O unit to make progress, but ensure we
+ * have a few more available to not get too tight.
+ */
+#define R5L_POOL_SIZE	1024
+
 struct r5l_log {
 	struct md_rdev *rdev;
 
@@ -70,6 +76,7 @@ struct r5l_log {
 	struct bio flush_bio;
 
 	struct kmem_cache *io_kc;
+	struct bio_set *bs;
 
 	struct md_thread *reclaim_thread;
 	unsigned long reclaim_target;	/* number of space that need to be
@@ -248,7 +255,7 @@ static void r5l_submit_current_io(struct r5l_log *log)
 
 static struct bio *r5l_bio_alloc(struct r5l_log *log)
 {
-	struct bio *bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
+	struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs);
 
 	bio->bi_rw = WRITE;
 	bio->bi_bdev = log->rdev->bdev;
@@ -1153,6 +1160,10 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 	if (!log->io_kc)
 		goto io_kc;
 
+	log->bs = bioset_create(R5L_POOL_SIZE, 0);
+	if (!log->bs)
+		goto io_bs;
+
 	log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
 						 log->rdev->mddev, "reclaim");
 	if (!log->reclaim_thread)
@@ -1170,6 +1181,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 error:
 	md_unregister_thread(&log->reclaim_thread);
 reclaim_thread:
+	bioset_free(log->bs);
+io_bs:
 	kmem_cache_destroy(log->io_kc);
 io_kc:
 	kfree(log);
@@ -1179,6 +1192,7 @@ io_kc:
 void r5l_exit_log(struct r5l_log *log)
 {
 	md_unregister_thread(&log->reclaim_thread);
+	bioset_free(log->bs);
 	kmem_cache_destroy(log->io_kc);
 	kfree(log);
 }
-- 
1.9.1


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 2/2] raid5-cache: use a mempool for the metadata block
  2015-12-02 16:10 [RFC] using mempools for raid5-cache Christoph Hellwig
  2015-12-02 16:10 ` [PATCH 1/2] raid5-cache: use a bio_set Christoph Hellwig
@ 2015-12-02 16:10 ` Christoph Hellwig
  2015-12-08 23:27   ` NeilBrown
  2015-12-03  4:49 ` [RFC] using mempools for raid5-cache Shaohua Li
  2015-12-09  0:36 ` NeilBrown
  3 siblings, 1 reply; 16+ messages in thread
From: Christoph Hellwig @ 2015-12-02 16:10 UTC (permalink / raw)
  To: shli, neilb; +Cc: linux-raid

We only have a limited number in flight, so use a page-based mempool.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/md/raid5-cache.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index ef59564..2e3f22a 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -77,6 +77,7 @@ struct r5l_log {
 
 	struct kmem_cache *io_kc;
 	struct bio_set *bs;
+	mempool_t *meta_pool;
 
 	struct md_thread *reclaim_thread;
 	unsigned long reclaim_target;	/* number of space that need to be
@@ -216,7 +217,7 @@ static void r5l_log_endio(struct bio *bio)
 		md_error(log->rdev->mddev, log->rdev);
 
 	bio_put(bio);
-	__free_page(io->meta_page);
+	mempool_free(io->meta_page, log->meta_pool);
 
 	spin_lock_irqsave(&log->io_list_lock, flags);
 	__r5l_set_io_unit_state(io, IO_UNIT_IO_END);
@@ -293,8 +294,9 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
 	INIT_LIST_HEAD(&io->stripe_list);
 	io->state = IO_UNIT_RUNNING;
 
-	io->meta_page = alloc_page(GFP_NOIO | __GFP_NOFAIL | __GFP_ZERO);
+	io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
 	block = page_address(io->meta_page);
+	clear_page(block);
 	block->magic = cpu_to_le32(R5LOG_MAGIC);
 	block->version = R5LOG_VERSION;
 	block->seq = cpu_to_le64(log->seq);
@@ -1164,6 +1166,10 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 	if (!log->bs)
 		goto io_bs;
 
+	log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0);
+	if (!log->meta_pool)
+		goto out_mempool;
+
 	log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
 						 log->rdev->mddev, "reclaim");
 	if (!log->reclaim_thread)
@@ -1178,9 +1184,12 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 
 	conf->log = log;
 	return 0;
+
 error:
 	md_unregister_thread(&log->reclaim_thread);
 reclaim_thread:
+	mempool_destroy(log->meta_pool);
+out_mempool:
 	bioset_free(log->bs);
 io_bs:
 	kmem_cache_destroy(log->io_kc);
@@ -1192,6 +1201,7 @@ io_kc:
 void r5l_exit_log(struct r5l_log *log)
 {
 	md_unregister_thread(&log->reclaim_thread);
+	mempool_destroy(log->meta_pool);
 	bioset_free(log->bs);
 	kmem_cache_destroy(log->io_kc);
 	kfree(log);
-- 
1.9.1


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [RFC] using mempools for raid5-cache
  2015-12-02 16:10 [RFC] using mempools for raid5-cache Christoph Hellwig
  2015-12-02 16:10 ` [PATCH 1/2] raid5-cache: use a bio_set Christoph Hellwig
  2015-12-02 16:10 ` [PATCH 2/2] raid5-cache: use a mempool for the metadata block Christoph Hellwig
@ 2015-12-03  4:49 ` Shaohua Li
  2015-12-09  0:36 ` NeilBrown
  3 siblings, 0 replies; 16+ messages in thread
From: Shaohua Li @ 2015-12-03  4:49 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: neilb, linux-raid

On Wed, Dec 02, 2015 at 05:10:36PM +0100, Christoph Hellwig wrote:
> Currently the raid5-cache code is heavily relying on GFP_NOFAIL allocations.
> 
> I've looked into replacing these with mempools and biosets, and for the
> bio and the meta_page that's pretty trivial as they have short life times
> and do make guaranteed progress.  I'm massively struggling with the iounit
> allocation, though.  These can live on for a long time over log I/O, cache
> flushing and last but not least RAID I/O, and every attempt at something
> mempool-like results in reproducible deadlocks.  I wonder if we need to
> figure out some more efficient data structure to communicate the completion
> status that doesn't rely on these fairly long living allocations from
> the I/O path.
> 
> FYI, my last attempt to use the bio frontpad is below, but a mempool showed
> pretty similar results:

Yep, the io_unit and metadata used to be allocated from a mempool, but
it's hard to calculate the pool size, so we fell back to GFP_NOFAIL ...

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/2] raid5-cache: use a bio_set
  2015-12-02 16:10 ` [PATCH 1/2] raid5-cache: use a bio_set Christoph Hellwig
@ 2015-12-03  5:01   ` Shaohua Li
  2015-12-08 23:22   ` NeilBrown
  1 sibling, 0 replies; 16+ messages in thread
From: Shaohua Li @ 2015-12-03  5:01 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: neilb, linux-raid

On Wed, Dec 02, 2015 at 05:10:37PM +0100, Christoph Hellwig wrote:
> This allows us to make guaranteed forward progress.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  drivers/md/raid5-cache.c | 16 +++++++++++++++-
>  1 file changed, 15 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> index 668e973..ef59564 100644
> --- a/drivers/md/raid5-cache.c
> +++ b/drivers/md/raid5-cache.c
> @@ -34,6 +34,12 @@
>  #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
>  #define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
>  
> +/*
> + * We only need 2 bios per I/O unit to make progress, but ensure we
> + * have a few more available to not get too tight.
> + */
> +#define R5L_POOL_SIZE	1024

Looks reasonable; the recent arbitrary-size bio support makes things
simpler. I don't think we need to be that conservative, though. One
metadata page can hold at most
(PAGE_SIZE - sizeof(struct r5l_meta_block)) / sizeof(struct r5l_payload_data_parity)
pages.
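
As a rough illustration of that bound (the structure sizes below are
assumptions made only for the arithmetic; the real values are whatever
sizeof() yields for the kernel structs):

    #include <stdio.h>

    /* Assumed sizes, for illustration only. */
    #define PAGE_SIZE_BYTES     4096u
    #define META_BLOCK_SIZE       32u  /* stand-in for sizeof(struct r5l_meta_block) */
    #define PAYLOAD_ENTRY_SIZE    16u  /* stand-in for sizeof(struct r5l_payload_data_parity) */

    int main(void)
    {
        unsigned int max_entries =
            (PAGE_SIZE_BYTES - META_BLOCK_SIZE) / PAYLOAD_ENTRY_SIZE;

        /* With these assumed sizes this prints 254, i.e. a few hundred
         * payload descriptors per metadata page. */
        printf("max payload entries per meta page: %u\n", max_entries);
        return 0;
    }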

Thanks,
Shaohua

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/2] raid5-cache: use a bio_set
  2015-12-02 16:10 ` [PATCH 1/2] raid5-cache: use a bio_set Christoph Hellwig
  2015-12-03  5:01   ` Shaohua Li
@ 2015-12-08 23:22   ` NeilBrown
  2015-12-14 21:12     ` Christoph Hellwig
  1 sibling, 1 reply; 16+ messages in thread
From: NeilBrown @ 2015-12-08 23:22 UTC (permalink / raw)
  To: Christoph Hellwig, shli; +Cc: linux-raid

On Thu, Dec 03 2015, Christoph Hellwig wrote:

> This allows us to make guaranteed forward progress.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  drivers/md/raid5-cache.c | 16 +++++++++++++++-
>  1 file changed, 15 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> index 668e973..ef59564 100644
> --- a/drivers/md/raid5-cache.c
> +++ b/drivers/md/raid5-cache.c
> @@ -34,6 +34,12 @@
>  #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
>  #define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
>  
> +/*
> + * We only need 2 bios per I/O unit to make progress, but ensure we
> + * have a few more available to not get too tight.
> + */
> +#define R5L_POOL_SIZE	1024
> +

I'm really suspicious of big pool sizes.
The memory allocated to the pool is almost never used - only where no
other memory is available - so large pools are largely wasted.

As you say, we need 2 bios per unit, and unit submission is serialized
(by ->io_mutex) so '2' really should be enough.  For the very brief
periods when there is no other memory, there will only be one or two
units in flight at once, but as each one gets us closer to freeing real
memory, that shouldn't last long.

I can easily justify '4' as "double buffering" is a well understood
technique, but 1024 just seems like gratuitous waste.

If you have performance numbers that tell me I'm wrong I'll stand
corrected, but without evidence I much prefer a smaller number.

Otherwise I really like the change.

Thanks,
NeilBrown


>  struct r5l_log {
>  	struct md_rdev *rdev;
>  
> @@ -70,6 +76,7 @@ struct r5l_log {
>  	struct bio flush_bio;
>  
>  	struct kmem_cache *io_kc;
> +	struct bio_set *bs;
>  
>  	struct md_thread *reclaim_thread;
>  	unsigned long reclaim_target;	/* number of space that need to be
> @@ -248,7 +255,7 @@ static void r5l_submit_current_io(struct r5l_log *log)
>  
>  static struct bio *r5l_bio_alloc(struct r5l_log *log)
>  {
> -	struct bio *bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
> +	struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs);
>  
>  	bio->bi_rw = WRITE;
>  	bio->bi_bdev = log->rdev->bdev;
> @@ -1153,6 +1160,10 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
>  	if (!log->io_kc)
>  		goto io_kc;
>  
> +	log->bs = bioset_create(R5L_POOL_SIZE, 0);
> +	if (!log->bs)
> +		goto io_bs;
> +
>  	log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
>  						 log->rdev->mddev, "reclaim");
>  	if (!log->reclaim_thread)
> @@ -1170,6 +1181,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
>  error:
>  	md_unregister_thread(&log->reclaim_thread);
>  reclaim_thread:
> +	bioset_free(log->bs);
> +io_bs:
>  	kmem_cache_destroy(log->io_kc);
>  io_kc:
>  	kfree(log);
> @@ -1179,6 +1192,7 @@ io_kc:
>  void r5l_exit_log(struct r5l_log *log)
>  {
>  	md_unregister_thread(&log->reclaim_thread);
> +	bioset_free(log->bs);
>  	kmem_cache_destroy(log->io_kc);
>  	kfree(log);
>  }
> -- 
> 1.9.1

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/2] raid5-cache: use a mempool for the metadata block
  2015-12-02 16:10 ` [PATCH 2/2] raid5-cache: use a mempool for the metadata block Christoph Hellwig
@ 2015-12-08 23:27   ` NeilBrown
  0 siblings, 0 replies; 16+ messages in thread
From: NeilBrown @ 2015-12-08 23:27 UTC (permalink / raw)
  To: Christoph Hellwig, shli; +Cc: linux-raid

On Thu, Dec 03 2015, Christoph Hellwig wrote:

> +
>  error:
>  	md_unregister_thread(&log->reclaim_thread);
>  reclaim_thread:
> +	mempool_destroy(log->meta_pool);
> +out_mempool:
>  	bioset_free(log->bs);
>  io_bs:

These cascading goto labels always bother me.
As mempool_destroy accepts NULL, we don't need "out_mempool"; we can
just "goto reclaim_thread", or even "goto error", as md_unregister_thread
copes with NULL too.
If we use:
    if (log->bs)
    	bioset_free(log->bs);

or modify bioset_free to accept NULL, we could just have a single 'error' label...
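
For illustration, a sketch of how the whole unwind could collapse into
that single label, assuming kmem_cache_destroy also tolerates NULL and
that log is zero-allocated so unset members are NULL:

    error:
    	md_unregister_thread(&log->reclaim_thread);	/* copes with a NULL thread */
    	mempool_destroy(log->meta_pool);		/* accepts NULL */
    	if (log->bs)					/* bioset_free does not */
    		bioset_free(log->bs);
    	kmem_cache_destroy(log->io_kc);
    	kfree(log);
    	return -EINVAL;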

But that is just cosmetic.  I like the patch once we agree on a value for
R5L_POOL_SIZE.

Thanks,
NeilBrown


>  	kmem_cache_destroy(log->io_kc);
> @@ -1192,6 +1201,7 @@ io_kc:
>  void r5l_exit_log(struct r5l_log *log)
>  {
>  	md_unregister_thread(&log->reclaim_thread);
> +	mempool_destroy(log->meta_pool);
>  	bioset_free(log->bs);
>  	kmem_cache_destroy(log->io_kc);
>  	kfree(log);
> -- 
> 1.9.1

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC] using mempools for raid5-cache
  2015-12-02 16:10 [RFC] using mempools for raid5-cache Christoph Hellwig
                   ` (2 preceding siblings ...)
  2015-12-03  4:49 ` [RFC] using mempools for raid5-cache Shaohua Li
@ 2015-12-09  0:36 ` NeilBrown
  2015-12-09  1:28   ` Shaohua Li
  3 siblings, 1 reply; 16+ messages in thread
From: NeilBrown @ 2015-12-09  0:36 UTC (permalink / raw)
  To: Christoph Hellwig, shli; +Cc: linux-raid

On Thu, Dec 03 2015, Christoph Hellwig wrote:

> Currently the raid5-cache code is heavily relying on GFP_NOFAIL allocations.
>
> I've looked into replacing these with mempools and biosets, and for the
> bio and the meta_page that's pretty trivial as they have short life times
> and do make guaranteed progress.  I'm massively struggling with the iounit
> allocation, though.  These can live on for a long time over log I/O, cache
> flushing and last but not least RAID I/O, and every attempt at something
> mempool-like results in reproducible deadlocks.  I wonder if we need to
> figure out some more efficient data structure to communicate the completion
> status that doesn't rely on these fairly long living allocations from
> the I/O path.

Presumably the root cause of these deadlocks is that the raid5d thread
has called
   handle_stripe -> ops_run_io ->r5l_write_stripe -> r5l_log_stripe
      -> r5l_get_meta -> r5l_new_meta

and r5l_new_meta is blocked on memory allocation, which won't complete
until some raid5 stripes get written out, which requires raid5d to do
something more useful than sitting and waiting.

I suspect a good direction towards a solution would be to allow the
memory allocation to fail, to cleanly propagate that failure indication
up through r5l_log_stripe to r5l_write_stripe which falls back to adding
the stripe_head to ->no_space_stripes.

Then we only release stripes from no_space_stripes when a memory
allocation might succeed.

There are lots of missing details, and possibly we would need a separate
list rather than re-using no_space_stripes.
But the key idea is that raid5d should never block (except beneath
submit_bio on some other device) and when it cannot make progress
without blocking, it should queue the stripe_head for later handling.
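
As a rough user-space sketch of that shape (every name and type here is
invented for the illustration, none of it is the driver code): the daemon
never blocks on an allocation; on failure it parks the item on a retry
list and revisits that list when an allocation is likely to succeed,
e.g. after earlier I/O has completed:

    #include <stdio.h>
    #include <stdlib.h>

    struct stripe {
        struct stripe *next;
        int id;
    };

    static struct stripe *retry_list;   /* stand-in for ->no_space_stripes */

    /*
     * Try to start log I/O for one stripe.  Returns 0 on success, -1 if
     * the metadata allocation failed and the stripe was parked instead
     * of blocking the caller.
     */
    static int log_stripe(struct stripe *sh)
    {
        void *meta = malloc(4096);      /* stand-in for r5l_new_meta() */

        if (!meta) {
            sh->next = retry_list;      /* park it, don't block */
            retry_list = sh;
            return -1;
        }
        printf("stripe %d: log I/O started\n", sh->id);
        free(meta);                     /* really freed from the end_io path */
        return 0;
    }

    /*
     * Called when memory may have become available, e.g. after earlier
     * log I/O completed: retry everything that was parked.
     */
    static void retry_parked_stripes(void)
    {
        struct stripe *sh = retry_list;

        retry_list = NULL;
        while (sh) {
            struct stripe *next = sh->next;

            log_stripe(sh);
            sh = next;
        }
    }

    int main(void)
    {
        struct stripe a = { .id = 1 }, b = { .id = 2 };

        log_stripe(&a);
        log_stripe(&b);
        retry_parked_stripes();         /* normally driven by I/O completion */
        return 0;
    }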

Does that make sense?

Thanks,
NeilBrown

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC] using mempools for raid5-cache
  2015-12-09  0:36 ` NeilBrown
@ 2015-12-09  1:28   ` Shaohua Li
  2015-12-09  6:34     ` NeilBrown
  2015-12-09 13:51     ` Wols Lists
  0 siblings, 2 replies; 16+ messages in thread
From: Shaohua Li @ 2015-12-09  1:28 UTC (permalink / raw)
  To: NeilBrown; +Cc: Christoph Hellwig, linux-raid

On Wed, Dec 09, 2015 at 11:36:30AM +1100, NeilBrown wrote:
> On Thu, Dec 03 2015, Christoph Hellwig wrote:
> 
> > Currently the raid5-cache code is heavily relying on GFP_NOFAIL allocations.
> >
> > I've looked into replacing these with mempools and biosets, and for the
> > bio and the meta_page that's pretty trivial as they have short life times
> > and do make guaranteed progress.  I'm massively struggling with the iounit
> > allocation, though.  These can live on for a long time over log I/O, cache
> > flushing and last but not least RAID I/O, and every attempt at something
> > mempool-like results in reproducible deadlocks.  I wonder if we need to
> > figure out some more efficient data structure to communicate the completion
> > status that doesn't rely on these fairly long living allocations from
> > the I/O path.
> 
> Presumably the root cause of these deadlocks is that the raid5d thread
> has called
>    handle_stripe -> ops_run_io ->r5l_write_stripe -> r5l_log_stripe
>       -> r5l_get_meta -> r5l_new_meta
> 
> and r5l_new_meta is blocked on memory allocation, which won't complete
> until some raid5 stripes get written out, which requires raid5d to do
> something more useful than sitting and waiting.
> 
> I suspect a good direction towards a solution would be to allow the
> memory allocation to fail, to cleanly propagate that failure indication
> up through r5l_log_stripe to r5l_write_stripe which falls back to adding
> the stripe_head to ->no_space_stripes.
> 
> Then we only release stripes from no_space_stripes when a memory
> allocation might succeed.
> 
> There are lots of missing details, and possibly we would need a separate
> list rather than re-using no_space_stripes.
> But the key idea is that raid5d should never block (except beneath
> submit_bio on some other device) and when it cannot make progress
> without blocking, it should queue the stripe_head for later handling.
> 
> Does that make sense?

It does remove the scary __GFP_NOFAIL, but the approach is essentially
identical to a 'retry after allocation failure'. Why not just let the mm
(with __GFP_NOFAIL) do the retry then?

Thanks,
Shaohua

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC] using mempools for raid5-cache
  2015-12-09  1:28   ` Shaohua Li
@ 2015-12-09  6:34     ` NeilBrown
  2015-12-10 23:40       ` Shaohua Li
  2015-12-09 13:51     ` Wols Lists
  1 sibling, 1 reply; 16+ messages in thread
From: NeilBrown @ 2015-12-09  6:34 UTC (permalink / raw)
  To: Shaohua Li; +Cc: Christoph Hellwig, linux-raid

On Wed, Dec 09 2015, Shaohua Li wrote:

> On Wed, Dec 09, 2015 at 11:36:30AM +1100, NeilBrown wrote:
>> On Thu, Dec 03 2015, Christoph Hellwig wrote:
>> 
>> > Currently the raid5-cache code is heavily relying on GFP_NOFAIL allocations.
>> >
>> > I've looked into replacing these with mempools and biosets, and for the
>> > bio and the meta_page that's pretty trivial as they have short life times
>> > and do make guaranteed progress.  I'm massively struggling with the iounit
>> > allocation, though.  These can live on for a long time over log I/O, cache
>> > flushing and last but not least RAID I/O, and every attempt at something
>> > mempool-like results in reproducible deadlocks.  I wonder if we need to
>> > figure out some more efficient data structure to communicate the completion
>> > status that doesn't rely on these fairly long living allocations from
>> > the I/O path.
>> 
>> Presumably the root cause of these deadlocks is that the raid5d thread
>> has called
>>    handle_stripe -> ops_run_io ->r5l_write_stripe -> r5l_log_stripe
>>       -> r5l_get_meta -> r5l_new_meta
>> 
>> and r5l_new_meta is blocked on memory allocation, which won't complete
>> until some raid5 stripes get written out, which requires raid5d to do
>> something more useful than sitting and waiting.
>> 
>> I suspect a good direction towards a solution would be to allow the
>> memory allocation to fail, to cleanly propagate that failure indication
>> up through r5l_log_stripe to r5l_write_stripe which falls back to adding
>> the stripe_head to ->no_space_stripes.
>> 
>> Then we only release stripes from no_space_stripes when a memory
>> allocation might succeed.
>> 
>> There are lots of missing details, and possibly we would need a separate
>> list rather than re-using no_space_stripes.
>> But the key idea is that raid5d should never block (except beneath
>> submit_bio on some other device) and when it cannot make progress
>> without blocking, it should queue the stripe_head for later handling.
>> 
>> Does that make sense?
>
> It does remove the scary __GFP_NOFAIL, but the approach is essentially
> idential to a 'retry after allocation failure'. Why not just let the mm
> (with __GFP_NOFAIL) to do the retry then?
>

Because deadlocks.

If raid5d is waiting for the mm to allocate memory, then it cannot
retire write requests which could free up memory.

NeilBrown

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC] using mempools for raid5-cache
  2015-12-09  1:28   ` Shaohua Li
  2015-12-09  6:34     ` NeilBrown
@ 2015-12-09 13:51     ` Wols Lists
  1 sibling, 0 replies; 16+ messages in thread
From: Wols Lists @ 2015-12-09 13:51 UTC (permalink / raw)
  To: Shaohua Li, NeilBrown; +Cc: Christoph Hellwig, linux-raid

On 09/12/15 01:28, Shaohua Li wrote:
> It does remove the scary __GFP_NOFAIL, but the approach is essentially
> idential to a 'retry after allocation failure'. Why not just let the mm
> (with __GFP_NOFAIL) to do the retry then?

Because that's almost as dangerous?

Forgive me if I'm talking through my hat, but I remember a fairly recent
LWN article about kernel memory allocation; the gist was that small
allocations never fail in practice, and that the actual behaviour is
COMPLETELY different from what most users THINK is happening.

(iirc, the actual behaviour was NOFAIL, whether it was requested or not,
so if you think that adding NOFAIL to your code is going to help, you
may be in for a nasty shock.)

Whatever, if the disk code is prone to deadlocks, and it's calling
memory code which is prone to deadlocks, then better safe than sorry...

Cheers,
Wol

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC] using mempools for raid5-cache
  2015-12-09  6:34     ` NeilBrown
@ 2015-12-10 23:40       ` Shaohua Li
  2015-12-11  0:09         ` NeilBrown
  0 siblings, 1 reply; 16+ messages in thread
From: Shaohua Li @ 2015-12-10 23:40 UTC (permalink / raw)
  To: NeilBrown; +Cc: Christoph Hellwig, linux-raid

On Wed, Dec 09, 2015 at 05:34:45PM +1100, NeilBrown wrote:
> On Wed, Dec 09 2015, Shaohua Li wrote:
> 
> > On Wed, Dec 09, 2015 at 11:36:30AM +1100, NeilBrown wrote:
> >> On Thu, Dec 03 2015, Christoph Hellwig wrote:
> >> 
> >> > Currently the raid5-cache code is heavily relying on GFP_NOFAIL allocations.
> >> >
> >> > I've looked into replacing these with mempools and biosets, and for the
> >> > bio and the meta_page that's pretty trivial as they have short life times
> >> > and do make guaranteed progress.  I'm massively struggling with the iounit
> >> > allocation, though.  These can live on for a long time over log I/O, cache
> >> > flushing and last but not least RAID I/O, and every attempt at something
> >> > mempool-like results in reproducible deadlocks.  I wonder if we need to
> >> > figure out some more efficient data structure to communicate the completion
> >> > status that doesn't rely on these fairly long living allocations from
> >> > the I/O path.
> >> 
> >> Presumably the root cause of these deadlocks is that the raid5d thread
> >> has called
> >>    handle_stripe -> ops_run_io ->r5l_write_stripe -> r5l_log_stripe
> >>       -> r5l_get_meta -> r5l_new_meta
> >> 
> >> and r5l_new_meta is blocked on memory allocation, which won't complete
> >> until some raid5 stripes get written out, which requires raid5d to do
> >> something more useful than sitting and waiting.
> >> 
> >> I suspect a good direction towards a solution would be to allow the
> >> memory allocation to fail, to cleanly propagate that failure indication
> >> up through r5l_log_stripe to r5l_write_stripe which falls back to adding
> >> the stripe_head to ->no_space_stripes.
> >> 
> >> Then we only release stripes from no_space_stripes when a memory
> >> allocation might succeed.
> >> 
> >> There are lots of missing details, and possibly we would need a separate
> >> list rather than re-using no_space_stripes.
> >> But the key idea is that raid5d should never block (except beneath
> >> submit_bio on some other device) and when it cannot make progress
> >> without blocking, it should queue the stripe_head for later handling.
> >> 
> >> Does that make sense?
> >
> > It does remove the scary __GFP_NOFAIL, but the approach is essentially
> > idential to a 'retry after allocation failure'. Why not just let the mm
> > (with __GFP_NOFAIL) to do the retry then?
> >
> 
> Because deadlocks.
> 
> If raid5d is waiting for the mm to allocated memory, then it cannot
> retire write requests which could free up memory.

Ok, I missed the deadlock issue.  It looks like raid5d can't sleep waiting
for memory.  That means the mempool for the metadata page/bio doesn't work
either, as raid5d might block and not dispatch previous I/O.  Your proposal
looks reasonable.

Would keeping NOFAIL and having a separate thread run r5l_log_stripe
work too?

Thanks,
Shaohua

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC] using mempools for raid5-cache
  2015-12-10 23:40       ` Shaohua Li
@ 2015-12-11  0:09         ` NeilBrown
  2015-12-11  1:10           ` Shaohua Li
  0 siblings, 1 reply; 16+ messages in thread
From: NeilBrown @ 2015-12-11  0:09 UTC (permalink / raw)
  To: Shaohua Li; +Cc: Christoph Hellwig, linux-raid

On Fri, Dec 11 2015, Shaohua Li wrote:

> On Wed, Dec 09, 2015 at 05:34:45PM +1100, NeilBrown wrote:
>> On Wed, Dec 09 2015, Shaohua Li wrote:
>> 
>> > On Wed, Dec 09, 2015 at 11:36:30AM +1100, NeilBrown wrote:
>> >> On Thu, Dec 03 2015, Christoph Hellwig wrote:
>> >> 
>> >> > Currently the raid5-cache code is heavily relying on GFP_NOFAIL allocations.
>> >> >
>> >> > I've looked into replacing these with mempools and biosets, and for the
>> >> > bio and the meta_page that's pretty trivial as they have short life times
>> >> > and do make guaranteed progress.  I'm massively struggling with the iounit
>> >> > allocation, though.  These can live on for a long time over log I/O, cache
>> >> > flushing and last but not least RAID I/O, and every attempt at something
>> >> > mempool-like results in reproducible deadlocks.  I wonder if we need to
>> >> > figure out some more efficient data structure to communicate the completion
>> >> > status that doesn't rely on these fairly long living allocations from
>> >> > the I/O path.
>> >> 
>> >> Presumably the root cause of these deadlocks is that the raid5d thread
>> >> has called
>> >>    handle_stripe -> ops_run_io ->r5l_write_stripe -> r5l_log_stripe
>> >>       -> r5l_get_meta -> r5l_new_meta
>> >> 
>> >> and r5l_new_meta is blocked on memory allocation, which won't complete
>> >> until some raid5 stripes get written out, which requires raid5d to do
>> >> something more useful than sitting and waiting.
>> >> 
>> >> I suspect a good direction towards a solution would be to allow the
>> >> memory allocation to fail, to cleanly propagate that failure indication
>> >> up through r5l_log_stripe to r5l_write_stripe which falls back to adding
>> >> the stripe_head to ->no_space_stripes.
>> >> 
>> >> Then we only release stripes from no_space_stripes when a memory
>> >> allocation might succeed.
>> >> 
>> >> There are lots of missing details, and possibly we would need a separate
>> >> list rather than re-using no_space_stripes.
>> >> But the key idea is that raid5d should never block (except beneath
>> >> submit_bio on some other device) and when it cannot make progress
>> >> without blocking, it should queue the stripe_head for later handling.
>> >> 
>> >> Does that make sense?
>> >
>> > It does remove the scary __GFP_NOFAIL, but the approach is essentially
>> > idential to a 'retry after allocation failure'. Why not just let the mm
>> > (with __GFP_NOFAIL) to do the retry then?
>> >
>> 
>> Because deadlocks.
>> 
>> If raid5d is waiting for the mm to allocated memory, then it cannot
>> retire write requests which could free up memory.
>
> Ok, I missed the deadlock issue. Looks raid5d can't sleep waitting for
> memory. That means the mempool for metadata page/bio doesn't work too,
> as raid5d might block and not dispatch previous IO. Your proposal looks
> reasonable.

raid5d mustn't sleep waiting for something that only raid5d can provide.

A bio allocated from a mempool will be returned to that mempool when the
IO request completes (if the end_io routine calls bio_put, which it
does).
So when waiting on a mempool for a bio, you are only waiting for some
underlying device to perform IO, you are not waiting for anything that
raid5d might do.  So that is safe.

Similarly the meta_page is freed in the end_io function, so it is safe
to wait for that.
Note that the important feature of a mempool is not so much that it
pre-allocates some memory.  The important point is that when that
pre-allocated memory is freed it can *only* be used by the owner of the
mempool.
By contrast memory allocated with NOFAIL, when freed might be used by
any other caller anywhere in the kernel.
So waiting on a mempool means waiting for certain well defined events.
Waiting in NOFAIL could end up waiting for almost anything.  So it is
much harder to reason about deadlocks.

The bio and the meta_page contrast with the r5l_io_unit which isn't
freed by the first end_io, but remains until raid5d has done some more
work.  So raid5d cannot wait for a r5l_io_unit to be freed.
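
A toy user-space model of that ownership property (the names and
structure are made up for the illustration; this is not the kernel
mempool API): elements freed back to the pool refill its private reserve
first, so a waiter on the pool depends only on the pool's own users
completing, never on unrelated allocations elsewhere:

    #include <stdlib.h>

    #define RESERVE 2

    struct pool {
        void *reserve[RESERVE]; /* pre-allocated elements owned by this pool */
        int nr_free;
    };

    static void pool_init(struct pool *p, size_t size)
    {
        for (p->nr_free = 0; p->nr_free < RESERVE; p->nr_free++)
            p->reserve[p->nr_free] = malloc(size);
    }

    /*
     * Try the general allocator first and fall back to the private
     * reserve.  A real mempool would sleep here until pool_free()
     * returns an element, instead of returning NULL.
     */
    static void *pool_alloc(struct pool *p, size_t size)
    {
        void *el = malloc(size);

        if (!el && p->nr_free > 0)
            el = p->reserve[--p->nr_free];
        return el;
    }

    /*
     * Freed elements top up the reserve before going back to the
     * general allocator, so they can only be reused through this pool;
     * memory released by a plain free() may be grabbed by anyone.
     */
    static void pool_free(struct pool *p, void *el)
    {
        if (p->nr_free < RESERVE)
            p->reserve[p->nr_free++] = el;
        else
            free(el);
    }

    int main(void)
    {
        struct pool meta_pool;
        void *page;

        pool_init(&meta_pool, 4096);
        page = pool_alloc(&meta_pool, 4096);    /* would hold a metadata block */
        pool_free(&meta_pool, page);            /* from the end_io path */
        return 0;
    }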

NeilBrown

>
> Would keeping NOFAIL and having a separate thread got r5l_log_stripe
> work too?
>
> Thanks,
> Shaohua

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC] using mempools for raid5-cache
  2015-12-11  0:09         ` NeilBrown
@ 2015-12-11  1:10           ` Shaohua Li
  2015-12-11  1:56             ` NeilBrown
  0 siblings, 1 reply; 16+ messages in thread
From: Shaohua Li @ 2015-12-11  1:10 UTC (permalink / raw)
  To: NeilBrown; +Cc: Christoph Hellwig, linux-raid

On Fri, Dec 11, 2015 at 11:09:58AM +1100, NeilBrown wrote:
> On Fri, Dec 11 2015, Shaohua Li wrote:
> 
> > On Wed, Dec 09, 2015 at 05:34:45PM +1100, NeilBrown wrote:
> >> On Wed, Dec 09 2015, Shaohua Li wrote:
> >> 
> >> > On Wed, Dec 09, 2015 at 11:36:30AM +1100, NeilBrown wrote:
> >> >> On Thu, Dec 03 2015, Christoph Hellwig wrote:
> >> >> 
> >> >> > Currently the raid5-cache code is heavily relying on GFP_NOFAIL allocations.
> >> >> >
> >> >> > I've looked into replacing these with mempools and biosets, and for the
> >> >> > bio and the meta_page that's pretty trivial as they have short life times
> >> >> > and do make guaranteed progress.  I'm massively struggling with the iounit
> >> >> > allocation, though.  These can live on for a long time over log I/O, cache
> >> >> > flushing and last but not least RAID I/O, and every attempt at something
> >> >> > mempool-like results in reproducible deadlocks.  I wonder if we need to
> >> >> > figure out some more efficient data structure to communicate the completion
> >> >> > status that doesn't rely on these fairly long living allocations from
> >> >> > the I/O path.
> >> >> 
> >> >> Presumably the root cause of these deadlocks is that the raid5d thread
> >> >> has called
> >> >>    handle_stripe -> ops_run_io ->r5l_write_stripe -> r5l_log_stripe
> >> >>       -> r5l_get_meta -> r5l_new_meta
> >> >> 
> >> >> and r5l_new_meta is blocked on memory allocation, which won't complete
> >> >> until some raid5 stripes get written out, which requires raid5d to do
> >> >> something more useful than sitting and waiting.
> >> >> 
> >> >> I suspect a good direction towards a solution would be to allow the
> >> >> memory allocation to fail, to cleanly propagate that failure indication
> >> >> up through r5l_log_stripe to r5l_write_stripe which falls back to adding
> >> >> the stripe_head to ->no_space_stripes.
> >> >> 
> >> >> Then we only release stripes from no_space_stripes when a memory
> >> >> allocation might succeed.
> >> >> 
> >> >> There are lots of missing details, and possibly we would need a separate
> >> >> list rather than re-using no_space_stripes.
> >> >> But the key idea is that raid5d should never block (except beneath
> >> >> submit_bio on some other device) and when it cannot make progress
> >> >> without blocking, it should queue the stripe_head for later handling.
> >> >> 
> >> >> Does that make sense?
> >> >
> >> > It does remove the scary __GFP_NOFAIL, but the approach is essentially
> >> > idential to a 'retry after allocation failure'. Why not just let the mm
> >> > (with __GFP_NOFAIL) to do the retry then?
> >> >
> >> 
> >> Because deadlocks.
> >> 
> >> If raid5d is waiting for the mm to allocated memory, then it cannot
> >> retire write requests which could free up memory.
> >
> > Ok, I missed the deadlock issue. Looks raid5d can't sleep waitting for
> > memory. That means the mempool for metadata page/bio doesn't work too,
> > as raid5d might block and not dispatch previous IO. Your proposal looks
> > reasonable.
> 
> raid5d mustn't sleep waiting for something that only raid5d can provide.
> 
> A bio allocated from a mempool will be returned to that mempool when the
> IO request completes (if the end_io routine calls bio_put, which it
> does).
> So when waiting on a mempool for a bio, you are only waiting for some
> underlying device to perform IO, you are not waiting for anything that
> raid5d might do.  So that is safe.
> 
> Similarly the meta_page is freed in the end_io function, so it is safe
> to wait for that.
> Note that the important feature of a mempool is not so much that it
> pre-allocates some memory.  The important point is that when that
> pre-allocated memory is freed it can *only* be used by the owner of the
> mempool.
> By contrast memory allocated with NOFAIL, when freed might be used by
> any other caller anywhere in the kernel.
> So waiting on a mempool means waiting for certain well defined events.
> Waiting in NOFAIL could end up waiting for almost anything.  So it is
> much harder to reason about deadlocks.
> 
> The bio and the meta_page contrast with the r5l_io_unit which isn't
> freed by the first end_io, but remains until raid5d has done some more
> work.  So raid5d cannot wait for a r5l_io_unit to be freed.

I'm confused, sorry: aren't 2 bios already enough to avoid the deadlock?
Can you take a peek at Christoph's patches? I'll work on a patch for
r5l_io_unit.

Thanks,
Shaohua

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC] using mempools for raid5-cache
  2015-12-11  1:10           ` Shaohua Li
@ 2015-12-11  1:56             ` NeilBrown
  0 siblings, 0 replies; 16+ messages in thread
From: NeilBrown @ 2015-12-11  1:56 UTC (permalink / raw)
  To: Shaohua Li; +Cc: Christoph Hellwig, linux-raid

On Fri, Dec 11 2015, Shaohua Li wrote:

> On Fri, Dec 11, 2015 at 11:09:58AM +1100, NeilBrown wrote:
>> On Fri, Dec 11 2015, Shaohua Li wrote:
>> 
>> > On Wed, Dec 09, 2015 at 05:34:45PM +1100, NeilBrown wrote:
>> >> On Wed, Dec 09 2015, Shaohua Li wrote:
>> >> 
>> >> > On Wed, Dec 09, 2015 at 11:36:30AM +1100, NeilBrown wrote:
>> >> >> On Thu, Dec 03 2015, Christoph Hellwig wrote:
>> >> >> 
>> >> >> > Currently the raid5-cache code is heavily relying on GFP_NOFAIL allocations.
>> >> >> >
>> >> >> > I've looked into replacing these with mempools and biosets, and for the
>> >> >> > bio and the meta_page that's pretty trivial as they have short life times
>> >> >> > and do make guaranteed progress.  I'm massively struggling with the iounit
>> >> >> > allocation, though.  These can live on for a long time over log I/O, cache
>> >> >> > flushing and last but not least RAID I/O, and every attempt at something
>> >> >> > mempool-like results in reproducible deadlocks.  I wonder if we need to
>> >> >> > figure out some more efficient data structure to communicate the completion
>> >> >> > status that doesn't rely on these fairly long living allocations from
>> >> >> > the I/O path.
>> >> >> 
>> >> >> Presumably the root cause of these deadlocks is that the raid5d thread
>> >> >> has called
>> >> >>    handle_stripe -> ops_run_io ->r5l_write_stripe -> r5l_log_stripe
>> >> >>       -> r5l_get_meta -> r5l_new_meta
>> >> >> 
>> >> >> and r5l_new_meta is blocked on memory allocation, which won't complete
>> >> >> until some raid5 stripes get written out, which requires raid5d to do
>> >> >> something more useful than sitting and waiting.
>> >> >> 
>> >> >> I suspect a good direction towards a solution would be to allow the
>> >> >> memory allocation to fail, to cleanly propagate that failure indication
>> >> >> up through r5l_log_stripe to r5l_write_stripe which falls back to adding
>> >> >> the stripe_head to ->no_space_stripes.
>> >> >> 
>> >> >> Then we only release stripes from no_space_stripes when a memory
>> >> >> allocation might succeed.
>> >> >> 
>> >> >> There are lots of missing details, and possibly we would need a separate
>> >> >> list rather than re-using no_space_stripes.
>> >> >> But the key idea is that raid5d should never block (except beneath
>> >> >> submit_bio on some other device) and when it cannot make progress
>> >> >> without blocking, it should queue the stripe_head for later handling.
>> >> >> 
>> >> >> Does that make sense?
>> >> >
>> >> > It does remove the scary __GFP_NOFAIL, but the approach is essentially
>> >> > idential to a 'retry after allocation failure'. Why not just let the mm
>> >> > (with __GFP_NOFAIL) to do the retry then?
>> >> >
>> >> 
>> >> Because deadlocks.
>> >> 
>> >> If raid5d is waiting for the mm to allocated memory, then it cannot
>> >> retire write requests which could free up memory.
>> >
>> > Ok, I missed the deadlock issue. Looks raid5d can't sleep waitting for
>> > memory. That means the mempool for metadata page/bio doesn't work too,
>> > as raid5d might block and not dispatch previous IO. Your proposal looks
>> > reasonable.
>> 
>> raid5d mustn't sleep waiting for something that only raid5d can provide.
>> 
>> A bio allocated from a mempool will be returned to that mempool when the
>> IO request completes (if the end_io routine calls bio_put, which it
>> does).
>> So when waiting on a mempool for a bio, you are only waiting for some
>> underlying device to perform IO, you are not waiting for anything that
>> raid5d might do.  So that is safe.
>> 
>> Similarly the meta_page is freed in the end_io function, so it is safe
>> to wait for that.
>> Note that the important feature of a mempool is not so much that it
>> pre-allocates some memory.  The important point is that when that
>> pre-allocated memory is freed it can *only* be used by the owner of the
>> mempool.
>> By contrast memory allocated with NOFAIL, when freed might be used by
>> any other caller anywhere in the kernel.
>> So waiting on a mempool means waiting for certain well defined events.
>> Waiting in NOFAIL could end up waiting for almost anything.  So it is
>> much harder to reason about deadlocks.
>> 
>> The bio and the meta_page contrast with the r5l_io_unit which isn't
>> freed by the first end_io, but remains until raid5d has done some more
>> work.  So raid5d cannot wait for a r5l_io_unit to be freed.
>
> I'm confused 2 bios are already enough to avoid deadlock, sorry. Can you
> peek Christoph's patches? I'll work on a patch for r5l_io_unit.
>

I don't understand what you are trying to say.
Yes, a bio pool of 2 bios is (I believe) enough to avoid a deadlock when
allocating a bio.  It has no effect on any deadlock involved in allocating
an r5l_io_unit.
I have looked at Christoph's patches.  What in particular did you hope I
would see?

NeilBrown

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/2] raid5-cache: use a bio_set
  2015-12-08 23:22   ` NeilBrown
@ 2015-12-14 21:12     ` Christoph Hellwig
  0 siblings, 0 replies; 16+ messages in thread
From: Christoph Hellwig @ 2015-12-14 21:12 UTC (permalink / raw)
  To: NeilBrown; +Cc: shli, linux-raid

On Wed, Dec 09, 2015 at 10:22:12AM +1100, NeilBrown wrote:
> > +/*
> > + * We only need 2 bios per I/O unit to make progress, but ensure we
> > + * have a few more available to not get too tight.
> > + */
> > +#define R5L_POOL_SIZE	1024
> > +
> 
> I'm really suspicious of big pool sizes.
> The memory allocated to the pool is almost never used - only where no
> other memory is available - so large pools are largely wasted.

I originally had 16 here, and incremented it for some debugging, which
then stuck.  I retested with 4 and that works as well.  That will be
included in the next resend.

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread

Thread overview: 16+ messages
2015-12-02 16:10 [RFC] using mempools for raid5-cache Christoph Hellwig
2015-12-02 16:10 ` [PATCH 1/2] raid5-cache: use a bio_set Christoph Hellwig
2015-12-03  5:01   ` Shaohua Li
2015-12-08 23:22   ` NeilBrown
2015-12-14 21:12     ` Christoph Hellwig
2015-12-02 16:10 ` [PATCH 2/2] raid5-cache: use a mempool for the metadata block Christoph Hellwig
2015-12-08 23:27   ` NeilBrown
2015-12-03  4:49 ` [RFC] using mempools for raid5-cache Shaohua Li
2015-12-09  0:36 ` NeilBrown
2015-12-09  1:28   ` Shaohua Li
2015-12-09  6:34     ` NeilBrown
2015-12-10 23:40       ` Shaohua Li
2015-12-11  0:09         ` NeilBrown
2015-12-11  1:10           ` Shaohua Li
2015-12-11  1:56             ` NeilBrown
2015-12-09 13:51     ` Wols Lists
