All of lore.kernel.org
 help / color / mirror / Atom feed
From: Ming Lei <ming.lei@redhat.com>
To: Nilay Shroff <nilay@linux.ibm.com>
Cc: linux-block@vger.kernel.org, hch@lst.de, axboe@kernel.dk,
	sth@linux.ibm.com, lkp@intel.com, gjoyce@ibm.com
Subject: Re: [PATCHv6 3/3] block: fix potential deadlock while running nr_hw_queue update
Date: Tue, 1 Jul 2025 11:24:12 +0800	[thread overview]
Message-ID: <aGNU3PPJ1wU--x-O@fedora> (raw)
In-Reply-To: <20250630054756.54532-4-nilay@linux.ibm.com>

On Mon, Jun 30, 2025 at 10:51:56AM +0530, Nilay Shroff wrote:
> Move scheduler tags (sched_tags) allocation and deallocation outside
> both the ->elevator_lock and ->freeze_lock when updating nr_hw_queues.
> This change breaks the dependency chain from the percpu allocator lock
> to the elevator lock, helping to prevent potential deadlocks, as
> observed in the reported lockdep splat[1].
> 
> This commit introduces batch allocation and deallocation helpers for
> sched_tags, which are now used from within __blk_mq_update_nr_hw_queues
> routine while iterating through the tagset.
> 
> With this change, all sched_tags memory management is handled entirely
> outside the ->elevator_lock and the ->freeze_lock context, thereby
> eliminating the lock dependency that could otherwise manifest during
> nr_hw_queues updates.
> 
> [1] https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/
> 
> Reported-by: Stefan Haberland <sth@linux.ibm.com>
> Closes: https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/
> Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
> ---
>  block/blk-mq-sched.c | 63 ++++++++++++++++++++++++++++++++++++++++++++
>  block/blk-mq-sched.h |  4 +++
>  block/blk-mq.c       | 11 +++++++-
>  block/blk.h          |  2 +-
>  block/elevator.c     |  4 +--
>  5 files changed, 80 insertions(+), 4 deletions(-)
> 
> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> index 2d6d1ebdd8fb..da802df34a8c 100644
> --- a/block/blk-mq-sched.c
> +++ b/block/blk-mq-sched.c
> @@ -427,6 +427,30 @@ void blk_mq_free_sched_tags(struct elevator_tags *et,
>  	kfree(et);
>  }
>  
> +void blk_mq_free_sched_tags_batch(struct xarray *et_table,
> +		struct blk_mq_tag_set *set)
> +{
> +	struct request_queue *q;
> +	struct elevator_tags *et;
> +
> +	lockdep_assert_held_write(&set->update_nr_hwq_lock);
> +
> +	list_for_each_entry(q, &set->tag_list, tag_set_list) {
> +		/*
> +		 * Accessing q->elevator without holding q->elevator_lock is
> +		 * safe because we're holding here set->update_nr_hwq_lock in
> +		 * the writer context. So, scheduler update/switch code (which
> +		 * acquires the same lock but in the reader context) can't run
> +		 * concurrently.
> +		 */
> +		if (q->elevator) {
> +			et = xa_load(et_table, q->id);
> +			if (et)
> +				blk_mq_free_sched_tags(et, set);
> +		}
> +	}
> +}
> +
>  struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
>  		unsigned int nr_hw_queues)
>  {
> @@ -477,6 +501,45 @@ struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
>  	return NULL;
>  }
>  
> +int blk_mq_alloc_sched_tags_batch(struct xarray *et_table,
> +		struct blk_mq_tag_set *set, unsigned int nr_hw_queues)
> +{
> +	struct request_queue *q;
> +	struct elevator_tags *et;
> +	gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
> +
> +	lockdep_assert_held_write(&set->update_nr_hwq_lock);
> +
> +	list_for_each_entry(q, &set->tag_list, tag_set_list) {
> +		/*
> +		 * Accessing q->elevator without holding q->elevator_lock is
> +		 * safe because we're holding here set->update_nr_hwq_lock in
> +		 * the writer context. So, scheduler update/switch code (which
> +		 * acquires the same lock but in the reader context) can't run
> +		 * concurrently.
> +		 */
> +		if (q->elevator) {
> +			et = blk_mq_alloc_sched_tags(set, nr_hw_queues);
> +			if (!et)
> +				goto out_unwind;
> +			if (xa_insert(et_table, q->id, et, gfp))
> +				goto out_free_tags;
> +		}
> +	}
> +	return 0;
> +out_free_tags:
> +	blk_mq_free_sched_tags(et, set);
> +out_unwind:
> +	list_for_each_entry_continue_reverse(q, &set->tag_list, tag_set_list) {
> +		if (q->elevator) {
> +			et = xa_load(et_table, q->id);
> +			if (et)
> +				blk_mq_free_sched_tags(et, set);
> +		}
> +	}
> +	return -ENOMEM;
> +}
> +
>  /* caller must have a reference to @e, will grab another one if successful */
>  int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
>  		struct elevator_tags *et)
> diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
> index 0cde00cd1c47..b554e1d55950 100644
> --- a/block/blk-mq-sched.h
> +++ b/block/blk-mq-sched.h
> @@ -25,8 +25,12 @@ void blk_mq_sched_free_rqs(struct request_queue *q);
>  
>  struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
>  		unsigned int nr_hw_queues);
> +int blk_mq_alloc_sched_tags_batch(struct xarray *et_table,
> +		struct blk_mq_tag_set *set, unsigned int nr_hw_queues);
>  void blk_mq_free_sched_tags(struct elevator_tags *et,
>  		struct blk_mq_tag_set *set);
> +void blk_mq_free_sched_tags_batch(struct xarray *et_table,
> +		struct blk_mq_tag_set *set);
>  
>  static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
>  {
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 4806b867e37d..a68b658ce07b 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -4972,6 +4972,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
>  	struct request_queue *q;
>  	int prev_nr_hw_queues = set->nr_hw_queues;
>  	unsigned int memflags;
> +	struct xarray et_table;
>  	int i;
>  
>  	lockdep_assert_held(&set->tag_list_lock);
> @@ -4984,6 +4985,11 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
>  		return;
>  
>  	memflags = memalloc_noio_save();
> +
> +	xa_init(&et_table);
> +	if (blk_mq_alloc_sched_tags_batch(&et_table, set, nr_hw_queues) < 0)
> +		goto out_memalloc_restore;
> +
>  	list_for_each_entry(q, &set->tag_list, tag_set_list) {
>  		blk_mq_debugfs_unregister_hctxs(q);
>  		blk_mq_sysfs_unregister_hctxs(q);
> @@ -4995,6 +5001,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
>  	if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) {
>  		list_for_each_entry(q, &set->tag_list, tag_set_list)
>  			blk_mq_unfreeze_queue_nomemrestore(q);
> +		blk_mq_free_sched_tags_batch(&et_table, set);
>  		goto reregister;
>  	}
>  
> @@ -5019,7 +5026,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
>  
>  	/* elv_update_nr_hw_queues() unfreeze queue for us */
>  	list_for_each_entry(q, &set->tag_list, tag_set_list)
> -		elv_update_nr_hw_queues(q);
> +		elv_update_nr_hw_queues(q, &et_table);
>  
>  reregister:
>  	list_for_each_entry(q, &set->tag_list, tag_set_list) {
> @@ -5029,7 +5036,9 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
>  		blk_mq_remove_hw_queues_cpuhp(q);
>  		blk_mq_add_hw_queues_cpuhp(q);
>  	}
> +out_memalloc_restore:
>  	memalloc_noio_restore(memflags);
> +	xa_destroy(&et_table);
>  
>  	/* Free the excess tags when nr_hw_queues shrink. */
>  	for (i = set->nr_hw_queues; i < prev_nr_hw_queues; i++)
> diff --git a/block/blk.h b/block/blk.h
> index 37ec459fe656..c6d1d1458388 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -321,7 +321,7 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
>  
>  bool blk_insert_flush(struct request *rq);
>  
> -void elv_update_nr_hw_queues(struct request_queue *q);
> +void elv_update_nr_hw_queues(struct request_queue *q, struct xarray *et_table);
>  void elevator_set_default(struct request_queue *q);
>  void elevator_set_none(struct request_queue *q);
>  
> diff --git a/block/elevator.c b/block/elevator.c
> index 50f4b78efe66..8ba8b869d5a4 100644
> --- a/block/elevator.c
> +++ b/block/elevator.c
> @@ -705,7 +705,7 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
>   * The I/O scheduler depends on the number of hardware queues, this forces a
>   * reattachment when nr_hw_queues changes.
>   */
> -void elv_update_nr_hw_queues(struct request_queue *q)
> +void elv_update_nr_hw_queues(struct request_queue *q, struct xarray *et_table)

et_table doesn't need to be exposed to elv_update_nr_hw_queues(); passing
a 'struct elevator_tags *' directly would be more readable. That can be
done as a follow-up improvement, though.

Anyway:

Reviewed-by: Ming Lei <ming.lei@redhat.com>



Thanks, 
Ming


  parent reply	other threads:[~2025-07-01  3:24 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-06-30  5:21 [PATCHv6 0/3] block: move sched_tags allocation/de-allocation outside of locking context Nilay Shroff
2025-06-30  5:21 ` [PATCHv6 1/3] block: move elevator queue allocation logic into blk_mq_init_sched Nilay Shroff
2025-06-30  6:13   ` Christoph Hellwig
2025-06-30  6:17   ` Hannes Reinecke
2025-06-30  5:21 ` [PATCHv6 2/3] block: fix lockdep warning caused by lock dependency in elv_iosched_store Nilay Shroff
2025-06-30  6:15   ` Christoph Hellwig
2025-06-30  6:20   ` Hannes Reinecke
2025-06-30  6:48     ` Nilay Shroff
2025-07-01  3:19   ` Ming Lei
2025-06-30  5:21 ` [PATCHv6 3/3] block: fix potential deadlock while running nr_hw_queue update Nilay Shroff
2025-06-30  6:16   ` Christoph Hellwig
2025-06-30  6:24   ` Hannes Reinecke
2025-06-30  6:57     ` Nilay Shroff
2025-07-01  3:24   ` Ming Lei [this message]
2025-07-01  5:20     ` Nilay Shroff

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aGNU3PPJ1wU--x-O@fedora \
    --to=ming.lei@redhat.com \
    --cc=axboe@kernel.dk \
    --cc=gjoyce@ibm.com \
    --cc=hch@lst.de \
    --cc=linux-block@vger.kernel.org \
    --cc=lkp@intel.com \
    --cc=nilay@linux.ibm.com \
    --cc=sth@linux.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.