From: Hannes Reinecke <hare@suse.de>
To: Nilay Shroff <nilay@linux.ibm.com>, linux-block@vger.kernel.org
Cc: hch@lst.de, ming.lei@redhat.com, axboe@kernel.dk,
	sth@linux.ibm.com, lkp@intel.com, gjoyce@ibm.com
Subject: Re: [PATCHv6 2/3] block: fix lockdep warning caused by lock dependency in elv_iosched_store
Date: Mon, 30 Jun 2025 08:20:28 +0200
Message-ID: <d8e6702d-8787-4e85-9770-e415f975e266@suse.de>
In-Reply-To: <20250630054756.54532-3-nilay@linux.ibm.com>

On 6/30/25 07:21, Nilay Shroff wrote:
> Recent lockdep reports [1] have revealed a potential deadlock caused by a
> lock dependency between the percpu allocator lock and the elevator lock.
> This issue can be avoided by ensuring that the allocation and release of
> scheduler tags (sched_tags) are performed outside the elevator lock.
> Furthermore, the queue does not need to remain frozen during these
> operations.
> 
> To address this, move all sched_tags allocations and deallocations outside
> of both the ->elevator_lock and the ->freeze_lock. Since the lifetime of
> the elevator queue and its associated sched_tags is closely tied, the
> allocated sched_tags are now stored in the elevator queue structure. Then,
> during the actual elevator switch (which runs under ->freeze_lock and
> ->elevator_lock), the pre-allocated sched_tags are assigned to the
> appropriate q->hctx. Once the elevator switch is complete and the locks
> are released, the old elevator queue and its associated sched_tags are
> freed.
> 
> This commit specifically addresses the allocation/deallocation of sched_tags
> during elevator switching. Note that sched_tags may also be allocated
> in other contexts, such as during nr_hw_queues updates. Supporting that
> use case will require batch allocation/deallocation, which will be handled
> in a follow-up patch.
> 
> This restructuring ensures that sched_tags memory management occurs
> entirely outside of the ->elevator_lock and ->freeze_lock context,
> eliminating the lock dependency problem seen during scheduler updates.
> 
> [1] https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/
> 
> Reported-by: Stefan Haberland <sth@linux.ibm.com>
> Closes: https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/
> Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
> ---
>   block/blk-mq-sched.c | 155 +++++++++++++++++++++++--------------------
>   block/blk-mq-sched.h |   8 ++-
>   block/elevator.c     |  47 +++++++++++--
>   block/elevator.h     |  14 +++-
>   4 files changed, 144 insertions(+), 80 deletions(-)
> 
> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> index 359e0704e09b..2d6d1ebdd8fb 100644
> --- a/block/blk-mq-sched.c
> +++ b/block/blk-mq-sched.c
> @@ -374,64 +374,17 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
>   }
>   EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);
>   
> -static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q,
> -					  struct blk_mq_hw_ctx *hctx,
> -					  unsigned int hctx_idx)
> -{
> -	if (blk_mq_is_shared_tags(q->tag_set->flags)) {
> -		hctx->sched_tags = q->sched_shared_tags;
> -		return 0;
> -	}
> -
> -	hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx,
> -						    q->nr_requests);
> -
> -	if (!hctx->sched_tags)
> -		return -ENOMEM;
> -	return 0;
> -}
> -
> -static void blk_mq_exit_sched_shared_tags(struct request_queue *queue)
> -{
> -	blk_mq_free_rq_map(queue->sched_shared_tags);
> -	queue->sched_shared_tags = NULL;
> -}
> -
>   /* called in queue's release handler, tagset has gone away */
>   static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags)
>   {
>   	struct blk_mq_hw_ctx *hctx;
>   	unsigned long i;
>   
> -	queue_for_each_hw_ctx(q, hctx, i) {
> -		if (hctx->sched_tags) {
> -			if (!blk_mq_is_shared_tags(flags))
> -				blk_mq_free_rq_map(hctx->sched_tags);
> -			hctx->sched_tags = NULL;
> -		}
> -	}
> +	queue_for_each_hw_ctx(q, hctx, i)
> +		hctx->sched_tags = NULL;
>   
>   	if (blk_mq_is_shared_tags(flags))
> -		blk_mq_exit_sched_shared_tags(q);
> -}
> -
> -static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
> -{
> -	struct blk_mq_tag_set *set = queue->tag_set;
> -
> -	/*
> -	 * Set initial depth at max so that we don't need to reallocate for
> -	 * updating nr_requests.
> -	 */
> -	queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set,
> -						BLK_MQ_NO_HCTX_IDX,
> -						MAX_SCHED_RQ);
> -	if (!queue->sched_shared_tags)
> -		return -ENOMEM;
> -
> -	blk_mq_tag_update_sched_shared_tags(queue);
> -
> -	return 0;
> +		q->sched_shared_tags = NULL;
>   }
>   
>   void blk_mq_sched_reg_debugfs(struct request_queue *q)
> @@ -458,8 +411,75 @@ void blk_mq_sched_unreg_debugfs(struct request_queue *q)
>   	mutex_unlock(&q->debugfs_mutex);
>   }
>   
> +void blk_mq_free_sched_tags(struct elevator_tags *et,
> +		struct blk_mq_tag_set *set)
> +{
> +	unsigned long i;
> +
> +	/* Shared tags are stored at index 0 in @tags. */
> +	if (blk_mq_is_shared_tags(set->flags))
> +		blk_mq_free_map_and_rqs(set, et->tags[0], BLK_MQ_NO_HCTX_IDX);
> +	else {
> +		for (i = 0; i < et->nr_hw_queues; i++)
> +			blk_mq_free_map_and_rqs(set, et->tags[i], i);
> +	}
> +
> +	kfree(et);
> +}
> +
> +struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
> +		unsigned int nr_hw_queues)
> +{
> +	unsigned int nr_tags;
> +	int i;
> +	struct elevator_tags *et;
> +	gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
> +
> +	if (blk_mq_is_shared_tags(set->flags))
> +		nr_tags = 1;
> +	else
> +		nr_tags = nr_hw_queues;
> +
> +	et = kmalloc(sizeof(struct elevator_tags) +
> +			nr_tags * sizeof(struct blk_mq_tags *), gfp);
> +	if (!et)
> +		return NULL;
> +	/*
> +	 * Default to double of smaller one between hw queue_depth and
> +	 * 128, since we don't split into sync/async like the old code
> +	 * did. Additionally, this is a per-hw queue depth.
> +	 */
> +	et->nr_requests = 2 * min_t(unsigned int, set->queue_depth,
> +			BLKDEV_DEFAULT_RQ);
> +	et->nr_hw_queues = nr_hw_queues;
> +
> +	if (blk_mq_is_shared_tags(set->flags)) {
> +		/* Shared tags are stored at index 0 in @tags. */
> +		et->tags[0] = blk_mq_alloc_map_and_rqs(set, BLK_MQ_NO_HCTX_IDX,
> +					MAX_SCHED_RQ);
> +		if (!et->tags[0])
> +			goto out;
> +	} else {
> +		for (i = 0; i < et->nr_hw_queues; i++) {
> +			et->tags[i] = blk_mq_alloc_map_and_rqs(set, i,
> +					et->nr_requests);
> +			if (!et->tags[i])
> +				goto out_unwind;
> +		}
> +	}
> +
> +	return et;
> +out_unwind:
> +	while (--i >= 0)
> +		blk_mq_free_map_and_rqs(set, et->tags[i], i);
> +out:
> +	kfree(et);
> +	return NULL;
> +}
> +

As smatch stated, the unwind pattern is a bit odd.
Maybe move the unwind into the 'else' branch, and use a conditional
to invoke it:

if (i < et->nr_hw_queues)
   while (--i >= 0)
     blk_mq_free_map_and_rqs(set, et->tags[i], i);
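
A rough sketch of what that could look like (untested, only to
illustrate the idea; the shared-tags branch would then also need to
free 'et' and return directly instead of jumping to the 'out' label):

	} else {
		for (i = 0; i < et->nr_hw_queues; i++) {
			et->tags[i] = blk_mq_alloc_map_and_rqs(set, i,
					et->nr_requests);
			if (!et->tags[i])
				break;
		}
		if (i < et->nr_hw_queues) {
			/* allocation failed part-way; free what we already got */
			while (--i >= 0)
				blk_mq_free_map_and_rqs(set, et->tags[i], i);
			kfree(et);
			return NULL;
		}
	}

That keeps the unwind right next to the loop that did the allocations,
and with the matching change to the shared-tags branch both goto labels
can go away.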

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
