All of lore.kernel.org
 help / color / mirror / Atom feed
From: Ming Lei <ming.lei@redhat.com>
To: Nilay Shroff <nilay@linux.ibm.com>
Cc: linux-block@vger.kernel.org, hch@lst.de, axboe@kernel.dk,
	yi.zhang@redhat.com, czhong@redhat.com, gjoyce@ibm.com
Subject: Re: [PATCHv3 3/4] block: introduce alloc_sched_data and free_sched_data elevator methods
Date: Mon, 3 Nov 2025 12:08:35 +0800	[thread overview]
Message-ID: <aQgqw-DX1A3R3wuN@fedora> (raw)
In-Reply-To: <20251029103622.205607-4-nilay@linux.ibm.com>

On Wed, Oct 29, 2025 at 04:06:16PM +0530, Nilay Shroff wrote:
> The recent lockdep splat [1] highlights a potential deadlock risk
> involving ->elevator_lock and ->freeze_lock dependencies on
> pcpu_alloc_mutex. The trace shows that the issue occurs when the Kyber scheduler
> allocates dynamic memory for its elevator data during initialization.
> 
> To address this, introduce two new elevator operation callbacks:
> ->alloc_sched_data and ->free_sched_data.
> 
> When an elevator implements these methods, they are invoked during
> scheduler switch before acquiring ->freeze_lock and ->elevator_lock.
> This allows safe allocation and deallocation of per-elevator data
> without holding locks that could depend on pcpu_alloc_mutex, effectively
> breaking the lock dependency chain and avoiding the reported deadlock
> scenario.
> 
> [1] https://lore.kernel.org/all/CAGVVp+VNW4M-5DZMNoADp6o2VKFhi7KxWpTDkcnVyjO0=-D5+A@mail.gmail.com/
> 
> Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
> ---
>  block/blk-mq-sched.c | 30 +++++++++++++++++++++++-------
>  block/blk-mq-sched.h | 25 +++++++++++++++++++++++--
>  block/elevator.c     | 35 +++++++++++++++++++++++++----------
>  block/elevator.h     |  4 ++++
>  4 files changed, 75 insertions(+), 19 deletions(-)
> 
> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> index 6db45b0819e6..4376d0ddbd1e 100644
> --- a/block/blk-mq-sched.c
> +++ b/block/blk-mq-sched.c
> @@ -428,12 +428,17 @@ void blk_mq_free_sched_tags(struct elevator_tags *et,
>  }
>  
>  void blk_mq_free_sched_res(struct elevator_resources *res,
> +		struct elevator_type *type,
>  		struct blk_mq_tag_set *set)
>  {
>  	if (res->et) {
>  		blk_mq_free_sched_tags(res->et, set);
>  		res->et = NULL;
>  	}
> +	if (res->data) {
> +		blk_mq_free_sched_data(type, res->data);
> +		res->data = NULL;
> +	}
>  }
>  
>  void blk_mq_free_sched_res_batch(struct xarray *elv_tbl,
> @@ -458,7 +463,7 @@ void blk_mq_free_sched_res_batch(struct xarray *elv_tbl,
>  				WARN_ON_ONCE(1);
>  				continue;
>  			}
> -			blk_mq_free_sched_res(&ctx->res, set);
> +			blk_mq_free_sched_res(&ctx->res, ctx->type, set);
>  		}
>  	}
>  }
> @@ -540,15 +545,24 @@ struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
>  	return NULL;
>  }
>  
> -int blk_mq_alloc_sched_res(struct elevator_resources *res,
> -		struct blk_mq_tag_set *set, unsigned int nr_hw_queues)
> +int blk_mq_alloc_sched_res(struct request_queue *q,
> +		struct elevator_type *type,
> +		struct elevator_resources *res,
> +		struct blk_mq_tag_set *set,
> +		unsigned int nr_hw_queues)
>  {
> +	int ret;
> +
>  	res->et = blk_mq_alloc_sched_tags(set, nr_hw_queues,
>  			blk_mq_default_nr_requests(set));
>  	if (!res->et)
>  		return -ENOMEM;
>  
> -	return 0;
> +	ret = blk_mq_alloc_sched_data(q, type, &res->data);
> +	if (ret)
> +		kfree(res->et);

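Use blk_mq_free_sched_res() instead of kfree() here to avoid a memory leak:
res->et comes from blk_mq_alloc_sched_tags() and has to be released through
blk_mq_free_sched_tags(), which a bare kfree() bypasses.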

> +
> +	return ret;
>  }
>  
>  int blk_mq_alloc_sched_res_batch(struct xarray *elv_tbl,
> @@ -575,19 +589,21 @@ int blk_mq_alloc_sched_res_batch(struct xarray *elv_tbl,
>  				goto out_unwind;
>  			}
>  
> -			ret = blk_mq_alloc_sched_res(&ctx->res, set,
> -					nr_hw_queues);
> +			ret = blk_mq_alloc_sched_res(q, q->elevator->type,
> +					&ctx->res, set, nr_hw_queues);
>  			if (ret)
>  				goto out_unwind;
>  		}
>  	}
>  	return 0;
> +
>  out_unwind:
>  	list_for_each_entry_continue_reverse(q, &set->tag_list, tag_set_list) {
>  		if (q->elevator) {
>  			ctx = xa_load(elv_tbl, q->id);
>  			if (ctx)
> -				blk_mq_free_sched_res(&ctx->res, set);
> +				blk_mq_free_sched_res(&ctx->res,
> +						ctx->type, set);
>  		}
>  	}
>  	return ret;
> diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
> index 97204df76def..acd4f1355be6 100644
> --- a/block/blk-mq-sched.h
> +++ b/block/blk-mq-sched.h
> @@ -25,8 +25,11 @@ void blk_mq_sched_free_rqs(struct request_queue *q);
>  
>  struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
>  		unsigned int nr_hw_queues, unsigned int nr_requests);
> -int blk_mq_alloc_sched_res(struct elevator_resources *res,
> -		struct blk_mq_tag_set *set, unsigned int nr_hw_queues);
> +int blk_mq_alloc_sched_res(struct request_queue *q,
> +		struct elevator_type *type,
> +		struct elevator_resources *res,
> +		struct blk_mq_tag_set *set,
> +		unsigned int nr_hw_queues);
>  int blk_mq_alloc_sched_res_batch(struct xarray *elv_tbl,
>  		struct blk_mq_tag_set *set, unsigned int nr_hw_queues);
>  int blk_mq_alloc_sched_ctx_batch(struct xarray *elv_tbl,
> @@ -35,10 +38,28 @@ void blk_mq_free_sched_ctx_batch(struct xarray *elv_tbl);
>  void blk_mq_free_sched_tags(struct elevator_tags *et,
>  		struct blk_mq_tag_set *set);
>  void blk_mq_free_sched_res(struct elevator_resources *res,
> +		struct elevator_type *type,
>  		struct blk_mq_tag_set *set);
>  void blk_mq_free_sched_res_batch(struct xarray *et_table,
>  		struct blk_mq_tag_set *set);
>  
> +static inline int blk_mq_alloc_sched_data(struct request_queue *q,
> +		struct elevator_type *e, void **data)
> +{
> +	if (e && e->ops.alloc_sched_data) {
> +		*data = e->ops.alloc_sched_data(q);
> +		if (!*data)
> +			return -ENOMEM;
> +	}
> +	return 0;
> +}
> +
> +static inline void blk_mq_free_sched_data(struct elevator_type *e, void *data)
> +{
> +	if (e && e->ops.free_sched_data)
> +		e->ops.free_sched_data(data);
> +}
> +
>  static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
>  {
>  	if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
> diff --git a/block/elevator.c b/block/elevator.c
> index d5d89b202fda..8696b2a741b7 100644
> --- a/block/elevator.c
> +++ b/block/elevator.c
> @@ -135,6 +135,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
>  	mutex_init(&eq->sysfs_lock);
>  	hash_init(eq->hash);
>  	eq->et = res->et;
> +	eq->elevator_data = res->data;
>  
>  	return eq;
>  }
> @@ -617,7 +618,7 @@ static void elv_exit_and_release(struct elv_change_ctx *ctx,
>  	mutex_unlock(&q->elevator_lock);
>  	blk_mq_unfreeze_queue(q, memflags);
>  	if (e) {
> -		blk_mq_free_sched_res(&ctx->res, q->tag_set);
> +		blk_mq_free_sched_res(&ctx->res, ctx->type, q->tag_set);
>  		kobject_put(&e->kobj);
>  	}
>  }
> @@ -628,12 +629,15 @@ static int elevator_change_done(struct request_queue *q,
>  	int ret = 0;
>  
>  	if (ctx->old) {
> -		struct elevator_resources res = {.et = ctx->old->et};
> +		struct elevator_resources res = {
> +			.et = ctx->old->et,
> +			.data = ctx->old->elevator_data
> +		};
>  		bool enable_wbt = test_bit(ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT,
>  				&ctx->old->flags);
>  
>  		elv_unregister_queue(q, ctx->old);
> -		blk_mq_free_sched_res(&res, q->tag_set);
> +		blk_mq_free_sched_res(&res, ctx->old->type, q->tag_set);
>  		kobject_put(&ctx->old->kobj);
>  		if (enable_wbt)
>  			wbt_enable_default(q->disk);
> @@ -658,7 +662,8 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
>  	lockdep_assert_held(&set->update_nr_hwq_lock);
>  
>  	if (strncmp(ctx->name, "none", 4)) {
> -		ret = blk_mq_alloc_sched_res(&ctx->res, set, set->nr_hw_queues);
> +		ret = blk_mq_alloc_sched_res(q, ctx->type, &ctx->res, set,
> +				set->nr_hw_queues);
>  		if (ret)
>  			return ret;
>  	}
> @@ -681,11 +686,15 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
>  	blk_mq_unfreeze_queue(q, memflags);
>  	if (!ret)
>  		ret = elevator_change_done(q, ctx);
> +
> +	if (ctx->new) /* switching to new elevator is successful */
> +		return ret;
> +

Not necessary.


>  	/*
>  	 * Free sched resource if it's allocated but we couldn't switch elevator.
>  	 */
>  	if (!ctx->new)
> -		blk_mq_free_sched_res(&ctx->res, set);
> +		blk_mq_free_sched_res(&ctx->res, ctx->type, set);
>  
>  	return ret;
>  }
> @@ -711,11 +720,14 @@ void elv_update_nr_hw_queues(struct request_queue *q,
>  	blk_mq_unfreeze_queue_nomemrestore(q);
>  	if (!ret)
>  		WARN_ON_ONCE(elevator_change_done(q, ctx));
> +
> +	if (ctx->new) /* switching to new elevator is successful */
> +		return;

Not necessary.

>  	/*
>  	 * Free sched resource if it's allocated but we couldn't switch elevator.
>  	 */
>  	if (!ctx->new)
> -		blk_mq_free_sched_res(&ctx->res, set);
> +		blk_mq_free_sched_res(&ctx->res, ctx->type, set);
>  }
>  
>  /*
> @@ -729,7 +741,6 @@ void elevator_set_default(struct request_queue *q)
>  		.no_uevent = true,
>  	};
>  	int err;
> -	struct elevator_type *e;
>  
>  	/* now we allow to switch elevator */
>  	blk_queue_flag_clear(QUEUE_FLAG_NO_ELV_SWITCH, q);
> @@ -742,8 +753,8 @@ void elevator_set_default(struct request_queue *q)
>  	 * have multiple queues or mq-deadline is not available, default
>  	 * to "none".
>  	 */
> -	e = elevator_find_get(ctx.name);
> -	if (!e)
> +	ctx.type = elevator_find_get(ctx.name);
> +	if (!ctx.type)
>  		return;
>  
>  	if ((q->nr_hw_queues == 1 ||
> @@ -753,7 +764,7 @@ void elevator_set_default(struct request_queue *q)
>  			pr_warn("\"%s\" elevator initialization, failed %d, falling back to \"none\"\n",
>  					ctx.name, err);
>  	}
> -	elevator_put(e);
> +	elevator_put(ctx.type);
>  }
>  
>  void elevator_set_none(struct request_queue *q)
> @@ -802,6 +813,7 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf,
>  	ctx.name = strstrip(elevator_name);
>  
>  	elv_iosched_load_module(ctx.name);
> +	ctx.type = elevator_find_get(ctx.name);
>  
>  	down_read(&set->update_nr_hwq_lock);
>  	if (!blk_queue_no_elv_switch(q)) {
> @@ -812,6 +824,9 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf,
>  		ret = -ENOENT;
>  	}
>  	up_read(&set->update_nr_hwq_lock);
> +
> +	if (ctx.type)
> +		elevator_put(ctx.type);
>  	return ret;

The above change can be folded into elevator_change() in a standalone
patch, so that elv_iosched_store() is covered too.


Thanks, 
Ming


  reply	other threads:[~2025-11-03  4:08 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-10-29 10:36 [PATCHv3 0/4] block: restructure elevator switch path and fix a lockdep splat Nilay Shroff
2025-10-29 10:36 ` [PATCHv3 1/4] block: unify elevator tags and type xarrays into struct elv_change_ctx Nilay Shroff
2025-11-03  3:22   ` Ming Lei
2025-10-29 10:36 ` [PATCHv3 2/4] block: move elevator tags into struct elevator_resources Nilay Shroff
2025-11-03  3:50   ` Ming Lei
2025-11-03  7:03     ` Nilay Shroff
2025-10-29 10:36 ` [PATCHv3 3/4] block: introduce alloc_sched_data and free_sched_data elevator methods Nilay Shroff
2025-11-03  4:08   ` Ming Lei [this message]
2025-11-03  6:55     ` Nilay Shroff
2025-10-29 10:36 ` [PATCHv3 4/4] block: define alloc_sched_data and free_sched_data methods for kyber Nilay Shroff

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aQgqw-DX1A3R3wuN@fedora \
    --to=ming.lei@redhat.com \
    --cc=axboe@kernel.dk \
    --cc=czhong@redhat.com \
    --cc=gjoyce@ibm.com \
    --cc=hch@lst.de \
    --cc=linux-block@vger.kernel.org \
    --cc=nilay@linux.ibm.com \
    --cc=yi.zhang@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.