Re: [PATCH v25 07/20] block/mq-deadline: Enable zoned write pipelining

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Damien Le Moal <dlemoal@kernel.org>
To: Bart Van Assche <bvanassche@acm.org>, Jens Axboe <axboe@kernel.dk>
Cc: linux-block@vger.kernel.org, linux-scsi@vger.kernel.org,
	Christoph Hellwig <hch@lst.de>
Subject: Re: [PATCH v25 07/20] block/mq-deadline: Enable zoned write pipelining
Date: Wed, 15 Oct 2025 16:31:23 +0900	[thread overview]
Message-ID: <08ce89bb-756a-4ce8-9980-ddea8baab1d1@kernel.org> (raw)
In-Reply-To: <20251014215428.3686084-8-bvanassche@acm.org>

On 2025/10/15 6:54, Bart Van Assche wrote:
> The hwq selected by blk_mq_run_hw_queues() for single-queue I/O schedulers
> depends on the CPU core that function has been called from. This may lead
> to concurrent dispatching of I/O requests on different CPU cores and hence
> may cause I/O reordering. Prevent as follows that zoned writes are
> reordered:
> - Set the ELEVATOR_FLAG_SUPPORTS_ZONED_WRITE_PIPELINING flag. This disables
>   the single hwq optimization in the block layer core.
> - Modify dd_has_work() such that it only reports that any work is pending
>   for zoned writes if the zoned writes have been submitted to the hwq that
>   has been passed as argument to dd_has_work().
> - Modify dd_dispatch_request() such that it only dispatches zoned writes
>   if the hwq argument passed to this function matches the hwq of the
>   pending zoned writes.

One of the goals of zone write plugging was to remove the dependence on IO
schedulers to control the ordering of write commands to zoned block devices.
Such change are going backward and I do not like that. What if the user sets
Kyber or bfq with your zone write pipelining ? Does it break ?

From the very light explanation above, it seems to me that what you are trying
to do can be generic in the block layer and leave mq-deadline untouched.

> 
> Cc: Damien Le Moal <dlemoal@kernel.org>
> Cc: Christoph Hellwig <hch@lst.de>
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
> ---
>  block/mq-deadline.c | 68 ++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 58 insertions(+), 10 deletions(-)
> 
> diff --git a/block/mq-deadline.c b/block/mq-deadline.c
> index 0a46d0f06f72..be6ed3d8fa36 100644
> --- a/block/mq-deadline.c
> +++ b/block/mq-deadline.c
> @@ -319,11 +319,25 @@ static struct request *dd_start_request(struct deadline_data *dd,
>  	return rq;
>  }
>  
> +/*
> + * If write pipelining is enabled, only dispatch sequential zoned writes if
> + * rq->mq_hctx == hctx.
> + */
> +static bool dd_dispatch_from_hctx(struct blk_mq_hw_ctx *hctx,
> +				  struct request *rq)
> +{
> +	struct request_queue *q = hctx->queue;
> +
> +	return !(q->limits.features & BLK_FEAT_ORDERED_HWQ) ||
> +		rq->mq_hctx == hctx || !blk_rq_is_seq_zoned_write(rq);
> +}
> +
>  /*
>   * deadline_dispatch_requests selects the best request according to
>   * read/write expire, fifo_batch, etc and with a start time <= @latest_start.
>   */
>  static struct request *__dd_dispatch_request(struct deadline_data *dd,
> +					     struct blk_mq_hw_ctx *hctx,
>  					     struct dd_per_prio *per_prio,
>  					     unsigned long latest_start)
>  {
> @@ -336,7 +350,8 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
>  	 * batches are currently reads XOR writes
>  	 */
>  	rq = deadline_next_request(dd, per_prio, dd->last_dir);
> -	if (rq && dd->batching < dd->fifo_batch) {
> +	if (rq && dd->batching < dd->fifo_batch &&
> +	    dd_dispatch_from_hctx(hctx, rq)) {
>  		/* we have a next request and are still entitled to batch */
>  		data_dir = rq_data_dir(rq);
>  		goto dispatch_request;
> @@ -396,7 +411,7 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
>  		rq = next_rq;
>  	}
>  
> -	if (!rq)
> +	if (!rq || !dd_dispatch_from_hctx(hctx, rq))
>  		return NULL;
>  
>  	dd->last_dir = data_dir;
> @@ -418,8 +433,9 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
>   * Check whether there are any requests with priority other than DD_RT_PRIO
>   * that were inserted more than prio_aging_expire jiffies ago.
>   */
> -static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd,
> -						      unsigned long now)
> +static struct request *
> +dd_dispatch_prio_aged_requests(struct deadline_data *dd,
> +			       struct blk_mq_hw_ctx *hctx, unsigned long now)
>  {
>  	struct request *rq;
>  	enum dd_prio prio;
> @@ -433,7 +449,7 @@ static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd,
>  		return NULL;
>  
>  	for (prio = DD_BE_PRIO; prio <= DD_PRIO_MAX; prio++) {
> -		rq = __dd_dispatch_request(dd, &dd->per_prio[prio],
> +		rq = __dd_dispatch_request(dd, hctx, &dd->per_prio[prio],
>  					   now - dd->prio_aging_expire);
>  		if (rq)
>  			return rq;
> @@ -466,7 +482,7 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
>  		goto unlock;
>  	}
>  
> -	rq = dd_dispatch_prio_aged_requests(dd, now);
> +	rq = dd_dispatch_prio_aged_requests(dd, hctx, now);
>  	if (rq)
>  		goto unlock;
>  
> @@ -475,7 +491,7 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
>  	 * requests if any higher priority requests are pending.
>  	 */
>  	for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
> -		rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now);
> +		rq = __dd_dispatch_request(dd, hctx, &dd->per_prio[prio], now);
>  		if (rq || dd_queued(dd, prio))
>  			break;
>  	}
> @@ -575,6 +591,8 @@ static int dd_init_sched(struct request_queue *q, struct elevator_queue *eq)
>  	/* We dispatch from request queue wide instead of hw queue */
>  	blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);
>  
> +	set_bit(ELEVATOR_FLAG_SUPPORTS_ZONED_WRITE_PIPELINING, &eq->flags);
> +
>  	q->elevator = eq;
>  	dd_depth_updated(q);
>  	return 0;
> @@ -731,10 +749,40 @@ static void dd_finish_request(struct request *rq)
>  		atomic_inc(&per_prio->stats.completed);
>  }
>  
> -static bool dd_has_work_for_prio(struct dd_per_prio *per_prio)
> +/* May be called from interrupt context. */
> +static bool dd_has_write_work(struct deadline_data *dd,
> +			      struct blk_mq_hw_ctx *hctx,
> +			      struct list_head *list)
> +{
> +	struct request_queue *q = hctx->queue;
> +	unsigned long flags;
> +	struct request *rq;
> +	bool has_work = false;
> +
> +	if (list_empty_careful(list))
> +		return false;
> +
> +	if (!(q->limits.features & BLK_FEAT_ORDERED_HWQ))
> +		return true;
> +
> +	spin_lock_irqsave(&dd->lock, flags);
> +	list_for_each_entry(rq, list, queuelist) {
> +		if (rq->mq_hctx == hctx) {
> +			has_work = true;
> +			break;
> +		}
> +	}
> +	spin_unlock_irqrestore(&dd->lock, flags);
> +
> +	return has_work;
> +}
> +
> +static bool dd_has_work_for_prio(struct deadline_data *dd,
> +				 struct blk_mq_hw_ctx *hctx,
> +				 struct dd_per_prio *per_prio)
>  {
>  	return !list_empty_careful(&per_prio->fifo_list[DD_READ]) ||
> -		!list_empty_careful(&per_prio->fifo_list[DD_WRITE]);
> +		dd_has_write_work(dd, hctx, &per_prio->fifo_list[DD_WRITE]);
>  }
>  
>  static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
> @@ -746,7 +794,7 @@ static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
>  		return true;
>  
>  	for (prio = 0; prio <= DD_PRIO_MAX; prio++)
> -		if (dd_has_work_for_prio(&dd->per_prio[prio]))
> +		if (dd_has_work_for_prio(dd, hctx, &dd->per_prio[prio]))
>  			return true;
>  
>  	return false;


-- 
Damien Le Moal
Western Digital Research

next prev parent reply	other threads:[~2025-10-15  7:31 UTC|newest]

Thread overview: 33+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-10-14 21:54 [PATCH v25 00/20] Improve write performance for zoned UFS devices Bart Van Assche
2025-10-14 21:54 ` [PATCH v25 01/20] block: Support block devices that preserve the order of write requests Bart Van Assche
2025-10-14 21:54 ` [PATCH v25 02/20] blk-mq: Always insert sequential zoned writes into a software queue Bart Van Assche
2025-10-14 21:54 ` [PATCH v25 03/20] blk-mq: Restore the zone write order when requeuing Bart Van Assche
2025-10-14 21:54 ` [PATCH v25 04/20] blk-mq: Move the blk_queue_sq_sched() calls Bart Van Assche
2025-10-14 21:54 ` [PATCH v25 05/20] blk-mq: Run all hwqs for sq scheds if write pipelining is enabled Bart Van Assche
2025-10-15  7:25   ` Damien Le Moal
2025-10-15 16:35     ` Bart Van Assche
2025-10-14 21:54 ` [PATCH v25 06/20] block/mq-deadline: Make locking IRQ-safe Bart Van Assche
2025-10-14 21:54 ` [PATCH v25 07/20] block/mq-deadline: Enable zoned write pipelining Bart Van Assche
2025-10-15  7:31   ` Damien Le Moal [this message]
2025-10-15 16:32     ` Bart Van Assche
2025-10-16 20:50     ` Bart Van Assche
2025-10-18  5:31       ` Damien Le Moal
2025-10-20 18:28         ` Bart Van Assche
2025-10-21 21:01           ` Damien Le Moal
2025-10-22 18:26             ` Bart Van Assche
2025-10-22  7:07           ` Christoph Hellwig
2025-10-14 21:54 ` [PATCH v25 08/20] blk-zoned: Fix a typo in a source code comment Bart Van Assche
2025-10-15  7:32   ` Damien Le Moal
2025-10-15 16:33     ` Bart Van Assche
2025-10-14 21:54 ` [PATCH v25 09/20] blk-zoned: Add an argument to blk_zone_plug_bio() Bart Van Assche
2025-10-14 21:54 ` [PATCH v25 10/20] blk-zoned: Split an if-statement Bart Van Assche
2025-10-14 21:54 ` [PATCH v25 11/20] blk-zoned: Move code from disk_zone_wplug_add_bio() into its caller Bart Van Assche
2025-10-14 21:54 ` [PATCH v25 12/20] blk-zoned: Introduce a loop in blk_zone_wplug_bio_work() Bart Van Assche
2025-10-14 21:54 ` [PATCH v25 13/20] blk-zoned: Document disk_zone_wplug_schedule_bio_work() locking Bart Van Assche
2025-10-14 21:54 ` [PATCH v25 14/20] blk-zoned: Support pipelining of zoned writes Bart Van Assche
2025-10-14 21:54 ` [PATCH v25 15/20] null_blk: Add the preserves_write_order attribute Bart Van Assche
2025-10-14 21:54 ` [PATCH v25 16/20] scsi: core: Retry unaligned zoned writes Bart Van Assche
2025-10-14 21:54 ` [PATCH v25 17/20] scsi: sd: Increase retry count for " Bart Van Assche
2025-10-14 21:54 ` [PATCH v25 18/20] scsi: scsi_debug: Add the preserves_write_order module parameter Bart Van Assche
2025-10-14 21:54 ` [PATCH v25 19/20] scsi: scsi_debug: Support injecting unaligned write errors Bart Van Assche
2025-10-14 21:54 ` [PATCH v25 20/20] ufs: core: Inform the block layer about write ordering Bart Van Assche

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=08ce89bb-756a-4ce8-9980-ddea8baab1d1@kernel.org \
    --to=dlemoal@kernel.org \
    --cc=axboe@kernel.dk \
    --cc=bvanassche@acm.org \
    --cc=hch@lst.de \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-scsi@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.