From: Damien Le Moal <dlemoal@kernel.org>
To: Bart Van Assche <bvanassche@acm.org>, Jens Axboe <axboe@kernel.dk>
Cc: linux-block@vger.kernel.org, linux-scsi@vger.kernel.org,
Christoph Hellwig <hch@lst.de>, Jaegeuk Kim <jaegeuk@kernel.org>
Subject: Re: [PATCH v16 04/26] blk-zoned: Only handle errors after pending zoned writes have completed
Date: Tue, 19 Nov 2024 11:50:48 +0900 [thread overview]
Message-ID: <7f4058f9-df04-404c-b4f0-25bf0e8e4886@kernel.org> (raw)
In-Reply-To: <20241119002815.600608-5-bvanassche@acm.org>
On 11/19/24 9:27 AM, Bart Van Assche wrote:
> Instead of handling write errors immediately, only handle these after all
> pending zoned write requests have completed or have been requeued. This
> patch prepares for changing the zone write pointer tracking approach.
A little more explanation about how this is achieved would be nice. I was
expecting a shorter change given the short commit message... It took some time
to understand the changes without details.
More comments below.
>
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
> ---
> block/blk-mq.c | 9 +++
> block/blk-zoned.c | 154 +++++++++++++++++++++++++++++++++++++++--
> block/blk.h | 29 ++++++++
> include/linux/blk-mq.h | 18 +++++
> 4 files changed, 203 insertions(+), 7 deletions(-)
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 270cfd9fc6b0..a45077e187b5 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -793,6 +793,9 @@ void blk_mq_free_request(struct request *rq)
> rq_qos_done(q, rq);
>
> WRITE_ONCE(rq->state, MQ_RQ_IDLE);
> +
> + blk_zone_free_request(rq);
> +
> if (req_ref_put_and_test(rq))
> __blk_mq_free_request(rq);
> }
> @@ -1189,6 +1192,9 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob)
> continue;
>
> WRITE_ONCE(rq->state, MQ_RQ_IDLE);
> +
> + blk_zone_free_request(rq);
> +
> if (!req_ref_put_and_test(rq))
> continue;
>
> @@ -1507,6 +1513,7 @@ static void __blk_mq_requeue_request(struct request *rq)
> if (blk_mq_request_started(rq)) {
> WRITE_ONCE(rq->state, MQ_RQ_IDLE);
> rq->rq_flags &= ~RQF_TIMED_OUT;
> + blk_zone_requeue_work(q);
> }
> }
>
> @@ -1542,6 +1549,8 @@ static void blk_mq_requeue_work(struct work_struct *work)
> list_splice_init(&q->flush_list, &flush_list);
> spin_unlock_irq(&q->requeue_lock);
>
> + blk_zone_requeue_work(q);
> +
> while (!list_empty(&rq_list)) {
> rq = list_entry(rq_list.next, struct request, queuelist);
> /*
> diff --git a/block/blk-zoned.c b/block/blk-zoned.c
> index 7e6e6ebb6235..b570b773e65f 100644
> --- a/block/blk-zoned.c
> +++ b/block/blk-zoned.c
> @@ -608,6 +608,8 @@ static inline void disk_zone_wplug_set_error(struct gendisk *disk,
> if (zwplug->flags & BLK_ZONE_WPLUG_ERROR)
> return;
>
> + zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
> + zwplug->flags |= BLK_ZONE_WPLUG_ERROR;
Why move these before the comment ? Also, why set BLK_ZONE_WPLUG_PLUGGED ? It
should be set already since this is handling a failed write that was either
being prepared for submission or submitted (and completed) already. In both
cases, the wplug is plugged since we have a write in flight.
> /*
> * At this point, we already have a reference on the zone write plug.
> * However, since we are going to add the plug to the disk zone write
> @@ -616,7 +618,6 @@ static inline void disk_zone_wplug_set_error(struct gendisk *disk,
> * handled, or in disk_zone_wplug_clear_error() if the zone is reset or
> * finished.
> */
> - zwplug->flags |= BLK_ZONE_WPLUG_ERROR;
> refcount_inc(&zwplug->ref);
>
> spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
> @@ -642,6 +643,7 @@ static inline void disk_zone_wplug_clear_error(struct gendisk *disk,
> spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
> if (!list_empty(&zwplug->link)) {
> list_del_init(&zwplug->link);
> + zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
> zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR;
> disk_put_zone_wplug(zwplug);
> }
> @@ -746,6 +748,70 @@ static bool blk_zone_wplug_handle_reset_all(struct bio *bio)
> return false;
> }
>
> +struct all_zwr_inserted_data {
> + struct blk_zone_wplug *zwplug;
> + bool res;
> +};
> +
> +/*
> + * Changes @data->res to %false if and only if @rq is a zoned write for the
> + * given zone and if it is owned by the block driver.
It would be nice to have a request flag or a state indicating that instead of
needing all this code... Can't that be done ?
> + *
> + * @rq members may change while this function is in progress. Hence, use
> + * READ_ONCE() to read @rq members.
> + */
> +static bool blk_zwr_inserted(struct request *rq, void *data)
> +{
> + struct all_zwr_inserted_data *d = data;
> + struct blk_zone_wplug *zwplug = d->zwplug;
> + struct request_queue *q = zwplug->disk->queue;
> + struct bio *bio = READ_ONCE(rq->bio);
> +
> + if (rq->q == q && READ_ONCE(rq->state) != MQ_RQ_IDLE &&
> + blk_rq_is_seq_zoned_write(rq) && bio &&
> + bio_zone_no(bio) == zwplug->zone_no) {
> + d->res = false;
> + return false;
> + }
> +
> + return true;
> +}
> +
> +/*
> + * Report whether or not all zoned writes for a zone have been inserted into a
> + * software queue, elevator queue or hardware queue.
> + */
> +static bool blk_zone_all_zwr_inserted(struct blk_zone_wplug *zwplug)
> +{
> + struct gendisk *disk = zwplug->disk;
> + struct request_queue *q = disk->queue;
> + struct all_zwr_inserted_data d = { .zwplug = zwplug, .res = true };
> + struct blk_mq_hw_ctx *hctx;
> + long unsigned int i;
> + struct request *rq;
> +
> + scoped_guard(spinlock_irqsave, &q->requeue_lock) {
> + list_for_each_entry(rq, &q->requeue_list, queuelist)
> + if (blk_rq_is_seq_zoned_write(rq) &&
> + bio_zone_no(rq->bio) == zwplug->zone_no)
> + return false;
> + list_for_each_entry(rq, &q->flush_list, queuelist)
> + if (blk_rq_is_seq_zoned_write(rq) &&
> + bio_zone_no(rq->bio) == zwplug->zone_no)
> + return false;
> + }
> +
> + queue_for_each_hw_ctx(q, hctx, i) {
> + struct blk_mq_tags *tags = hctx->sched_tags ?: hctx->tags;
> +
> + blk_mq_all_tag_iter(tags, blk_zwr_inserted, &d);
> + if (!d.res || blk_mq_is_shared_tags(q->tag_set->flags))
> + break;
> + }
> +
> + return d.res;
> +}
> +
> static inline void blk_zone_wplug_add_bio(struct blk_zone_wplug *zwplug,
> struct bio *bio, unsigned int nr_segs)
> {
> @@ -1096,6 +1162,29 @@ static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk,
> queue_work(disk->zone_wplugs_wq, &zwplug->bio_work);
> }
>
> +/*
> + * Change the zone state to "error" if a request is requeued to postpone
> + * processing of requeued requests until all pending requests have either
> + * completed or have been requeued.
> + */
> +void blk_zone_write_plug_requeue_request(struct request *rq)
> +{
> + struct gendisk *disk = rq->q->disk;
> + struct blk_zone_wplug *zwplug;
> +
> + if (!disk->zone_wplugs_hash_bits || !blk_rq_is_seq_zoned_write(rq))
> + return;
I think the disk->zone_wplugs_hash_bits check needs to go inside
disk_get_zone_wplug() as that will avoid a similar check in
blk_zone_write_plug_free_request() too. That said, I am not even convinced it
is needed at all since these functions should be called only for a zoned drive,
which should have its zone wplug hash set up.
> +
> + zwplug = disk_get_zone_wplug(disk, blk_rq_pos(rq));
> + if (WARN_ON_ONCE(!zwplug))
> + return;
> +
> + scoped_guard(spinlock_irqsave, &zwplug->lock)
> + disk_zone_wplug_set_error(disk, zwplug);
> +
> + disk_put_zone_wplug(zwplug);
> +}
> +
> static void disk_zone_wplug_unplug_bio(struct gendisk *disk,
> struct blk_zone_wplug *zwplug)
> {
> @@ -1202,6 +1291,33 @@ void blk_zone_write_plug_finish_request(struct request *req)
> disk_put_zone_wplug(zwplug);
> }
>
> +/*
> + * Schedule zone_plugs_work if a zone is in the error state and if no requests
> + * are in flight. Called from blk_mq_free_request().
> + */
> +void blk_zone_write_plug_free_request(struct request *rq)
> +{
> + struct gendisk *disk = rq->q->disk;
> + struct blk_zone_wplug *zwplug;
> +
> + /*
> + * Do nothing if this function is called before the zone information
> + * has been initialized.
> + */
> + if (!disk->zone_wplugs_hash_bits)
> + return;
> +
> + zwplug = disk_get_zone_wplug(disk, blk_rq_pos(rq));
> +
Useless blank line here.
> + if (!zwplug)
> + return;
> +
> + if (zwplug->flags & BLK_ZONE_WPLUG_ERROR)
> + kblockd_schedule_work(&disk->zone_wplugs_work);
I think this needs to be done under the zone wplug spinlock ?
> +
> +	disk_put_zone_wplug(zwplug);
> +}
> +
> static void blk_zone_wplug_bio_work(struct work_struct *work)
> {
> struct blk_zone_wplug *zwplug =
> @@ -1343,14 +1459,15 @@ static void disk_zone_wplug_handle_error(struct gendisk *disk,
>
> static void disk_zone_process_err_list(struct gendisk *disk)
> {
> - struct blk_zone_wplug *zwplug;
> + struct blk_zone_wplug *zwplug, *next;
> unsigned long flags;
>
> spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
>
> - while (!list_empty(&disk->zone_wplugs_err_list)) {
> - zwplug = list_first_entry(&disk->zone_wplugs_err_list,
> - struct blk_zone_wplug, link);
> + list_for_each_entry_safe(zwplug, next, &disk->zone_wplugs_err_list,
> + link) {
You are holding the disk zwplug spinlock, so why use the _safe() loop ? Not
needed, right ?
> + if (!blk_zone_all_zwr_inserted(zwplug))
> + continue;
> list_del_init(&zwplug->link);
> spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
>
> @@ -1361,6 +1478,12 @@ static void disk_zone_process_err_list(struct gendisk *disk)
> }
>
> spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
> +
> + /*
> + * If one or more zones have been skipped, this work will be requeued
> + * when a request is requeued (blk_zone_requeue_work()) or freed
> + * (blk_zone_write_plug_free_request()).
> + */
> }
>
> static void disk_zone_wplugs_work(struct work_struct *work)
> @@ -1371,6 +1494,20 @@ static void disk_zone_wplugs_work(struct work_struct *work)
> disk_zone_process_err_list(disk);
> }
>
> +/* May be called from interrupt context and hence must not sleep. */
> +void blk_zone_requeue_work(struct request_queue *q)
> +{
> + struct gendisk *disk = q->disk;
> +
> + if (!disk)
> + return;
Can this happen ?
> +
> + if (in_interrupt())
> + kblockd_schedule_work(&disk->zone_wplugs_work);
> + else
> + disk_zone_process_err_list(disk);
> +}
> +
> static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
> {
> return 1U << disk->zone_wplugs_hash_bits;
> @@ -1854,8 +1991,11 @@ static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug,
> zwp_bio_list_size = bio_list_size(&zwplug->bio_list);
> spin_unlock_irqrestore(&zwplug->lock, flags);
>
> - seq_printf(m, "%u 0x%x %u %u %u\n", zwp_zone_no, zwp_flags, zwp_ref,
> - zwp_wp_offset, zwp_bio_list_size);
> + bool all_zwr_inserted = blk_zone_all_zwr_inserted(zwplug);
Is this bool really needed ? If it is, shouldn't it be assigned while holding
the zwplug lock to have a "snapshot" of the plug with all printed values
consistent ?
> +
> + seq_printf(m, "zone_no %u flags 0x%x ref %u wp_offset %u bio_list_size %u all_zwr_inserted %d\n",
> + zwp_zone_no, zwp_flags, zwp_ref, zwp_wp_offset,
> + zwp_bio_list_size, all_zwr_inserted);
> }
>
> int queue_zone_wplugs_show(void *data, struct seq_file *m)
> diff --git a/block/blk.h b/block/blk.h
> index 2c26abf505b8..be945db6298d 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -473,6 +473,18 @@ static inline void blk_zone_update_request_bio(struct request *rq,
> if (req_op(rq) == REQ_OP_ZONE_APPEND || bio_zone_write_plugging(bio))
> bio->bi_iter.bi_sector = rq->__sector;
> }
> +
> +void blk_zone_write_plug_requeue_request(struct request *rq);
> +static inline void blk_zone_requeue_request(struct request *rq)
> +{
> + if (!blk_rq_is_seq_zoned_write(rq))
> + return;
> +
> + blk_zone_write_plug_requeue_request(rq);
May be:
if (blk_rq_is_seq_zoned_write(rq))
blk_zone_write_plug_requeue_request(rq);
?
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index c596e0e4cb75..ac05974f08f9 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -1169,4 +1169,22 @@ static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq,
> }
> void blk_dump_rq_flags(struct request *, char *);
>
> +#ifdef CONFIG_BLK_DEV_ZONED
> +static inline bool blk_rq_is_seq_zoned_write(struct request *rq)
bdev_zone_is_seq() is already stubbed for the !CONFIG_BLK_DEV_ZONED case, so I
do not think this function needs the ifdef. It will compile either way and will
never be called from anywhere in the !CONFIG_BLK_DEV_ZONED case.
> +{
> + switch (req_op(rq)) {
> + case REQ_OP_WRITE:
> + case REQ_OP_WRITE_ZEROES:
> + return bdev_zone_is_seq(rq->q->disk->part0, blk_rq_pos(rq));
> + default:
> + return false;
> + }
> +}
> +#else /* CONFIG_BLK_DEV_ZONED */
> +static inline bool blk_rq_is_seq_zoned_write(struct request *rq)
> +{
> + return false;
> +}
> +#endif /* CONFIG_BLK_DEV_ZONED */
> +
> #endif /* BLK_MQ_H */
--
Damien Le Moal
Western Digital Research
next prev parent reply other threads:[~2024-11-19 2:50 UTC|newest]
Thread overview: 73+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-11-19 0:27 [PATCH v16 00/26] Improve write performance for zoned UFS devices Bart Van Assche
2024-11-19 0:27 ` [PATCH v16 01/26] blk-zoned: Fix a reference count leak Bart Van Assche
2024-11-19 2:23 ` Damien Le Moal
2024-11-19 20:21 ` Bart Van Assche
2024-11-19 0:27 ` [PATCH v16 02/26] blk-zoned: Split disk_zone_wplugs_work() Bart Van Assche
2024-11-19 0:27 ` [PATCH v16 03/26] blk-zoned: Split queue_zone_wplugs_show() Bart Van Assche
2024-11-19 2:25 ` Damien Le Moal
2024-11-19 0:27 ` [PATCH v16 04/26] blk-zoned: Only handle errors after pending zoned writes have completed Bart Van Assche
2024-11-19 2:50 ` Damien Le Moal [this message]
2024-11-19 20:51 ` Bart Van Assche
2024-11-21 3:23 ` Damien Le Moal
2024-11-21 17:43 ` Bart Van Assche
2024-11-19 0:27 ` [PATCH v16 05/26] blk-zoned: Fix a deadlock triggered by unaligned writes Bart Van Assche
2024-11-19 2:57 ` Damien Le Moal
2024-11-19 21:04 ` Bart Van Assche
2024-11-21 3:32 ` Damien Le Moal
2024-11-21 17:51 ` Bart Van Assche
2024-11-25 4:00 ` Damien Le Moal
2024-11-25 4:19 ` Damien Le Moal
2025-01-09 19:11 ` Bart Van Assche
2025-01-10 5:07 ` Damien Le Moal
2025-01-10 18:17 ` Bart Van Assche
2024-11-19 0:27 ` [PATCH v16 06/26] blk-zoned: Fix requeuing of zoned writes Bart Van Assche
2024-11-19 3:00 ` Damien Le Moal
2024-11-19 21:06 ` Bart Van Assche
2024-11-19 0:27 ` [PATCH v16 07/26] block: Support block drivers that preserve the order of write requests Bart Van Assche
2024-11-19 7:37 ` Damien Le Moal
2024-11-19 21:08 ` Bart Van Assche
2024-11-19 0:27 ` [PATCH v16 08/26] dm-linear: Report to the block layer that the write order is preserved Bart Van Assche
2024-11-19 0:27 ` [PATCH v16 09/26] mq-deadline: Remove a local variable Bart Van Assche
2024-11-19 7:38 ` Damien Le Moal
2024-11-19 21:11 ` Bart Van Assche
2024-11-19 0:27 ` [PATCH v16 10/26] blk-mq: Clean up blk_mq_requeue_work() Bart Van Assche
2024-11-19 7:39 ` Damien Le Moal
2024-11-19 0:28 ` [PATCH v16 11/26] block: Optimize blk_mq_submit_bio() for the cache hit scenario Bart Van Assche
2024-11-19 7:40 ` Damien Le Moal
2024-11-19 0:28 ` [PATCH v16 12/26] block: Rework request allocation in blk_mq_submit_bio() Bart Van Assche
2024-11-19 7:44 ` Damien Le Moal
2024-11-19 0:28 ` [PATCH v16 13/26] block: Support allocating from a specific software queue Bart Van Assche
2024-11-19 0:28 ` [PATCH v16 14/26] blk-mq: Restore the zoned write order when requeuing Bart Van Assche
2024-11-19 7:52 ` Damien Le Moal
2024-11-19 21:16 ` Bart Van Assche
2024-11-19 0:28 ` [PATCH v16 15/26] blk-zoned: Document the locking order Bart Van Assche
2024-11-19 7:52 ` Damien Le Moal
2024-11-19 0:28 ` [PATCH v16 16/26] blk-zoned: Document locking assumptions Bart Van Assche
2024-11-19 7:53 ` Damien Le Moal
2024-11-19 21:18 ` Bart Van Assche
2024-11-21 3:34 ` Damien Le Moal
2024-11-19 0:28 ` [PATCH v16 17/26] blk-zoned: Uninline functions that are not in the hot path Bart Van Assche
2024-11-19 7:55 ` Damien Le Moal
2024-11-19 21:20 ` Bart Van Assche
2024-11-21 3:36 ` Damien Le Moal
2024-11-19 0:28 ` [PATCH v16 18/26] blk-zoned: Make disk_should_remove_zone_wplug() more robust Bart Van Assche
2024-11-19 7:58 ` Damien Le Moal
2024-11-19 0:28 ` [PATCH v16 19/26] blk-zoned: Add an argument to blk_zone_plug_bio() Bart Van Assche
2024-11-19 0:28 ` [PATCH v16 20/26] blk-zoned: Support pipelining of zoned writes Bart Van Assche
2024-11-19 0:28 ` [PATCH v16 21/26] scsi: core: Retry unaligned " Bart Van Assche
2024-11-19 0:28 ` [PATCH v16 22/26] scsi: sd: Increase retry count for " Bart Van Assche
2024-11-19 0:28 ` [PATCH v16 23/26] scsi: scsi_debug: Add the preserves_write_order module parameter Bart Van Assche
2024-11-19 0:28 ` [PATCH v16 24/26] scsi: scsi_debug: Support injecting unaligned write errors Bart Van Assche
2024-11-19 0:28 ` [PATCH v16 25/26] scsi: scsi_debug: Skip host/bus reset settle delay Bart Van Assche
2024-11-19 0:28 ` [PATCH v16 26/26] scsi: ufs: Inform the block layer about write ordering Bart Van Assche
[not found] ` <37f95f44-ab1d-20db-e0c7-94946cb9d4eb@quicinc.com>
2024-11-22 18:20 ` Bart Van Assche
2024-11-23 0:34 ` Can Guo
2024-11-19 8:01 ` [PATCH v16 00/26] Improve write performance for zoned UFS devices Damien Le Moal
2024-11-19 19:08 ` Bart Van Assche
2024-11-21 3:20 ` Damien Le Moal
2024-11-21 18:00 ` Bart Van Assche
2024-11-25 3:59 ` Damien Le Moal
2025-01-09 19:02 ` Bart Van Assche
2025-01-10 5:10 ` Damien Le Moal
2024-11-19 12:25 ` Christoph Hellwig
2024-11-19 18:52 ` Bart Van Assche
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=7f4058f9-df04-404c-b4f0-25bf0e8e4886@kernel.org \
--to=dlemoal@kernel.org \
--cc=axboe@kernel.dk \
--cc=bvanassche@acm.org \
--cc=hch@lst.de \
--cc=jaegeuk@kernel.org \
--cc=linux-block@vger.kernel.org \
--cc=linux-scsi@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox