From: Bart Van Assche <bvanassche@acm.org>
To: Jens Axboe <axboe@kernel.dk>
Cc: linux-block@vger.kernel.org, linux-scsi@vger.kernel.org,
Christoph Hellwig <hch@lst.de>,
Damien Le Moal <dlemoal@kernel.org>,
Jaegeuk Kim <jaegeuk@kernel.org>,
Bart Van Assche <bvanassche@acm.org>
Subject: [PATCH v16 04/26] blk-zoned: Only handle errors after pending zoned writes have completed
Date: Mon, 18 Nov 2024 16:27:53 -0800 [thread overview]
Message-ID: <20241119002815.600608-5-bvanassche@acm.org> (raw)
In-Reply-To: <20241119002815.600608-1-bvanassche@acm.org>
Instead of handling zoned write errors immediately, handle them only after
all pending zoned write requests have completed or have been requeued. This
patch prepares for changing the zone write pointer tracking approach.
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
block/blk-mq.c | 9 +++
block/blk-zoned.c | 154 +++++++++++++++++++++++++++++++++++++++--
block/blk.h | 29 ++++++++
include/linux/blk-mq.h | 18 +++++
4 files changed, 203 insertions(+), 7 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 270cfd9fc6b0..a45077e187b5 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -793,6 +793,9 @@ void blk_mq_free_request(struct request *rq)
rq_qos_done(q, rq);
WRITE_ONCE(rq->state, MQ_RQ_IDLE);
+
+ blk_zone_free_request(rq);
+
if (req_ref_put_and_test(rq))
__blk_mq_free_request(rq);
}
@@ -1189,6 +1192,9 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob)
continue;
WRITE_ONCE(rq->state, MQ_RQ_IDLE);
+
+ blk_zone_free_request(rq);
+
if (!req_ref_put_and_test(rq))
continue;
@@ -1507,6 +1513,7 @@ static void __blk_mq_requeue_request(struct request *rq)
if (blk_mq_request_started(rq)) {
WRITE_ONCE(rq->state, MQ_RQ_IDLE);
rq->rq_flags &= ~RQF_TIMED_OUT;
+ blk_zone_requeue_work(q);
}
}
@@ -1542,6 +1549,8 @@ static void blk_mq_requeue_work(struct work_struct *work)
list_splice_init(&q->flush_list, &flush_list);
spin_unlock_irq(&q->requeue_lock);
+ blk_zone_requeue_work(q);
+
while (!list_empty(&rq_list)) {
rq = list_entry(rq_list.next, struct request, queuelist);
/*
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 7e6e6ebb6235..b570b773e65f 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -608,6 +608,8 @@ static inline void disk_zone_wplug_set_error(struct gendisk *disk,
if (zwplug->flags & BLK_ZONE_WPLUG_ERROR)
return;
+ zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
+ zwplug->flags |= BLK_ZONE_WPLUG_ERROR;
/*
* At this point, we already have a reference on the zone write plug.
* However, since we are going to add the plug to the disk zone write
@@ -616,7 +618,6 @@ static inline void disk_zone_wplug_set_error(struct gendisk *disk,
* handled, or in disk_zone_wplug_clear_error() if the zone is reset or
* finished.
*/
- zwplug->flags |= BLK_ZONE_WPLUG_ERROR;
refcount_inc(&zwplug->ref);
spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
@@ -642,6 +643,7 @@ static inline void disk_zone_wplug_clear_error(struct gendisk *disk,
spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
if (!list_empty(&zwplug->link)) {
list_del_init(&zwplug->link);
+ zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR;
disk_put_zone_wplug(zwplug);
}
@@ -746,6 +748,70 @@ static bool blk_zone_wplug_handle_reset_all(struct bio *bio)
return false;
}
+struct all_zwr_inserted_data {
+ struct blk_zone_wplug *zwplug;
+ bool res;
+};
+
+/*
+ * Callback for blk_mq_all_tag_iter(): clears @data->res and returns %false if
+ * @rq is a sequential zoned write for @data->zwplug not in state MQ_RQ_IDLE.
+ *
+ * @rq members may change while this function is in progress. Hence, use
+ * READ_ONCE() to read @rq members.
+ */
+static bool blk_zwr_inserted(struct request *rq, void *data)
+{
+ struct all_zwr_inserted_data *d = data;
+ struct blk_zone_wplug *zwplug = d->zwplug;
+ struct request_queue *q = zwplug->disk->queue;
+ struct bio *bio = READ_ONCE(rq->bio);
+
+ if (rq->q == q && READ_ONCE(rq->state) != MQ_RQ_IDLE &&
+ blk_rq_is_seq_zoned_write(rq) && bio &&
+ bio_zone_no(bio) == zwplug->zone_no) {
+ d->res = false;
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Return %false if a sequential zoned write for the zone of @zwplug is on the
+ * requeue or flush list or is matched by blk_zwr_inserted(); %true otherwise.
+ */
+static bool blk_zone_all_zwr_inserted(struct blk_zone_wplug *zwplug)
+{
+ struct gendisk *disk = zwplug->disk;
+ struct request_queue *q = disk->queue;
+ struct all_zwr_inserted_data d = { .zwplug = zwplug, .res = true };
+ struct blk_mq_hw_ctx *hctx;
+ long unsigned int i;
+ struct request *rq;
+
+ scoped_guard(spinlock_irqsave, &q->requeue_lock) {
+ list_for_each_entry(rq, &q->requeue_list, queuelist)
+ if (blk_rq_is_seq_zoned_write(rq) &&
+ bio_zone_no(rq->bio) == zwplug->zone_no)
+ return false;
+ list_for_each_entry(rq, &q->flush_list, queuelist)
+ if (blk_rq_is_seq_zoned_write(rq) &&
+ bio_zone_no(rq->bio) == zwplug->zone_no)
+ return false;
+ }
+
+ queue_for_each_hw_ctx(q, hctx, i) {
+ struct blk_mq_tags *tags = hctx->sched_tags ?: hctx->tags;
+
+ blk_mq_all_tag_iter(tags, blk_zwr_inserted, &d);
+ if (!d.res || blk_mq_is_shared_tags(q->tag_set->flags))
+ break;
+ }
+
+ return d.res;
+}
+
static inline void blk_zone_wplug_add_bio(struct blk_zone_wplug *zwplug,
struct bio *bio, unsigned int nr_segs)
{
@@ -1096,6 +1162,29 @@ static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk,
queue_work(disk->zone_wplugs_wq, &zwplug->bio_work);
}
+/*
+ * Mark the zone write plug of requeued sequential zoned write @rq as being in
+ * the error state, so that processing of requeued requests is postponed until
+ * all pending requests have either completed or have been requeued.
+ */
+void blk_zone_write_plug_requeue_request(struct request *rq)
+{
+ struct gendisk *disk = rq->q->disk;
+ struct blk_zone_wplug *zwplug;
+
+ if (!disk->zone_wplugs_hash_bits || !blk_rq_is_seq_zoned_write(rq))
+ return;
+
+ zwplug = disk_get_zone_wplug(disk, blk_rq_pos(rq));
+ if (WARN_ON_ONCE(!zwplug))
+ return;
+
+ scoped_guard(spinlock_irqsave, &zwplug->lock)
+ disk_zone_wplug_set_error(disk, zwplug);
+
+ disk_put_zone_wplug(zwplug);
+}
+
static void disk_zone_wplug_unplug_bio(struct gendisk *disk,
struct blk_zone_wplug *zwplug)
{
@@ -1202,6 +1291,33 @@ void blk_zone_write_plug_finish_request(struct request *req)
disk_put_zone_wplug(zwplug);
}
+/*
+ * Schedule disk->zone_wplugs_work if the zone write plug for @rq has the error
+ * flag set. Called via blk_zone_free_request() when a request is freed.
+ */
+void blk_zone_write_plug_free_request(struct request *rq)
+{
+ struct gendisk *disk = rq->q->disk;
+ struct blk_zone_wplug *zwplug;
+
+ /*
+ * Do nothing if this function is called before the zone information
+ * has been initialized.
+ */
+ if (!disk->zone_wplugs_hash_bits)
+ return;
+
+ zwplug = disk_get_zone_wplug(disk, blk_rq_pos(rq));
+
+ if (!zwplug)
+ return;
+
+ if (zwplug->flags & BLK_ZONE_WPLUG_ERROR)
+ kblockd_schedule_work(&disk->zone_wplugs_work);
+
+ disk_put_zone_wplug(zwplug);
+}
+
static void blk_zone_wplug_bio_work(struct work_struct *work)
{
struct blk_zone_wplug *zwplug =
@@ -1343,14 +1459,15 @@ static void disk_zone_wplug_handle_error(struct gendisk *disk,
static void disk_zone_process_err_list(struct gendisk *disk)
{
- struct blk_zone_wplug *zwplug;
+ struct blk_zone_wplug *zwplug, *next;
unsigned long flags;
spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
- while (!list_empty(&disk->zone_wplugs_err_list)) {
- zwplug = list_first_entry(&disk->zone_wplugs_err_list,
- struct blk_zone_wplug, link);
+ list_for_each_entry_safe(zwplug, next, &disk->zone_wplugs_err_list,
+ link) {
+ if (!blk_zone_all_zwr_inserted(zwplug))
+ continue;
list_del_init(&zwplug->link);
spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
@@ -1361,6 +1478,12 @@ static void disk_zone_process_err_list(struct gendisk *disk)
}
spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
+
+ /*
+ * If one or more zones have been skipped, the error list is processed again
+ * when a request is requeued (blk_zone_requeue_work()) or freed
+ * (blk_zone_write_plug_free_request()).
+ */
}
static void disk_zone_wplugs_work(struct work_struct *work)
@@ -1371,6 +1494,20 @@ static void disk_zone_wplugs_work(struct work_struct *work)
disk_zone_process_err_list(disk);
}
+/* Run or schedule zone error handling. May run in interrupt context; must not sleep. */
+void blk_zone_requeue_work(struct request_queue *q)
+{
+ struct gendisk *disk = q->disk;
+
+ if (!disk)
+ return;
+
+ if (in_interrupt())
+ kblockd_schedule_work(&disk->zone_wplugs_work);
+ else
+ disk_zone_process_err_list(disk);
+}
+
static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
{
return 1U << disk->zone_wplugs_hash_bits;
@@ -1854,8 +1991,11 @@ static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug,
zwp_bio_list_size = bio_list_size(&zwplug->bio_list);
spin_unlock_irqrestore(&zwplug->lock, flags);
- seq_printf(m, "%u 0x%x %u %u %u\n", zwp_zone_no, zwp_flags, zwp_ref,
- zwp_wp_offset, zwp_bio_list_size);
+ bool all_zwr_inserted = blk_zone_all_zwr_inserted(zwplug);
+
+ seq_printf(m, "zone_no %u flags 0x%x ref %u wp_offset %u bio_list_size %u all_zwr_inserted %d\n",
+ zwp_zone_no, zwp_flags, zwp_ref, zwp_wp_offset,
+ zwp_bio_list_size, all_zwr_inserted);
}
int queue_zone_wplugs_show(void *data, struct seq_file *m)
diff --git a/block/blk.h b/block/blk.h
index 2c26abf505b8..be945db6298d 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -473,6 +473,18 @@ static inline void blk_zone_update_request_bio(struct request *rq,
if (req_op(rq) == REQ_OP_ZONE_APPEND || bio_zone_write_plugging(bio))
bio->bi_iter.bi_sector = rq->__sector;
}
+
+void blk_zone_write_plug_requeue_request(struct request *rq);
+static inline void blk_zone_requeue_request(struct request *rq)
+{
+ if (!blk_rq_is_seq_zoned_write(rq))
+ return;
+
+ blk_zone_write_plug_requeue_request(rq);
+}
+
+void blk_zone_requeue_work(struct request_queue *q);
+
void blk_zone_write_plug_bio_endio(struct bio *bio);
static inline void blk_zone_bio_endio(struct bio *bio)
{
@@ -490,6 +502,14 @@ static inline void blk_zone_finish_request(struct request *rq)
if (rq->rq_flags & RQF_ZONE_WRITE_PLUGGING)
blk_zone_write_plug_finish_request(rq);
}
+
+void blk_zone_write_plug_free_request(struct request *rq);
+static inline void blk_zone_free_request(struct request *rq)
+{
+ if (blk_queue_is_zoned(rq->q))
+ blk_zone_write_plug_free_request(rq);
+}
+
int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
unsigned long arg);
int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
@@ -515,12 +535,21 @@ static inline void blk_zone_update_request_bio(struct request *rq,
struct bio *bio)
{
}
+static inline void blk_zone_requeue_request(struct request *rq)
+{
+}
+static inline void blk_zone_requeue_work(struct request_queue *q)
+{
+}
static inline void blk_zone_bio_endio(struct bio *bio)
{
}
static inline void blk_zone_finish_request(struct request *rq)
{
}
+static inline void blk_zone_free_request(struct request *rq)
+{
+}
static inline int blkdev_report_zones_ioctl(struct block_device *bdev,
unsigned int cmd, unsigned long arg)
{
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index c596e0e4cb75..ac05974f08f9 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -1169,4 +1169,22 @@ static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq,
}
void blk_dump_rq_flags(struct request *, char *);
+#ifdef CONFIG_BLK_DEV_ZONED
+static inline bool blk_rq_is_seq_zoned_write(struct request *rq)
+{
+ switch (req_op(rq)) {
+ case REQ_OP_WRITE:
+ case REQ_OP_WRITE_ZEROES:
+ return bdev_zone_is_seq(rq->q->disk->part0, blk_rq_pos(rq));
+ default:
+ return false;
+ }
+}
+#else /* CONFIG_BLK_DEV_ZONED */
+static inline bool blk_rq_is_seq_zoned_write(struct request *rq)
+{
+ return false;
+}
+#endif /* CONFIG_BLK_DEV_ZONED */
+
#endif /* BLK_MQ_H */
next prev parent reply other threads:[~2024-11-19 0:28 UTC|newest]
Thread overview: 73+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-11-19 0:27 [PATCH v16 00/26] Improve write performance for zoned UFS devices Bart Van Assche
2024-11-19 0:27 ` [PATCH v16 01/26] blk-zoned: Fix a reference count leak Bart Van Assche
2024-11-19 2:23 ` Damien Le Moal
2024-11-19 20:21 ` Bart Van Assche
2024-11-19 0:27 ` [PATCH v16 02/26] blk-zoned: Split disk_zone_wplugs_work() Bart Van Assche
2024-11-19 0:27 ` [PATCH v16 03/26] blk-zoned: Split queue_zone_wplugs_show() Bart Van Assche
2024-11-19 2:25 ` Damien Le Moal
2024-11-19 0:27 ` Bart Van Assche [this message]
2024-11-19 2:50 ` [PATCH v16 04/26] blk-zoned: Only handle errors after pending zoned writes have completed Damien Le Moal
2024-11-19 20:51 ` Bart Van Assche
2024-11-21 3:23 ` Damien Le Moal
2024-11-21 17:43 ` Bart Van Assche
2024-11-19 0:27 ` [PATCH v16 05/26] blk-zoned: Fix a deadlock triggered by unaligned writes Bart Van Assche
2024-11-19 2:57 ` Damien Le Moal
2024-11-19 21:04 ` Bart Van Assche
2024-11-21 3:32 ` Damien Le Moal
2024-11-21 17:51 ` Bart Van Assche
2024-11-25 4:00 ` Damien Le Moal
2024-11-25 4:19 ` Damien Le Moal
2025-01-09 19:11 ` Bart Van Assche
2025-01-10 5:07 ` Damien Le Moal
2025-01-10 18:17 ` Bart Van Assche
2024-11-19 0:27 ` [PATCH v16 06/26] blk-zoned: Fix requeuing of zoned writes Bart Van Assche
2024-11-19 3:00 ` Damien Le Moal
2024-11-19 21:06 ` Bart Van Assche
2024-11-19 0:27 ` [PATCH v16 07/26] block: Support block drivers that preserve the order of write requests Bart Van Assche
2024-11-19 7:37 ` Damien Le Moal
2024-11-19 21:08 ` Bart Van Assche
2024-11-19 0:27 ` [PATCH v16 08/26] dm-linear: Report to the block layer that the write order is preserved Bart Van Assche
2024-11-19 0:27 ` [PATCH v16 09/26] mq-deadline: Remove a local variable Bart Van Assche
2024-11-19 7:38 ` Damien Le Moal
2024-11-19 21:11 ` Bart Van Assche
2024-11-19 0:27 ` [PATCH v16 10/26] blk-mq: Clean up blk_mq_requeue_work() Bart Van Assche
2024-11-19 7:39 ` Damien Le Moal
2024-11-19 0:28 ` [PATCH v16 11/26] block: Optimize blk_mq_submit_bio() for the cache hit scenario Bart Van Assche
2024-11-19 7:40 ` Damien Le Moal
2024-11-19 0:28 ` [PATCH v16 12/26] block: Rework request allocation in blk_mq_submit_bio() Bart Van Assche
2024-11-19 7:44 ` Damien Le Moal
2024-11-19 0:28 ` [PATCH v16 13/26] block: Support allocating from a specific software queue Bart Van Assche
2024-11-19 0:28 ` [PATCH v16 14/26] blk-mq: Restore the zoned write order when requeuing Bart Van Assche
2024-11-19 7:52 ` Damien Le Moal
2024-11-19 21:16 ` Bart Van Assche
2024-11-19 0:28 ` [PATCH v16 15/26] blk-zoned: Document the locking order Bart Van Assche
2024-11-19 7:52 ` Damien Le Moal
2024-11-19 0:28 ` [PATCH v16 16/26] blk-zoned: Document locking assumptions Bart Van Assche
2024-11-19 7:53 ` Damien Le Moal
2024-11-19 21:18 ` Bart Van Assche
2024-11-21 3:34 ` Damien Le Moal
2024-11-19 0:28 ` [PATCH v16 17/26] blk-zoned: Uninline functions that are not in the hot path Bart Van Assche
2024-11-19 7:55 ` Damien Le Moal
2024-11-19 21:20 ` Bart Van Assche
2024-11-21 3:36 ` Damien Le Moal
2024-11-19 0:28 ` [PATCH v16 18/26] blk-zoned: Make disk_should_remove_zone_wplug() more robust Bart Van Assche
2024-11-19 7:58 ` Damien Le Moal
2024-11-19 0:28 ` [PATCH v16 19/26] blk-zoned: Add an argument to blk_zone_plug_bio() Bart Van Assche
2024-11-19 0:28 ` [PATCH v16 20/26] blk-zoned: Support pipelining of zoned writes Bart Van Assche
2024-11-19 0:28 ` [PATCH v16 21/26] scsi: core: Retry unaligned " Bart Van Assche
2024-11-19 0:28 ` [PATCH v16 22/26] scsi: sd: Increase retry count for " Bart Van Assche
2024-11-19 0:28 ` [PATCH v16 23/26] scsi: scsi_debug: Add the preserves_write_order module parameter Bart Van Assche
2024-11-19 0:28 ` [PATCH v16 24/26] scsi: scsi_debug: Support injecting unaligned write errors Bart Van Assche
2024-11-19 0:28 ` [PATCH v16 25/26] scsi: scsi_debug: Skip host/bus reset settle delay Bart Van Assche
2024-11-19 0:28 ` [PATCH v16 26/26] scsi: ufs: Inform the block layer about write ordering Bart Van Assche
[not found] ` <37f95f44-ab1d-20db-e0c7-94946cb9d4eb@quicinc.com>
2024-11-22 18:20 ` Bart Van Assche
2024-11-23 0:34 ` Can Guo
2024-11-19 8:01 ` [PATCH v16 00/26] Improve write performance for zoned UFS devices Damien Le Moal
2024-11-19 19:08 ` Bart Van Assche
2024-11-21 3:20 ` Damien Le Moal
2024-11-21 18:00 ` Bart Van Assche
2024-11-25 3:59 ` Damien Le Moal
2025-01-09 19:02 ` Bart Van Assche
2025-01-10 5:10 ` Damien Le Moal
2024-11-19 12:25 ` Christoph Hellwig
2024-11-19 18:52 ` Bart Van Assche
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20241119002815.600608-5-bvanassche@acm.org \
--to=bvanassche@acm.org \
--cc=axboe@kernel.dk \
--cc=dlemoal@kernel.org \
--cc=hch@lst.de \
--cc=jaegeuk@kernel.org \
--cc=linux-block@vger.kernel.org \
--cc=linux-scsi@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox