public inbox for linux-scsi@vger.kernel.org
 help / color / mirror / Atom feed
From: Bart Van Assche <bvanassche@acm.org>
To: Jens Axboe <axboe@kernel.dk>
Cc: linux-block@vger.kernel.org, linux-scsi@vger.kernel.org,
	Christoph Hellwig <hch@lst.de>,
	Damien Le Moal <dlemoal@kernel.org>,
	Jaegeuk Kim <jaegeuk@kernel.org>,
	Bart Van Assche <bvanassche@acm.org>
Subject: [PATCH v16 05/26] blk-zoned: Fix a deadlock triggered by unaligned writes
Date: Mon, 18 Nov 2024 16:27:54 -0800	[thread overview]
Message-ID: <20241119002815.600608-6-bvanassche@acm.org> (raw)
In-Reply-To: <20241119002815.600608-1-bvanassche@acm.org>

If the queue is filled with unaligned writes then the following
deadlock occurs:

Call Trace:
 <TASK>
 __schedule+0x8cc/0x2190
 schedule+0xdd/0x2b0
 blk_queue_enter+0x2ce/0x4f0
 blk_mq_alloc_request+0x303/0x810
 scsi_execute_cmd+0x3f4/0x7b0
 sd_zbc_do_report_zones+0x19e/0x4c0
 sd_zbc_report_zones+0x304/0x920
 disk_zone_wplug_handle_error+0x237/0x920
 disk_zone_wplugs_work+0x17e/0x430
 process_one_work+0xdd0/0x1490
 worker_thread+0x5eb/0x1010
 kthread+0x2e5/0x3b0
 ret_from_fork+0x3a/0x80
 </TASK>

Fix this deadlock by removing the disk->fops->report_zones() call and by
deriving the write pointer information from successfully completed zoned
writes.

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
 block/blk-zoned.c | 69 +++++++++++++++++++----------------------------
 block/blk.h       |  4 ++-
 2 files changed, 31 insertions(+), 42 deletions(-)

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index b570b773e65f..284820b29285 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -56,6 +56,8 @@ static const char *const zone_cond_name[] = {
  * @zone_no: The number of the zone the plug is managing.
  * @wp_offset: The zone write pointer location relative to the start of the zone
  *             as a number of 512B sectors.
+ * @wp_offset_compl: End offset for completed zoned writes as a number of 512
+ *		     byte sectors.
  * @bio_list: The list of BIOs that are currently plugged.
  * @bio_work: Work struct to handle issuing of plugged BIOs
  * @rcu_head: RCU head to free zone write plugs with an RCU grace period.
@@ -69,6 +71,7 @@ struct blk_zone_wplug {
 	unsigned int		flags;
 	unsigned int		zone_no;
 	unsigned int		wp_offset;
+	unsigned int		wp_offset_compl;
 	struct bio_list		bio_list;
 	struct work_struct	bio_work;
 	struct rcu_head		rcu_head;
@@ -531,6 +534,7 @@ static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk,
 	zwplug->flags = 0;
 	zwplug->zone_no = zno;
 	zwplug->wp_offset = sector & (disk->queue->limits.chunk_sectors - 1);
+	zwplug->wp_offset_compl = 0;
 	bio_list_init(&zwplug->bio_list);
 	INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work);
 	zwplug->disk = disk;
@@ -1226,6 +1230,7 @@ void blk_zone_write_plug_bio_endio(struct bio *bio)
 	struct gendisk *disk = bio->bi_bdev->bd_disk;
 	struct blk_zone_wplug *zwplug =
 		disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
+	unsigned int end_sector;
 	unsigned long flags;
 
 	if (WARN_ON_ONCE(!zwplug))
@@ -1243,11 +1248,24 @@ void blk_zone_write_plug_bio_endio(struct bio *bio)
 		bio->bi_opf |= REQ_OP_ZONE_APPEND;
 	}
 
-	/*
-	 * If the BIO failed, mark the plug as having an error to trigger
-	 * recovery.
-	 */
-	if (bio->bi_status != BLK_STS_OK) {
+	if (bio->bi_status == BLK_STS_OK) {
+		switch (bio_op(bio)) {
+		case REQ_OP_WRITE:
+		case REQ_OP_ZONE_APPEND:
+		case REQ_OP_WRITE_ZEROES:
+			end_sector = bdev_offset_from_zone_start(disk->part0,
+				     bio->bi_iter.bi_sector + bio_sectors(bio));
+			if (end_sector > zwplug->wp_offset_compl)
+				zwplug->wp_offset_compl = end_sector;
+			break;
+		default:
+			break;
+		}
+	} else {
+		/*
+		 * If the BIO failed, mark the plug as having an error to
+		 * trigger recovery.
+		 */
 		spin_lock_irqsave(&zwplug->lock, flags);
 		disk_zone_wplug_set_error(disk, zwplug);
 		spin_unlock_irqrestore(&zwplug->lock, flags);
@@ -1388,30 +1406,10 @@ static unsigned int blk_zone_wp_offset(struct blk_zone *zone)
 	}
 }
 
-static int blk_zone_wplug_report_zone_cb(struct blk_zone *zone,
-					 unsigned int idx, void *data)
-{
-	struct blk_zone *zonep = data;
-
-	*zonep = *zone;
-	return 0;
-}
-
 static void disk_zone_wplug_handle_error(struct gendisk *disk,
 					 struct blk_zone_wplug *zwplug)
 {
-	sector_t zone_start_sector =
-		bdev_zone_sectors(disk->part0) * zwplug->zone_no;
-	unsigned int noio_flag;
-	struct blk_zone zone;
 	unsigned long flags;
-	int ret;
-
-	/* Get the current zone information from the device. */
-	noio_flag = memalloc_noio_save();
-	ret = disk->fops->report_zones(disk, zone_start_sector, 1,
-				       blk_zone_wplug_report_zone_cb, &zone);
-	memalloc_noio_restore(noio_flag);
 
 	spin_lock_irqsave(&zwplug->lock, flags);
 
@@ -1425,19 +1423,8 @@ static void disk_zone_wplug_handle_error(struct gendisk *disk,
 
 	zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR;
 
-	if (ret != 1) {
-		/*
-		 * We failed to get the zone information, meaning that something
-		 * is likely really wrong with the device. Abort all remaining
-		 * plugged BIOs as otherwise we could endup waiting forever on
-		 * plugged BIOs to complete if there is a queue freeze on-going.
-		 */
-		disk_zone_wplug_abort(zwplug);
-		goto unplug;
-	}
-
 	/* Update the zone write pointer offset. */
-	zwplug->wp_offset = blk_zone_wp_offset(&zone);
+	zwplug->wp_offset = zwplug->wp_offset_compl;
 	disk_zone_wplug_abort_unaligned(disk, zwplug);
 
 	/* Restart BIO submission if we still have any BIO left. */
@@ -1446,7 +1433,6 @@ static void disk_zone_wplug_handle_error(struct gendisk *disk,
 		goto unlock;
 	}
 
-unplug:
 	zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
 	if (disk_should_remove_zone_wplug(disk, zwplug))
 		disk_remove_zone_wplug(disk, zwplug);
@@ -1978,7 +1964,7 @@ EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
 static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug,
 				  struct seq_file *m)
 {
-	unsigned int zwp_wp_offset, zwp_flags;
+	unsigned int zwp_wp_offset, zwp_wp_offset_compl, zwp_flags;
 	unsigned int zwp_zone_no, zwp_ref;
 	unsigned int zwp_bio_list_size;
 	unsigned long flags;
@@ -1988,14 +1974,15 @@ static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug,
 	zwp_flags = zwplug->flags;
 	zwp_ref = refcount_read(&zwplug->ref);
 	zwp_wp_offset = zwplug->wp_offset;
+	zwp_wp_offset_compl = zwplug->wp_offset_compl;
 	zwp_bio_list_size = bio_list_size(&zwplug->bio_list);
 	spin_unlock_irqrestore(&zwplug->lock, flags);
 
 	bool all_zwr_inserted = blk_zone_all_zwr_inserted(zwplug);
 
-	seq_printf(m, "zone_no %u flags 0x%x ref %u wp_offset %u bio_list_size %u all_zwr_inserted %d\n",
+	seq_printf(m, "zone_no %u flags 0x%x ref %u wp_offset %u wp_offset_compl %u bio_list_size %u all_zwr_inserted %d\n",
 		   zwp_zone_no, zwp_flags, zwp_ref, zwp_wp_offset,
-		   zwp_bio_list_size, all_zwr_inserted);
+		   zwp_wp_offset_compl, zwp_bio_list_size, all_zwr_inserted);
 }
 
 int queue_zone_wplugs_show(void *data, struct seq_file *m)
diff --git a/block/blk.h b/block/blk.h
index be945db6298d..88a6e258eafe 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -470,8 +470,10 @@ static inline void blk_zone_update_request_bio(struct request *rq,
 	 * the original BIO sector so that blk_zone_write_plug_bio_endio() can
 	 * lookup the zone write plug.
 	 */
-	if (req_op(rq) == REQ_OP_ZONE_APPEND || bio_zone_write_plugging(bio))
+	if (req_op(rq) == REQ_OP_ZONE_APPEND || bio_zone_write_plugging(bio)) {
 		bio->bi_iter.bi_sector = rq->__sector;
+		bio->bi_iter.bi_size = rq->__data_len;
+	}
 }
 
 void blk_zone_write_plug_requeue_request(struct request *rq);

  parent reply	other threads:[~2024-11-19  0:28 UTC|newest]

Thread overview: 73+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-11-19  0:27 [PATCH v16 00/26] Improve write performance for zoned UFS devices Bart Van Assche
2024-11-19  0:27 ` [PATCH v16 01/26] blk-zoned: Fix a reference count leak Bart Van Assche
2024-11-19  2:23   ` Damien Le Moal
2024-11-19 20:21     ` Bart Van Assche
2024-11-19  0:27 ` [PATCH v16 02/26] blk-zoned: Split disk_zone_wplugs_work() Bart Van Assche
2024-11-19  0:27 ` [PATCH v16 03/26] blk-zoned: Split queue_zone_wplugs_show() Bart Van Assche
2024-11-19  2:25   ` Damien Le Moal
2024-11-19  0:27 ` [PATCH v16 04/26] blk-zoned: Only handle errors after pending zoned writes have completed Bart Van Assche
2024-11-19  2:50   ` Damien Le Moal
2024-11-19 20:51     ` Bart Van Assche
2024-11-21  3:23       ` Damien Le Moal
2024-11-21 17:43         ` Bart Van Assche
2024-11-19  0:27 ` Bart Van Assche [this message]
2024-11-19  2:57   ` [PATCH v16 05/26] blk-zoned: Fix a deadlock triggered by unaligned writes Damien Le Moal
2024-11-19 21:04     ` Bart Van Assche
2024-11-21  3:32       ` Damien Le Moal
2024-11-21 17:51         ` Bart Van Assche
2024-11-25  4:00           ` Damien Le Moal
2024-11-25  4:19             ` Damien Le Moal
2025-01-09 19:11     ` Bart Van Assche
2025-01-10  5:07       ` Damien Le Moal
2025-01-10 18:17         ` Bart Van Assche
2024-11-19  0:27 ` [PATCH v16 06/26] blk-zoned: Fix requeuing of zoned writes Bart Van Assche
2024-11-19  3:00   ` Damien Le Moal
2024-11-19 21:06     ` Bart Van Assche
2024-11-19  0:27 ` [PATCH v16 07/26] block: Support block drivers that preserve the order of write requests Bart Van Assche
2024-11-19  7:37   ` Damien Le Moal
2024-11-19 21:08     ` Bart Van Assche
2024-11-19  0:27 ` [PATCH v16 08/26] dm-linear: Report to the block layer that the write order is preserved Bart Van Assche
2024-11-19  0:27 ` [PATCH v16 09/26] mq-deadline: Remove a local variable Bart Van Assche
2024-11-19  7:38   ` Damien Le Moal
2024-11-19 21:11     ` Bart Van Assche
2024-11-19  0:27 ` [PATCH v16 10/26] blk-mq: Clean up blk_mq_requeue_work() Bart Van Assche
2024-11-19  7:39   ` Damien Le Moal
2024-11-19  0:28 ` [PATCH v16 11/26] block: Optimize blk_mq_submit_bio() for the cache hit scenario Bart Van Assche
2024-11-19  7:40   ` Damien Le Moal
2024-11-19  0:28 ` [PATCH v16 12/26] block: Rework request allocation in blk_mq_submit_bio() Bart Van Assche
2024-11-19  7:44   ` Damien Le Moal
2024-11-19  0:28 ` [PATCH v16 13/26] block: Support allocating from a specific software queue Bart Van Assche
2024-11-19  0:28 ` [PATCH v16 14/26] blk-mq: Restore the zoned write order when requeuing Bart Van Assche
2024-11-19  7:52   ` Damien Le Moal
2024-11-19 21:16     ` Bart Van Assche
2024-11-19  0:28 ` [PATCH v16 15/26] blk-zoned: Document the locking order Bart Van Assche
2024-11-19  7:52   ` Damien Le Moal
2024-11-19  0:28 ` [PATCH v16 16/26] blk-zoned: Document locking assumptions Bart Van Assche
2024-11-19  7:53   ` Damien Le Moal
2024-11-19 21:18     ` Bart Van Assche
2024-11-21  3:34       ` Damien Le Moal
2024-11-19  0:28 ` [PATCH v16 17/26] blk-zoned: Uninline functions that are not in the hot path Bart Van Assche
2024-11-19  7:55   ` Damien Le Moal
2024-11-19 21:20     ` Bart Van Assche
2024-11-21  3:36       ` Damien Le Moal
2024-11-19  0:28 ` [PATCH v16 18/26] blk-zoned: Make disk_should_remove_zone_wplug() more robust Bart Van Assche
2024-11-19  7:58   ` Damien Le Moal
2024-11-19  0:28 ` [PATCH v16 19/26] blk-zoned: Add an argument to blk_zone_plug_bio() Bart Van Assche
2024-11-19  0:28 ` [PATCH v16 20/26] blk-zoned: Support pipelining of zoned writes Bart Van Assche
2024-11-19  0:28 ` [PATCH v16 21/26] scsi: core: Retry unaligned " Bart Van Assche
2024-11-19  0:28 ` [PATCH v16 22/26] scsi: sd: Increase retry count for " Bart Van Assche
2024-11-19  0:28 ` [PATCH v16 23/26] scsi: scsi_debug: Add the preserves_write_order module parameter Bart Van Assche
2024-11-19  0:28 ` [PATCH v16 24/26] scsi: scsi_debug: Support injecting unaligned write errors Bart Van Assche
2024-11-19  0:28 ` [PATCH v16 25/26] scsi: scsi_debug: Skip host/bus reset settle delay Bart Van Assche
2024-11-19  0:28 ` [PATCH v16 26/26] scsi: ufs: Inform the block layer about write ordering Bart Van Assche
     [not found]   ` <37f95f44-ab1d-20db-e0c7-94946cb9d4eb@quicinc.com>
2024-11-22 18:20     ` Bart Van Assche
2024-11-23  0:34       ` Can Guo
2024-11-19  8:01 ` [PATCH v16 00/26] Improve write performance for zoned UFS devices Damien Le Moal
2024-11-19 19:08   ` Bart Van Assche
2024-11-21  3:20     ` Damien Le Moal
2024-11-21 18:00       ` Bart Van Assche
2024-11-25  3:59         ` Damien Le Moal
2025-01-09 19:02   ` Bart Van Assche
2025-01-10  5:10     ` Damien Le Moal
2024-11-19 12:25 ` Christoph Hellwig
2024-11-19 18:52   ` Bart Van Assche

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20241119002815.600608-6-bvanassche@acm.org \
    --to=bvanassche@acm.org \
    --cc=axboe@kernel.dk \
    --cc=dlemoal@kernel.org \
    --cc=hch@lst.de \
    --cc=jaegeuk@kernel.org \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-scsi@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox