From: Damien Le Moal <dlemoal@kernel.org>
To: Bart Van Assche <bvanassche@acm.org>, Jens Axboe <axboe@kernel.dk>
Cc: linux-block@vger.kernel.org, linux-scsi@vger.kernel.org,
Christoph Hellwig <hch@lst.de>, Jaegeuk Kim <jaegeuk@kernel.org>
Subject: Re: [PATCH v16 05/26] blk-zoned: Fix a deadlock triggered by unaligned writes
Date: Tue, 19 Nov 2024 11:57:48 +0900 [thread overview]
Message-ID: <6729e88d-5311-4b6e-a3da-0f144aab56c9@kernel.org> (raw)
In-Reply-To: <20241119002815.600608-6-bvanassche@acm.org>
On 11/19/24 9:27 AM, Bart Van Assche wrote:
> If the queue is filled with unaligned writes then the following
> deadlock occurs:
>
> Call Trace:
> <TASK>
> __schedule+0x8cc/0x2190
> schedule+0xdd/0x2b0
> blk_queue_enter+0x2ce/0x4f0
> blk_mq_alloc_request+0x303/0x810
> scsi_execute_cmd+0x3f4/0x7b0
> sd_zbc_do_report_zones+0x19e/0x4c0
> sd_zbc_report_zones+0x304/0x920
> disk_zone_wplug_handle_error+0x237/0x920
> disk_zone_wplugs_work+0x17e/0x430
> process_one_work+0xdd0/0x1490
> worker_thread+0x5eb/0x1010
> kthread+0x2e5/0x3b0
> ret_from_fork+0x3a/0x80
> </TASK>
>
> Fix this deadlock by removing the disk->fops->report_zones() call and by
> deriving the write pointer information from successfully completed zoned
> writes.
>
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Doesn't this need a Fixes tag and CC stable, and come earlier in the series (or
sent separately) ?
Overall, this patch seems wrong anyway as zone reset and zone finish may be
done between 2 writes, failing the next one but the recovery done here will use
the previous succeful write end position as the wp, which is NOT correct as
reset or finish changed that... And we also have the possibility of torn writes
(partial writes) with SAS SMR drives. So I really think that you cannot avoid
doing a report zone to recover errors.
> ---
> block/blk-zoned.c | 69 +++++++++++++++++++----------------------------
> block/blk.h | 4 ++-
> 2 files changed, 31 insertions(+), 42 deletions(-)
>
> diff --git a/block/blk-zoned.c b/block/blk-zoned.c
> index b570b773e65f..284820b29285 100644
> --- a/block/blk-zoned.c
> +++ b/block/blk-zoned.c
> @@ -56,6 +56,8 @@ static const char *const zone_cond_name[] = {
> * @zone_no: The number of the zone the plug is managing.
> * @wp_offset: The zone write pointer location relative to the start of the zone
> * as a number of 512B sectors.
> + * @wp_offset_compl: End offset for completed zoned writes as a number of 512
> + * byte sectors.
> * @bio_list: The list of BIOs that are currently plugged.
> * @bio_work: Work struct to handle issuing of plugged BIOs
> * @rcu_head: RCU head to free zone write plugs with an RCU grace period.
> @@ -69,6 +71,7 @@ struct blk_zone_wplug {
> unsigned int flags;
> unsigned int zone_no;
> unsigned int wp_offset;
> + unsigned int wp_offset_compl;
> struct bio_list bio_list;
> struct work_struct bio_work;
> struct rcu_head rcu_head;
> @@ -531,6 +534,7 @@ static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk,
> zwplug->flags = 0;
> zwplug->zone_no = zno;
> zwplug->wp_offset = sector & (disk->queue->limits.chunk_sectors - 1);
> + zwplug->wp_offset_compl = 0;
> bio_list_init(&zwplug->bio_list);
> INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work);
> zwplug->disk = disk;
> @@ -1226,6 +1230,7 @@ void blk_zone_write_plug_bio_endio(struct bio *bio)
> struct gendisk *disk = bio->bi_bdev->bd_disk;
> struct blk_zone_wplug *zwplug =
> disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
> + unsigned int end_sector;
> unsigned long flags;
>
> if (WARN_ON_ONCE(!zwplug))
> @@ -1243,11 +1248,24 @@ void blk_zone_write_plug_bio_endio(struct bio *bio)
> bio->bi_opf |= REQ_OP_ZONE_APPEND;
> }
>
> - /*
> - * If the BIO failed, mark the plug as having an error to trigger
> - * recovery.
> - */
> - if (bio->bi_status != BLK_STS_OK) {
> + if (bio->bi_status == BLK_STS_OK) {
> + switch (bio_op(bio)) {
> + case REQ_OP_WRITE:
> + case REQ_OP_ZONE_APPEND:
> + case REQ_OP_WRITE_ZEROES:
> + end_sector = bdev_offset_from_zone_start(disk->part0,
> + bio->bi_iter.bi_sector + bio_sectors(bio));
> + if (end_sector > zwplug->wp_offset_compl)
> + zwplug->wp_offset_compl = end_sector;
> + break;
> + default:
> + break;
> + }
> + } else {
> + /*
> + * If the BIO failed, mark the plug as having an error to
> + * trigger recovery.
> + */
> spin_lock_irqsave(&zwplug->lock, flags);
> disk_zone_wplug_set_error(disk, zwplug);
> spin_unlock_irqrestore(&zwplug->lock, flags);
> @@ -1388,30 +1406,10 @@ static unsigned int blk_zone_wp_offset(struct blk_zone *zone)
> }
> }
>
> -static int blk_zone_wplug_report_zone_cb(struct blk_zone *zone,
> - unsigned int idx, void *data)
> -{
> - struct blk_zone *zonep = data;
> -
> - *zonep = *zone;
> - return 0;
> -}
> -
> static void disk_zone_wplug_handle_error(struct gendisk *disk,
> struct blk_zone_wplug *zwplug)
> {
> - sector_t zone_start_sector =
> - bdev_zone_sectors(disk->part0) * zwplug->zone_no;
> - unsigned int noio_flag;
> - struct blk_zone zone;
> unsigned long flags;
> - int ret;
> -
> - /* Get the current zone information from the device. */
> - noio_flag = memalloc_noio_save();
> - ret = disk->fops->report_zones(disk, zone_start_sector, 1,
> - blk_zone_wplug_report_zone_cb, &zone);
> - memalloc_noio_restore(noio_flag);
>
> spin_lock_irqsave(&zwplug->lock, flags);
>
> @@ -1425,19 +1423,8 @@ static void disk_zone_wplug_handle_error(struct gendisk *disk,
>
> zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR;
>
> - if (ret != 1) {
> - /*
> - * We failed to get the zone information, meaning that something
> - * is likely really wrong with the device. Abort all remaining
> - * plugged BIOs as otherwise we could endup waiting forever on
> - * plugged BIOs to complete if there is a queue freeze on-going.
> - */
> - disk_zone_wplug_abort(zwplug);
> - goto unplug;
> - }
> -
> /* Update the zone write pointer offset. */
> - zwplug->wp_offset = blk_zone_wp_offset(&zone);
> + zwplug->wp_offset = zwplug->wp_offset_compl;
> disk_zone_wplug_abort_unaligned(disk, zwplug);
>
> /* Restart BIO submission if we still have any BIO left. */
> @@ -1446,7 +1433,6 @@ static void disk_zone_wplug_handle_error(struct gendisk *disk,
> goto unlock;
> }
>
> -unplug:
> zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
> if (disk_should_remove_zone_wplug(disk, zwplug))
> disk_remove_zone_wplug(disk, zwplug);
> @@ -1978,7 +1964,7 @@ EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
> static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug,
> struct seq_file *m)
> {
> - unsigned int zwp_wp_offset, zwp_flags;
> + unsigned int zwp_wp_offset, zwp_wp_offset_compl, zwp_flags;
> unsigned int zwp_zone_no, zwp_ref;
> unsigned int zwp_bio_list_size;
> unsigned long flags;
> @@ -1988,14 +1974,15 @@ static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug,
> zwp_flags = zwplug->flags;
> zwp_ref = refcount_read(&zwplug->ref);
> zwp_wp_offset = zwplug->wp_offset;
> + zwp_wp_offset_compl = zwplug->wp_offset_compl;
> zwp_bio_list_size = bio_list_size(&zwplug->bio_list);
> spin_unlock_irqrestore(&zwplug->lock, flags);
>
> bool all_zwr_inserted = blk_zone_all_zwr_inserted(zwplug);
>
> - seq_printf(m, "zone_no %u flags 0x%x ref %u wp_offset %u bio_list_size %u all_zwr_inserted %d\n",
> + seq_printf(m, "zone_no %u flags 0x%x ref %u wp_offset %u wp_offset_compl %u bio_list_size %u all_zwr_inserted %d\n",
> zwp_zone_no, zwp_flags, zwp_ref, zwp_wp_offset,
> - zwp_bio_list_size, all_zwr_inserted);
> + zwp_wp_offset_compl, zwp_bio_list_size, all_zwr_inserted);
> }
>
> int queue_zone_wplugs_show(void *data, struct seq_file *m)
> diff --git a/block/blk.h b/block/blk.h
> index be945db6298d..88a6e258eafe 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -470,8 +470,10 @@ static inline void blk_zone_update_request_bio(struct request *rq,
> * the original BIO sector so that blk_zone_write_plug_bio_endio() can
> * lookup the zone write plug.
> */
> - if (req_op(rq) == REQ_OP_ZONE_APPEND || bio_zone_write_plugging(bio))
> + if (req_op(rq) == REQ_OP_ZONE_APPEND || bio_zone_write_plugging(bio)) {
> bio->bi_iter.bi_sector = rq->__sector;
> + bio->bi_iter.bi_size = rq->__data_len;
> + }
> }
>
> void blk_zone_write_plug_requeue_request(struct request *rq);
--
Damien Le Moal
Western Digital Research
next prev parent reply other threads:[~2024-11-19 2:57 UTC|newest]
Thread overview: 73+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-11-19 0:27 [PATCH v16 00/26] Improve write performance for zoned UFS devices Bart Van Assche
2024-11-19 0:27 ` [PATCH v16 01/26] blk-zoned: Fix a reference count leak Bart Van Assche
2024-11-19 2:23 ` Damien Le Moal
2024-11-19 20:21 ` Bart Van Assche
2024-11-19 0:27 ` [PATCH v16 02/26] blk-zoned: Split disk_zone_wplugs_work() Bart Van Assche
2024-11-19 0:27 ` [PATCH v16 03/26] blk-zoned: Split queue_zone_wplugs_show() Bart Van Assche
2024-11-19 2:25 ` Damien Le Moal
2024-11-19 0:27 ` [PATCH v16 04/26] blk-zoned: Only handle errors after pending zoned writes have completed Bart Van Assche
2024-11-19 2:50 ` Damien Le Moal
2024-11-19 20:51 ` Bart Van Assche
2024-11-21 3:23 ` Damien Le Moal
2024-11-21 17:43 ` Bart Van Assche
2024-11-19 0:27 ` [PATCH v16 05/26] blk-zoned: Fix a deadlock triggered by unaligned writes Bart Van Assche
2024-11-19 2:57 ` Damien Le Moal [this message]
2024-11-19 21:04 ` Bart Van Assche
2024-11-21 3:32 ` Damien Le Moal
2024-11-21 17:51 ` Bart Van Assche
2024-11-25 4:00 ` Damien Le Moal
2024-11-25 4:19 ` Damien Le Moal
2025-01-09 19:11 ` Bart Van Assche
2025-01-10 5:07 ` Damien Le Moal
2025-01-10 18:17 ` Bart Van Assche
2024-11-19 0:27 ` [PATCH v16 06/26] blk-zoned: Fix requeuing of zoned writes Bart Van Assche
2024-11-19 3:00 ` Damien Le Moal
2024-11-19 21:06 ` Bart Van Assche
2024-11-19 0:27 ` [PATCH v16 07/26] block: Support block drivers that preserve the order of write requests Bart Van Assche
2024-11-19 7:37 ` Damien Le Moal
2024-11-19 21:08 ` Bart Van Assche
2024-11-19 0:27 ` [PATCH v16 08/26] dm-linear: Report to the block layer that the write order is preserved Bart Van Assche
2024-11-19 0:27 ` [PATCH v16 09/26] mq-deadline: Remove a local variable Bart Van Assche
2024-11-19 7:38 ` Damien Le Moal
2024-11-19 21:11 ` Bart Van Assche
2024-11-19 0:27 ` [PATCH v16 10/26] blk-mq: Clean up blk_mq_requeue_work() Bart Van Assche
2024-11-19 7:39 ` Damien Le Moal
2024-11-19 0:28 ` [PATCH v16 11/26] block: Optimize blk_mq_submit_bio() for the cache hit scenario Bart Van Assche
2024-11-19 7:40 ` Damien Le Moal
2024-11-19 0:28 ` [PATCH v16 12/26] block: Rework request allocation in blk_mq_submit_bio() Bart Van Assche
2024-11-19 7:44 ` Damien Le Moal
2024-11-19 0:28 ` [PATCH v16 13/26] block: Support allocating from a specific software queue Bart Van Assche
2024-11-19 0:28 ` [PATCH v16 14/26] blk-mq: Restore the zoned write order when requeuing Bart Van Assche
2024-11-19 7:52 ` Damien Le Moal
2024-11-19 21:16 ` Bart Van Assche
2024-11-19 0:28 ` [PATCH v16 15/26] blk-zoned: Document the locking order Bart Van Assche
2024-11-19 7:52 ` Damien Le Moal
2024-11-19 0:28 ` [PATCH v16 16/26] blk-zoned: Document locking assumptions Bart Van Assche
2024-11-19 7:53 ` Damien Le Moal
2024-11-19 21:18 ` Bart Van Assche
2024-11-21 3:34 ` Damien Le Moal
2024-11-19 0:28 ` [PATCH v16 17/26] blk-zoned: Uninline functions that are not in the hot path Bart Van Assche
2024-11-19 7:55 ` Damien Le Moal
2024-11-19 21:20 ` Bart Van Assche
2024-11-21 3:36 ` Damien Le Moal
2024-11-19 0:28 ` [PATCH v16 18/26] blk-zoned: Make disk_should_remove_zone_wplug() more robust Bart Van Assche
2024-11-19 7:58 ` Damien Le Moal
2024-11-19 0:28 ` [PATCH v16 19/26] blk-zoned: Add an argument to blk_zone_plug_bio() Bart Van Assche
2024-11-19 0:28 ` [PATCH v16 20/26] blk-zoned: Support pipelining of zoned writes Bart Van Assche
2024-11-19 0:28 ` [PATCH v16 21/26] scsi: core: Retry unaligned " Bart Van Assche
2024-11-19 0:28 ` [PATCH v16 22/26] scsi: sd: Increase retry count for " Bart Van Assche
2024-11-19 0:28 ` [PATCH v16 23/26] scsi: scsi_debug: Add the preserves_write_order module parameter Bart Van Assche
2024-11-19 0:28 ` [PATCH v16 24/26] scsi: scsi_debug: Support injecting unaligned write errors Bart Van Assche
2024-11-19 0:28 ` [PATCH v16 25/26] scsi: scsi_debug: Skip host/bus reset settle delay Bart Van Assche
2024-11-19 0:28 ` [PATCH v16 26/26] scsi: ufs: Inform the block layer about write ordering Bart Van Assche
[not found] ` <37f95f44-ab1d-20db-e0c7-94946cb9d4eb@quicinc.com>
2024-11-22 18:20 ` Bart Van Assche
2024-11-23 0:34 ` Can Guo
2024-11-19 8:01 ` [PATCH v16 00/26] Improve write performance for zoned UFS devices Damien Le Moal
2024-11-19 19:08 ` Bart Van Assche
2024-11-21 3:20 ` Damien Le Moal
2024-11-21 18:00 ` Bart Van Assche
2024-11-25 3:59 ` Damien Le Moal
2025-01-09 19:02 ` Bart Van Assche
2025-01-10 5:10 ` Damien Le Moal
2024-11-19 12:25 ` Christoph Hellwig
2024-11-19 18:52 ` Bart Van Assche
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=6729e88d-5311-4b6e-a3da-0f144aab56c9@kernel.org \
--to=dlemoal@kernel.org \
--cc=axboe@kernel.dk \
--cc=bvanassche@acm.org \
--cc=hch@lst.de \
--cc=jaegeuk@kernel.org \
--cc=linux-block@vger.kernel.org \
--cc=linux-scsi@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox