* [PATCH v11 01/16] block: Introduce more member variables related to zone write locking
2023-08-22 19:16 [PATCH v11 00/16] Improve write performance for zoned UFS devices Bart Van Assche
@ 2023-08-22 19:16 ` Bart Van Assche
2023-08-23 6:18 ` Hannes Reinecke
2023-08-23 8:08 ` Nitesh Shetty
2023-08-22 19:16 ` [PATCH v11 02/16] block: Only use write locking if necessary Bart Van Assche
` (14 subsequent siblings)
15 siblings, 2 replies; 31+ messages in thread
From: Bart Van Assche @ 2023-08-22 19:16 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, linux-scsi, Martin K . Petersen, Christoph Hellwig,
Bart Van Assche, Damien Le Moal, Ming Lei
Many but not all storage controllers require serialization of zoned writes.
Introduce two new request queue limit member variables related to write
serialization. 'driver_preserves_write_order' allows block drivers to
indicate that the order of write commands is preserved and hence that
serialization of writes per zone is not required. 'use_zone_write_lock' is
set by disk_set_zoned() if and only if the block device has zones and if
the block driver does not preserve the order of write requests.
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
block/blk-settings.c | 15 +++++++++++++++
block/blk-zoned.c | 1 +
include/linux/blkdev.h | 10 ++++++++++
3 files changed, 26 insertions(+)
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 0046b447268f..4c776c08f190 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -56,6 +56,8 @@ void blk_set_default_limits(struct queue_limits *lim)
lim->alignment_offset = 0;
lim->io_opt = 0;
lim->misaligned = 0;
+ lim->driver_preserves_write_order = false;
+ lim->use_zone_write_lock = false;
lim->zoned = BLK_ZONED_NONE;
lim->zone_write_granularity = 0;
lim->dma_alignment = 511;
@@ -82,6 +84,8 @@ void blk_set_stacking_limits(struct queue_limits *lim)
lim->max_dev_sectors = UINT_MAX;
lim->max_write_zeroes_sectors = UINT_MAX;
lim->max_zone_append_sectors = UINT_MAX;
+ /* Request-based stacking drivers do not reorder requests. */
+ lim->driver_preserves_write_order = true;
}
EXPORT_SYMBOL(blk_set_stacking_limits);
@@ -685,6 +689,10 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
b->max_secure_erase_sectors);
t->zone_write_granularity = max(t->zone_write_granularity,
b->zone_write_granularity);
+ t->driver_preserves_write_order = t->driver_preserves_write_order &&
+ b->driver_preserves_write_order;
+ t->use_zone_write_lock = t->use_zone_write_lock ||
+ b->use_zone_write_lock;
t->zoned = max(t->zoned, b->zoned);
return ret;
}
@@ -949,6 +957,13 @@ void disk_set_zoned(struct gendisk *disk, enum blk_zoned_model model)
}
q->limits.zoned = model;
+ /*
+ * Use the zone write lock only for zoned block devices and only if
+ * the block driver does not preserve the order of write commands.
+ */
+ q->limits.use_zone_write_lock = model != BLK_ZONED_NONE &&
+ !q->limits.driver_preserves_write_order;
+
if (model != BLK_ZONED_NONE) {
/*
* Set the zone write granularity to the device logical block
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 619ee41a51cc..112620985bff 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -631,6 +631,7 @@ void disk_clear_zone_settings(struct gendisk *disk)
q->limits.chunk_sectors = 0;
q->limits.zone_write_granularity = 0;
q->limits.max_zone_append_sectors = 0;
+ q->limits.use_zone_write_lock = false;
blk_mq_unfreeze_queue(q);
}
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c1421da8d45e..38a1a71b2bcc 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -316,6 +316,16 @@ struct queue_limits {
unsigned char misaligned;
unsigned char discard_misaligned;
unsigned char raid_partial_stripes_expensive;
+ /*
+ * Whether or not the block driver preserves the order of write
+ * requests. Set by the block driver.
+ */
+ bool driver_preserves_write_order;
+ /*
+ * Whether or not zone write locking should be used. Set by
+ * disk_set_zoned().
+ */
+ bool use_zone_write_lock;
enum blk_zoned_model zoned;
/*
^ permalink raw reply related [flat|nested] 31+ messages in thread* Re: [PATCH v11 01/16] block: Introduce more member variables related to zone write locking
2023-08-22 19:16 ` [PATCH v11 01/16] block: Introduce more member variables related to zone write locking Bart Van Assche
@ 2023-08-23 6:18 ` Hannes Reinecke
2023-08-23 8:08 ` Nitesh Shetty
1 sibling, 0 replies; 31+ messages in thread
From: Hannes Reinecke @ 2023-08-23 6:18 UTC (permalink / raw)
To: Bart Van Assche, Jens Axboe
Cc: linux-block, linux-scsi, Martin K . Petersen, Christoph Hellwig,
Damien Le Moal, Ming Lei
On 8/22/23 21:16, Bart Van Assche wrote:
> Many but not all storage controllers require serialization of zoned writes.
> Introduce two new request queue limit member variables related to write
> serialization. 'driver_preserves_write_order' allows block drivers to
> indicate that the order of write commands is preserved and hence that
> serialization of writes per zone is not required. 'use_zone_write_lock' is
> set by disk_set_zoned() if and only if the block device has zones and if
> the block driver does not preserve the order of write requests.
>
> Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: Ming Lei <ming.lei@redhat.com>
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
> ---
> block/blk-settings.c | 15 +++++++++++++++
> block/blk-zoned.c | 1 +
> include/linux/blkdev.h | 10 ++++++++++
> 3 files changed, 26 insertions(+)
>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Ivo Totev, Andrew
Myers, Andrew McDonald, Martje Boudien Moerman
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [PATCH v11 01/16] block: Introduce more member variables related to zone write locking
2023-08-22 19:16 ` [PATCH v11 01/16] block: Introduce more member variables related to zone write locking Bart Van Assche
2023-08-23 6:18 ` Hannes Reinecke
@ 2023-08-23 8:08 ` Nitesh Shetty
1 sibling, 0 replies; 31+ messages in thread
From: Nitesh Shetty @ 2023-08-23 8:08 UTC (permalink / raw)
To: Bart Van Assche
Cc: Jens Axboe, linux-block, linux-scsi, Martin K . Petersen,
Christoph Hellwig, Damien Le Moal, Ming Lei
[-- Attachment #1: Type: text/plain, Size: 785 bytes --]
On 23/08/22 12:16PM, Bart Van Assche wrote:
>Many but not all storage controllers require serialization of zoned writes.
>Introduce two new request queue limit member variables related to write
>serialization. 'driver_preserves_write_order' allows block drivers to
>indicate that the order of write commands is preserved and hence that
>serialization of writes per zone is not required. 'use_zone_write_lock' is
>set by disk_set_zoned() if and only if the block device has zones and if
>the block driver does not preserve the order of write requests.
>
>Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
>Cc: Christoph Hellwig <hch@lst.de>
>Cc: Ming Lei <ming.lei@redhat.com>
>Signed-off-by: Bart Van Assche <bvanassche@acm.org>
>---
Reviewed-by: Nitesh Shetty <nj.shetty@samsung.com>
[-- Attachment #2: Type: text/plain, Size: 0 bytes --]
^ permalink raw reply [flat|nested] 31+ messages in thread
* [PATCH v11 02/16] block: Only use write locking if necessary
2023-08-22 19:16 [PATCH v11 00/16] Improve write performance for zoned UFS devices Bart Van Assche
2023-08-22 19:16 ` [PATCH v11 01/16] block: Introduce more member variables related to zone write locking Bart Van Assche
@ 2023-08-22 19:16 ` Bart Van Assche
2023-08-23 6:19 ` Hannes Reinecke
2023-08-23 8:09 ` Nitesh Shetty
2023-08-22 19:16 ` [PATCH v11 03/16] block/mq-deadline: Only use zone " Bart Van Assche
` (13 subsequent siblings)
15 siblings, 2 replies; 31+ messages in thread
From: Bart Van Assche @ 2023-08-22 19:16 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, linux-scsi, Martin K . Petersen, Christoph Hellwig,
Bart Van Assche, Damien Le Moal, Ming Lei
Make blk_req_needs_zone_write_lock() return false if
q->limits.use_zone_write_lock is false.
Cc: Damien Le Moal <dlemoal@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
block/blk-zoned.c | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 112620985bff..d8a80cce832f 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -53,11 +53,16 @@ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
EXPORT_SYMBOL_GPL(blk_zone_cond_str);
/*
- * Return true if a request is a write requests that needs zone write locking.
+ * Return true if a request is a write request that needs zone write locking.
*/
bool blk_req_needs_zone_write_lock(struct request *rq)
{
- if (!rq->q->disk->seq_zones_wlock)
+ struct request_queue *q = rq->q;
+
+ if (!q->limits.use_zone_write_lock)
+ return false;
+
+ if (!q->disk->seq_zones_wlock)
return false;
return blk_rq_is_seq_zoned_write(rq);
^ permalink raw reply related [flat|nested] 31+ messages in thread* Re: [PATCH v11 02/16] block: Only use write locking if necessary
2023-08-22 19:16 ` [PATCH v11 02/16] block: Only use write locking if necessary Bart Van Assche
@ 2023-08-23 6:19 ` Hannes Reinecke
2023-08-23 8:09 ` Nitesh Shetty
1 sibling, 0 replies; 31+ messages in thread
From: Hannes Reinecke @ 2023-08-23 6:19 UTC (permalink / raw)
To: Bart Van Assche, Jens Axboe
Cc: linux-block, linux-scsi, Martin K . Petersen, Christoph Hellwig,
Damien Le Moal, Ming Lei
On 8/22/23 21:16, Bart Van Assche wrote:
> Make blk_req_needs_zone_write_lock() return false if
> q->limits.use_zone_write_lock is false.
>
> Cc: Damien Le Moal <dlemoal@kernel.org>
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: Ming Lei <ming.lei@redhat.com>
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
> ---
> block/blk-zoned.c | 9 +++++++--
> 1 file changed, 7 insertions(+), 2 deletions(-)
>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Ivo Totev, Andrew
Myers, Andrew McDonald, Martje Boudien Moerman
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [PATCH v11 02/16] block: Only use write locking if necessary
2023-08-22 19:16 ` [PATCH v11 02/16] block: Only use write locking if necessary Bart Van Assche
2023-08-23 6:19 ` Hannes Reinecke
@ 2023-08-23 8:09 ` Nitesh Shetty
1 sibling, 0 replies; 31+ messages in thread
From: Nitesh Shetty @ 2023-08-23 8:09 UTC (permalink / raw)
To: Bart Van Assche
Cc: Jens Axboe, linux-block, linux-scsi, Martin K . Petersen,
Christoph Hellwig, Damien Le Moal, Ming Lei
[-- Attachment #1: Type: text/plain, Size: 1181 bytes --]
On 23/08/22 12:16PM, Bart Van Assche wrote:
>Make blk_req_needs_zone_write_lock() return false if
>q->limits.use_zone_write_lock is false.
>
>Cc: Damien Le Moal <dlemoal@kernel.org>
>Cc: Christoph Hellwig <hch@lst.de>
>Cc: Ming Lei <ming.lei@redhat.com>
>Signed-off-by: Bart Van Assche <bvanassche@acm.org>
>---
> block/blk-zoned.c | 9 +++++++--
> 1 file changed, 7 insertions(+), 2 deletions(-)
>
>diff --git a/block/blk-zoned.c b/block/blk-zoned.c
>index 112620985bff..d8a80cce832f 100644
>--- a/block/blk-zoned.c
>+++ b/block/blk-zoned.c
>@@ -53,11 +53,16 @@ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
> EXPORT_SYMBOL_GPL(blk_zone_cond_str);
>
> /*
>- * Return true if a request is a write requests that needs zone write locking.
>+ * Return true if a request is a write request that needs zone write locking.
> */
> bool blk_req_needs_zone_write_lock(struct request *rq)
> {
>- if (!rq->q->disk->seq_zones_wlock)
>+ struct request_queue *q = rq->q;
>+
>+ if (!q->limits.use_zone_write_lock)
>+ return false;
>+
>+ if (!q->disk->seq_zones_wlock)
> return false;
>
> return blk_rq_is_seq_zoned_write(rq);
Reviewed-by: Nitesh Shetty <nj.shetty@samsung.com>
[-- Attachment #2: Type: text/plain, Size: 0 bytes --]
^ permalink raw reply [flat|nested] 31+ messages in thread
* [PATCH v11 03/16] block/mq-deadline: Only use zone locking if necessary
2023-08-22 19:16 [PATCH v11 00/16] Improve write performance for zoned UFS devices Bart Van Assche
2023-08-22 19:16 ` [PATCH v11 01/16] block: Introduce more member variables related to zone write locking Bart Van Assche
2023-08-22 19:16 ` [PATCH v11 02/16] block: Only use write locking if necessary Bart Van Assche
@ 2023-08-22 19:16 ` Bart Van Assche
2023-08-23 6:21 ` Hannes Reinecke
2023-08-23 8:23 ` Nitesh Shetty
2023-08-22 19:16 ` [PATCH v11 04/16] scsi: core: Introduce a mechanism for reordering requests in the error handler Bart Van Assche
` (12 subsequent siblings)
15 siblings, 2 replies; 31+ messages in thread
From: Bart Van Assche @ 2023-08-22 19:16 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, linux-scsi, Martin K . Petersen, Christoph Hellwig,
Bart Van Assche, Damien Le Moal, Ming Lei
Measurements have shown that limiting the queue depth to one per zone for
zoned writes has a significant negative performance impact on zoned UFS
devices. Hence this patch that disables zone locking by the mq-deadline
scheduler if the storage controller preserves the command order. This
patch is based on the following assumptions:
- It happens infrequently that zoned write requests are reordered by the
block layer.
- The I/O priority of all write requests is the same per zone.
- Either no I/O scheduler is used or an I/O scheduler is used that
serializes write requests per zone.
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
block/mq-deadline.c | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index f958e79277b8..082ccf3186f4 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -353,7 +353,7 @@ deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
return NULL;
rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next);
- if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q))
+ if (data_dir == DD_READ || !rq->q->limits.use_zone_write_lock)
return rq;
/*
@@ -398,7 +398,7 @@ deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
if (!rq)
return NULL;
- if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q))
+ if (data_dir == DD_READ || !rq->q->limits.use_zone_write_lock)
return rq;
/*
@@ -526,8 +526,9 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
}
/*
- * For a zoned block device, if we only have writes queued and none of
- * them can be dispatched, rq will be NULL.
+ * For a zoned block device that requires write serialization, if we
+ * only have writes queued and none of them can be dispatched, rq will
+ * be NULL.
*/
if (!rq)
return NULL;
@@ -934,7 +935,7 @@ static void dd_finish_request(struct request *rq)
atomic_inc(&per_prio->stats.completed);
- if (blk_queue_is_zoned(q)) {
+ if (rq->q->limits.use_zone_write_lock) {
unsigned long flags;
spin_lock_irqsave(&dd->zone_lock, flags);
^ permalink raw reply related [flat|nested] 31+ messages in thread* Re: [PATCH v11 03/16] block/mq-deadline: Only use zone locking if necessary
2023-08-22 19:16 ` [PATCH v11 03/16] block/mq-deadline: Only use zone " Bart Van Assche
@ 2023-08-23 6:21 ` Hannes Reinecke
2023-08-23 8:23 ` Nitesh Shetty
1 sibling, 0 replies; 31+ messages in thread
From: Hannes Reinecke @ 2023-08-23 6:21 UTC (permalink / raw)
To: Bart Van Assche, Jens Axboe
Cc: linux-block, linux-scsi, Martin K . Petersen, Christoph Hellwig,
Damien Le Moal, Ming Lei
On 8/22/23 21:16, Bart Van Assche wrote:
> Measurements have shown that limiting the queue depth to one per zone for
> zoned writes has a significant negative performance impact on zoned UFS
> devices. Hence this patch that disables zone locking by the mq-deadline
> scheduler if the storage controller preserves the command order. This
> patch is based on the following assumptions:
> - It happens infrequently that zoned write requests are reordered by the
> block layer.
> - The I/O priority of all write requests is the same per zone.
> - Either no I/O scheduler is used or an I/O scheduler is used that
> serializes write requests per zone.
>
> Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: Ming Lei <ming.lei@redhat.com>
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
> ---
> block/mq-deadline.c | 11 ++++++-----
> 1 file changed, 6 insertions(+), 5 deletions(-)
> Reviewed-by: Hannes Reinecke <hare@suse.de>
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Ivo Totev, Andrew
Myers, Andrew McDonald, Martje Boudien Moerman
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [PATCH v11 03/16] block/mq-deadline: Only use zone locking if necessary
2023-08-22 19:16 ` [PATCH v11 03/16] block/mq-deadline: Only use zone " Bart Van Assche
2023-08-23 6:21 ` Hannes Reinecke
@ 2023-08-23 8:23 ` Nitesh Shetty
1 sibling, 0 replies; 31+ messages in thread
From: Nitesh Shetty @ 2023-08-23 8:23 UTC (permalink / raw)
To: Bart Van Assche
Cc: Jens Axboe, linux-block, linux-scsi, Martin K . Petersen,
Christoph Hellwig, Damien Le Moal, Ming Lei
[-- Attachment #1: Type: text/plain, Size: 968 bytes --]
On 23/08/22 12:16PM, Bart Van Assche wrote:
>Measurements have shown that limiting the queue depth to one per zone for
>zoned writes has a significant negative performance impact on zoned UFS
>devices. Hence this patch that disables zone locking by the mq-deadline
>scheduler if the storage controller preserves the command order. This
>patch is based on the following assumptions:
>- It happens infrequently that zoned write requests are reordered by the
> block layer.
>- The I/O priority of all write requests is the same per zone.
>- Either no I/O scheduler is used or an I/O scheduler is used that
> serializes write requests per zone.
>
>Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
>Cc: Christoph Hellwig <hch@lst.de>
>Cc: Ming Lei <ming.lei@redhat.com>
>Signed-off-by: Bart Van Assche <bvanassche@acm.org>
>---
> block/mq-deadline.c | 11 ++++++-----
> 1 file changed, 6 insertions(+), 5 deletions(-)
>
Reviewed-by: Nitesh Shetty <nj.shetty@samsung.com>
[-- Attachment #2: Type: text/plain, Size: 0 bytes --]
^ permalink raw reply [flat|nested] 31+ messages in thread
* [PATCH v11 04/16] scsi: core: Introduce a mechanism for reordering requests in the error handler
2023-08-22 19:16 [PATCH v11 00/16] Improve write performance for zoned UFS devices Bart Van Assche
` (2 preceding siblings ...)
2023-08-22 19:16 ` [PATCH v11 03/16] block/mq-deadline: Only use zone " Bart Van Assche
@ 2023-08-22 19:16 ` Bart Van Assche
2023-08-23 6:26 ` Hannes Reinecke
2023-08-22 19:17 ` [PATCH v11 05/16] scsi: core: Add unit tests for scsi_call_prepare_resubmit() Bart Van Assche
` (11 subsequent siblings)
15 siblings, 1 reply; 31+ messages in thread
From: Bart Van Assche @ 2023-08-22 19:16 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, linux-scsi, Martin K . Petersen, Christoph Hellwig,
Bart Van Assche, Damien Le Moal, Ming Lei, James E.J. Bottomley
Introduce the .eh_needs_prepare_resubmit and the .eh_prepare_resubmit
function pointers in struct scsi_driver. Make the error handler call
.eh_prepare_resubmit() before resubmitting commands if any of the
.eh_needs_prepare_resubmit() invocations return true. A later patch
will use this functionality to sort SCSI commands by LBA from inside
the SCSI disk driver before these are resubmitted by the error handler.
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Damien Le Moal <dlemoal@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
drivers/scsi/scsi_error.c | 65 ++++++++++++++++++++++++++++++++++++++
drivers/scsi/scsi_priv.h | 1 +
include/scsi/scsi_driver.h | 2 ++
3 files changed, 68 insertions(+)
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index c67cdcdc3ba8..c4d817f044a0 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -27,6 +27,7 @@
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/jiffies.h>
+#include <linux/list_sort.h>
#include <scsi/scsi.h>
#include <scsi/scsi_cmnd.h>
@@ -2186,6 +2187,68 @@ void scsi_eh_ready_devs(struct Scsi_Host *shost,
}
EXPORT_SYMBOL_GPL(scsi_eh_ready_devs);
+/*
+ * Returns true if .eh_prepare_resubmit should be called for the commands in
+ * @done_q.
+ */
+static bool scsi_needs_preparation(struct list_head *done_q)
+{
+ struct scsi_cmnd *scmd;
+
+ list_for_each_entry(scmd, done_q, eh_entry) {
+ struct scsi_driver *uld = scsi_cmd_to_driver(scmd);
+ bool (*npr)(struct scsi_cmnd *) = uld->eh_needs_prepare_resubmit;
+
+ if (npr && npr(scmd))
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Comparison function that allows to sort SCSI commands by ULD driver.
+ */
+static int scsi_cmp_uld(void *priv, const struct list_head *_a,
+ const struct list_head *_b)
+{
+ struct scsi_cmnd *a = list_entry(_a, typeof(*a), eh_entry);
+ struct scsi_cmnd *b = list_entry(_b, typeof(*b), eh_entry);
+
+ /* See also the comment above the list_sort() definition. */
+ return scsi_cmd_to_driver(a) > scsi_cmd_to_driver(b);
+}
+
+void scsi_call_prepare_resubmit(struct list_head *done_q)
+{
+ struct scsi_cmnd *scmd, *next;
+
+ if (!scsi_needs_preparation(done_q))
+ return;
+
+ /* Sort pending SCSI commands by ULD. */
+ list_sort(NULL, done_q, scsi_cmp_uld);
+
+ /*
+ * Call .eh_prepare_resubmit for each range of commands with identical
+ * ULD driver pointer.
+ */
+ list_for_each_entry_safe(scmd, next, done_q, eh_entry) {
+ struct scsi_driver *uld = scsi_cmd_to_driver(scmd);
+ struct list_head *prev, uld_cmd_list;
+
+ while (&next->eh_entry != done_q &&
+ scsi_cmd_to_driver(next) == uld)
+ next = list_next_entry(next, eh_entry);
+ if (!uld->eh_prepare_resubmit)
+ continue;
+ prev = scmd->eh_entry.prev;
+ list_cut_position(&uld_cmd_list, prev, next->eh_entry.prev);
+ uld->eh_prepare_resubmit(&uld_cmd_list);
+ list_splice(&uld_cmd_list, prev);
+ }
+}
+
/**
* scsi_eh_flush_done_q - finish processed commands or retry them.
* @done_q: list_head of processed commands.
@@ -2194,6 +2257,8 @@ void scsi_eh_flush_done_q(struct list_head *done_q)
{
struct scsi_cmnd *scmd, *next;
+ scsi_call_prepare_resubmit(done_q);
+
list_for_each_entry_safe(scmd, next, done_q, eh_entry) {
list_del_init(&scmd->eh_entry);
if (scsi_device_online(scmd->device) &&
diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h
index f42388ecb024..df4af4645430 100644
--- a/drivers/scsi/scsi_priv.h
+++ b/drivers/scsi/scsi_priv.h
@@ -101,6 +101,7 @@ int scsi_eh_get_sense(struct list_head *work_q,
struct list_head *done_q);
bool scsi_noretry_cmd(struct scsi_cmnd *scmd);
void scsi_eh_done(struct scsi_cmnd *scmd);
+void scsi_call_prepare_resubmit(struct list_head *done_q);
/* scsi_lib.c */
extern int scsi_maybe_unblock_host(struct scsi_device *sdev);
diff --git a/include/scsi/scsi_driver.h b/include/scsi/scsi_driver.h
index 4ce1988b2ba0..00ffa470724a 100644
--- a/include/scsi/scsi_driver.h
+++ b/include/scsi/scsi_driver.h
@@ -18,6 +18,8 @@ struct scsi_driver {
int (*done)(struct scsi_cmnd *);
int (*eh_action)(struct scsi_cmnd *, int);
void (*eh_reset)(struct scsi_cmnd *);
+ bool (*eh_needs_prepare_resubmit)(struct scsi_cmnd *cmd);
+ void (*eh_prepare_resubmit)(struct list_head *cmd_list);
};
#define to_scsi_driver(drv) \
container_of((drv), struct scsi_driver, gendrv)
^ permalink raw reply related [flat|nested] 31+ messages in thread* Re: [PATCH v11 04/16] scsi: core: Introduce a mechanism for reordering requests in the error handler
2023-08-22 19:16 ` [PATCH v11 04/16] scsi: core: Introduce a mechanism for reordering requests in the error handler Bart Van Assche
@ 2023-08-23 6:26 ` Hannes Reinecke
2023-08-23 15:15 ` Bart Van Assche
0 siblings, 1 reply; 31+ messages in thread
From: Hannes Reinecke @ 2023-08-23 6:26 UTC (permalink / raw)
To: Bart Van Assche, Jens Axboe
Cc: linux-block, linux-scsi, Martin K . Petersen, Christoph Hellwig,
Damien Le Moal, Ming Lei, James E.J. Bottomley
On 8/22/23 21:16, Bart Van Assche wrote:
> Introduce the .eh_needs_prepare_resubmit and the .eh_prepare_resubmit
> function pointers in struct scsi_driver. Make the error handler call
> .eh_prepare_resubmit() before resubmitting commands if any of the
> .eh_needs_prepare_resubmit() invocations return true. A later patch
> will use this functionality to sort SCSI commands by LBA from inside
> the SCSI disk driver before these are resubmitted by the error handler.
>
> Cc: Martin K. Petersen <martin.petersen@oracle.com>
> Cc: Damien Le Moal <dlemoal@kernel.org>
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: Ming Lei <ming.lei@redhat.com>
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
> ---
> drivers/scsi/scsi_error.c | 65 ++++++++++++++++++++++++++++++++++++++
> drivers/scsi/scsi_priv.h | 1 +
> include/scsi/scsi_driver.h | 2 ++
> 3 files changed, 68 insertions(+)
>
> diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
> index c67cdcdc3ba8..c4d817f044a0 100644
> --- a/drivers/scsi/scsi_error.c
> +++ b/drivers/scsi/scsi_error.c
> @@ -27,6 +27,7 @@
> #include <linux/blkdev.h>
> #include <linux/delay.h>
> #include <linux/jiffies.h>
> +#include <linux/list_sort.h>
>
> #include <scsi/scsi.h>
> #include <scsi/scsi_cmnd.h>
> @@ -2186,6 +2187,68 @@ void scsi_eh_ready_devs(struct Scsi_Host *shost,
> }
> EXPORT_SYMBOL_GPL(scsi_eh_ready_devs);
>
> +/*
> + * Returns true if .eh_prepare_resubmit should be called for the commands in
> + * @done_q.
> + */
> +static bool scsi_needs_preparation(struct list_head *done_q)
> +{
> + struct scsi_cmnd *scmd;
> +
> + list_for_each_entry(scmd, done_q, eh_entry) {
> + struct scsi_driver *uld = scsi_cmd_to_driver(scmd);
> + bool (*npr)(struct scsi_cmnd *) = uld->eh_needs_prepare_resubmit;
> +
> + if (npr && npr(scmd))
> + return true;
> + }
> +
> + return false;
> +}
> + > +/*
> + * Comparison function that allows to sort SCSI commands by ULD driver.
> + */
> +static int scsi_cmp_uld(void *priv, const struct list_head *_a,
> + const struct list_head *_b)
> +{
> + struct scsi_cmnd *a = list_entry(_a, typeof(*a), eh_entry);
> + struct scsi_cmnd *b = list_entry(_b, typeof(*b), eh_entry);
> +
> + /* See also the comment above the list_sort() definition. */
> + return scsi_cmd_to_driver(a) > scsi_cmd_to_driver(b);
I have to agree with Christoph here.
Comparing LBA numbers at the SCSI level is really the wrong place.
SCSI commands might be anything, and quite some of these commands don't
even have LBA numbers. So trying to order them will be pointless.
The reordering mechanism really has to go into the block layer, with
the driver failing the request and the block layer resubmitting in-order.
Sorry.
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Ivo Totev, Andrew
Myers, Andrew McDonald, Martje Boudien Moerman
^ permalink raw reply [flat|nested] 31+ messages in thread* Re: [PATCH v11 04/16] scsi: core: Introduce a mechanism for reordering requests in the error handler
2023-08-23 6:26 ` Hannes Reinecke
@ 2023-08-23 15:15 ` Bart Van Assche
2023-08-23 23:22 ` Damien Le Moal
0 siblings, 1 reply; 31+ messages in thread
From: Bart Van Assche @ 2023-08-23 15:15 UTC (permalink / raw)
To: Jens Axboe
Cc: Hannes Reinecke, linux-block, linux-scsi, Martin K . Petersen,
Christoph Hellwig, Damien Le Moal, Ming Lei, James E.J. Bottomley
On 8/22/23 23:26, Hannes Reinecke wrote:
> On 8/22/23 21:16, Bart Van Assche wrote:
>> +/*
>> + * Comparison function that allows to sort SCSI commands by ULD driver.
>> + */
>> +static int scsi_cmp_uld(void *priv, const struct list_head *_a,
>> + const struct list_head *_b)
>> +{
>> + struct scsi_cmnd *a = list_entry(_a, typeof(*a), eh_entry);
>> + struct scsi_cmnd *b = list_entry(_b, typeof(*b), eh_entry);
>> +
>> + /* See also the comment above the list_sort() definition. */
>> + return scsi_cmd_to_driver(a) > scsi_cmd_to_driver(b);
>
> I have to agree with Christoph here.
> Comparing LBA numbers at the SCSI level is really the wrong place.
> SCSI commands might be anything, and quite some of these commands don't
> even have LBA numbers. So trying to order them will be pointless.
>
> The reordering mechanism really has to go into the block layer, with
> the driver failing the request and the block layer resubmitting in-order.
Hi Hannes,
Please take another look at patches 04/16 and 05/16. As one can see no
LBA numbers are being compared in the SCSI core - comparing LBA numbers
happens in the sd (SCSI disk) driver. The code that you replied to
compares ULD pointers, a well-defined concept in the SCSI core.
Your request to move the functionality from patches 04/16 and 05/16 into
the block layer would involve the following:
* Report the unaligned write errors (because a write did not happen at the
write pointer) to the block layer (BLK_STS_WP_MISMATCH?).
* Introduce a mechanism in the block layer for postponing error handling
until all outstanding commands have failed. The approach from the SCSI
core (tracking the number of failed and the number of busy commands
and only waking up the error handler after these counters are equal)
would be unacceptable because of the runtime overhead this mechanism
would introduce in the block layer hot path. Additionally, I strongly
doubt that it is possible to introduce any mechanism for postponing
error handling in the block layer without introducing additional
overhead in the hot path.
* Christoph's opinion is that NVMe software should use zone append
(REQ_OP_ZONE_APPEND) instead of regular writes (REQ_OP_WRITE) when
writing to a zoned namespace. So the SCSI subsystem would be the only
user of the new mechanism introduced in the block layer. The reason we
chose REQ_OP_WRITE for zoned UFS devices is because the SCSI standard
does not support a zone append command and introducing a zone append
command in the SCSI standards is not something that can be realized in
time for the first generation of zoned UFS devices.
Because I assume that both Jens and Christoph disagree strongly with your
request: I have no plans to move the code for sorting zoned writes into
the block layer core.
Jens and Christoph, please correct me if I misunderstood something.
Thanks,
Bart.
^ permalink raw reply [flat|nested] 31+ messages in thread* Re: [PATCH v11 04/16] scsi: core: Introduce a mechanism for reordering requests in the error handler
2023-08-23 15:15 ` Bart Van Assche
@ 2023-08-23 23:22 ` Damien Le Moal
2023-08-24 14:47 ` Bart Van Assche
0 siblings, 1 reply; 31+ messages in thread
From: Damien Le Moal @ 2023-08-23 23:22 UTC (permalink / raw)
To: Bart Van Assche, Jens Axboe
Cc: Hannes Reinecke, linux-block, linux-scsi, Martin K . Petersen,
Christoph Hellwig, Ming Lei, James E.J. Bottomley
On 8/24/23 00:15, Bart Van Assche wrote:
> On 8/22/23 23:26, Hannes Reinecke wrote:
>> On 8/22/23 21:16, Bart Van Assche wrote:
>>> +/*
>>> + * Comparison function that allows to sort SCSI commands by ULD driver.
>>> + */
>>> +static int scsi_cmp_uld(void *priv, const struct list_head *_a,
>>> + const struct list_head *_b)
>>> +{
>>> + struct scsi_cmnd *a = list_entry(_a, typeof(*a), eh_entry);
>>> + struct scsi_cmnd *b = list_entry(_b, typeof(*b), eh_entry);
>>> +
>>> + /* See also the comment above the list_sort() definition. */
>>> + return scsi_cmd_to_driver(a) > scsi_cmd_to_driver(b);
>>
>> I have to agree with Christoph here.
>> Comparing LBA numbers at the SCSI level is really the wrong place.
>> SCSI commands might be anything, and quite some of these commands don't
>> even have LBA numbers. So trying to order them will be pointless.
>>
>> The reordering mechanism really has to go into the block layer, with
>> the driver failing the request and the block layer resubmitting in-order.
>
> Hi Hannes,
>
> Please take another look at patches 04/16 and 05/16. As one can see no
> LBA numbers are being compared in the SCSI core - comparing LBA numbers
> happens in the sd (SCSI disk) driver. The code that you replied to
> compares ULD pointers, a well-defined concept in the SCSI core.
>
> Your request to move the functionality from patches 04/16 and 05/16 into
> the block layer would involve the following:
> * Report the unaligned write errors (because a write did not happen at the
> write pointer) to the block layer (BLK_STS_WP_MISMATCH?).
> * Introduce a mechanism in the block layer for postponing error handling
> until all outstanding commands have failed. The approach from the SCSI
> core (tracking the number of failed and the number of busy commands
> and only waking up the error handler after these counters are equal)
> would be unacceptable because of the runtime overhead this mechanism
> would introduce in the block layer hot path. Additionally, I strongly
> doubt that it is possible to introduce any mechanism for postponing
> error handling in the block layer without introducing additional
> overhead in the hot path.
> * Christoph's opinion is that NVMe software should use zone append
> (REQ_OP_ZONE_APPEND) instead of regular writes (REQ_OP_WRITE) when
> writing to a zoned namespace. So the SCSI subsystem would be the only
> user of the new mechanism introduced in the block layer. The reason we
> chose REQ_OP_WRITE for zoned UFS devices is because the SCSI standard
> does not support a zone append command and introducing a zone append
> command in the SCSI standards is not something that can be realized in
> time for the first generation of zoned UFS devices.
The sd driver does have zone append emulation using regular writes. The
emulation relies on zone write locking to avoid issues with adapters that do not
have strong ordering guarantees, but that could be adapted to be removed for UFS
devices with write ordering guarantees. This solution would greatly simplify
your series since zone append requests are not subject to zone write locking at
the block layer. So no changes would be needed at that layer.
However, that implies that your preferred use case (f2fs) must be adapted to use
zone append instead of regular writes. That in itself may be a bigger-ish
change, but in the long run, likely a better one I think as that would be
compatible with NVMe ZNS and also future UFS standards defining a zone append
command.
>
> Because I assume that both Jens and Christoph disagree strongly with your
> request: I have no plans to move the code for sorting zoned writes into
> the block layer core.
>
> Jens and Christoph, please correct me if I misunderstood something.
>
> Thanks,
>
> Bart.
--
Damien Le Moal
Western Digital Research
^ permalink raw reply [flat|nested] 31+ messages in thread* Re: [PATCH v11 04/16] scsi: core: Introduce a mechanism for reordering requests in the error handler
2023-08-23 23:22 ` Damien Le Moal
@ 2023-08-24 14:47 ` Bart Van Assche
2023-08-24 16:44 ` Hannes Reinecke
0 siblings, 1 reply; 31+ messages in thread
From: Bart Van Assche @ 2023-08-24 14:47 UTC (permalink / raw)
To: Damien Le Moal, Jens Axboe
Cc: Hannes Reinecke, linux-block, linux-scsi, Martin K . Petersen,
Christoph Hellwig, Ming Lei, James E.J. Bottomley, Jaegeuk Kim
On 8/23/23 16:22, Damien Le Moal wrote:
> The sd driver does have zone append emulation using regular writes. The
> emulation relies on zone write locking to avoid issues with adapters that do not
> have strong ordering guarantees, but that could be adapted to be removed for UFS
> devices with write ordering guarantees. This solution would greatly simplify
> your series since zone append requests are not subject to zone write locking at
> the block layer. So no changes would be needed at that layer.
>
> However, that implies that your preferred use case (f2fs) must be adapted to use
> zone append instead of regular writes. That in itself may be a bigger-ish
> change, but in the long run, likely a better one I think as that would be
> compatible with NVMe ZNS and also future UFS standards defining a zone append
> command.
Hi Damien,
Thanks for the feedback. I agree that it would be great to have zone append
support in F2FS. However, I do not agree that switching from regular writes
to zone append in F2FS would remove the need for sorting SCSI commands by LBA
in the SCSI error handler. Even if F2FS would submit zoned writes then the
following mechanisms could still cause reordering of the zoned writes after
these have been translated into regular writes:
* The SCSI protocol allows SCSI devices, including UFS devices, to respond
with a unit attention or the SCSI BUSY status at any time. If multiple write
commands are pending and some of the pending SCSI writes are not executed
because of a unit attention or because of another reason, this causes
command reordering.
* Although the link between the UFS controller and the UFS device is pretty
reliable, there is a non-zero chance that a SCSI command is lost. If this
happens the SCSI timeout and error handlers are activated. This can cause
reordering of write commands.
In other words, whether F2FS submits regular writes (REQ_OP_WRITE) or zone
appends (REQ_OP_ZONE_APPEND), I think we need the entire patch series.
Thanks,
Bart.
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [PATCH v11 04/16] scsi: core: Introduce a mechanism for reordering requests in the error handler
2023-08-24 14:47 ` Bart Van Assche
@ 2023-08-24 16:44 ` Hannes Reinecke
2023-08-24 16:52 ` Bart Van Assche
0 siblings, 1 reply; 31+ messages in thread
From: Hannes Reinecke @ 2023-08-24 16:44 UTC (permalink / raw)
To: Bart Van Assche, Damien Le Moal, Jens Axboe
Cc: linux-block, linux-scsi, Martin K . Petersen, Christoph Hellwig,
Ming Lei, James E.J. Bottomley, Jaegeuk Kim
On 8/24/23 16:47, Bart Van Assche wrote:
> On 8/23/23 16:22, Damien Le Moal wrote:
>> The sd driver does have zone append emulation using regular writes. The
>> emulation relies on zone write locking to avoid issues with adapters
>> that do not
>> have strong ordering guarantees, but that could be adapted to be
>> removed for UFS
>> devices with write ordering guarantees. This solution would greatly
>> simplify
>> your series since zone append requests are not subject to zone write
>> locking at
>> the block layer. So no changes would be needed at that layer.
>>
>> However, that implies that your preferred use case (f2fs) must be
>> adapted to use
>> zone append instead of regular writes. That in itself may be a bigger-ish
>> change, but in the long run, likely a better one I think as that would be
>> compatible with NVMe ZNS and also future UFS standards defining a zone
>> append
>> command.
>
> Hi Damien,
>
> Thanks for the feedback. I agree that it would be great to have zone append
> support in F2FS. However, I do not agree that switching from regular writes
> to zone append in F2FS would remove the need for sorting SCSI commands
> by LBA in the SCSI error handler. Even if F2FS would submit zoned writes
> then the following mechanisms could still cause reordering of the zoned
> write after these have been translated into regular writes:
> * The SCSI protocol allows SCSI devices, including UFS devices, to respond
> with a unit attention or the SCSI BUSY status at any time. If multiple
> write commands are pending and some of the pending SCSI writes are not
> executed because of a unit attention or because of another reason, this
> causes command reordering.
Yes. But the important thing to remember is that with 'zone append' the
resulting LBA will be returned on completion, they will _not_ be
specified in the submission. So any command reordering doesn't affect
the zone append commands as they heven't been written yet.
> * Although the link between the UFS controller and the UFS device is pretty
> reliable, there is a non-zero chance that a SCSI command is lost. If this
> happens the SCSI timeout and error handlers are activated. This can cause
> reordering of write commands.
>
Again, reordering is not an issue with zone append. With zone append you
specify in which zone the command should land, and upon completion the
LBA where the data is written will be returned.
So if there is an error the command has not been written, consequently
there is no LBA to worry about, and you can reorder at will.
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Ivo Totev, Andrew
Myers, Andrew McDonald, Martje Boudien Moerman
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [PATCH v11 04/16] scsi: core: Introduce a mechanism for reordering requests in the error handler
2023-08-24 16:44 ` Hannes Reinecke
@ 2023-08-24 16:52 ` Bart Van Assche
2023-08-24 23:53 ` Damien Le Moal
0 siblings, 1 reply; 31+ messages in thread
From: Bart Van Assche @ 2023-08-24 16:52 UTC (permalink / raw)
To: Hannes Reinecke, Damien Le Moal, Jens Axboe
Cc: linux-block, linux-scsi, Martin K . Petersen, Christoph Hellwig,
Ming Lei, James E.J. Bottomley, Jaegeuk Kim
On 8/24/23 09:44, Hannes Reinecke wrote:
> On 8/24/23 16:47, Bart Van Assche wrote:
>> Thanks for the feedback. I agree that it would be great to have zone
>> append
>> support in F2FS. However, I do not agree that switching from regular
>> writes
>> to zone append in F2FS would remove the need for sorting SCSI commands
>> by LBA in the SCSI error handler. Even if F2FS would submit zoned writes
>> then the following mechanisms could still cause reordering of the zoned
>> write after these have been translated into regular writes:
>> * The SCSI protocol allows SCSI devices, including UFS devices, to
>> respond
>> with a unit attention or the SCSI BUSY status at any time. If multiple
>> write commands are pending and some of the pending SCSI writes are not
>> executed because of a unit attention or because of another reason, this
>> causes command reordering.
>
> Yes. But the important thing to remember is that with 'zone append' the
> resulting LBA will be returned on completion, they will _not_ be
> specified in the submission. So any command reordering doesn't affect
> the zone append commands as they heven't been written yet.
>
>> * Although the link between the UFS controller and the UFS device is
>> pretty
>> reliable, there is a non-zero chance that a SCSI command is lost. If this
>> happens the SCSI timeout and error handlers are activated. This can cause
>> reordering of write commands.
>>
> Again, reordering is not an issue with zone append. With zone append you
> specify in which zone the command should land, and upon completion the
> LBA where the data is written will be returned.
>
> So if there is an error the command has not been written, consequently
> there is no LBA to worry about, and you can reorder at will.
Hi Hannes,
I agree that reordering is not an issue for NVMe zone append commands.
It is an issue however with SCSI devices because there is no zone append
command in the SCSI command set. The sd_zbc.c code translates zone
appends (REQ_OP_ZONE_APPEND) into regular WRITE commands. If these WRITE
commands are reordered, the ZBC standard requires that these commands
fail with an UNALIGNED WRITE error. So I think for SCSI devices what you
wrote is wrong.
Bart.
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [PATCH v11 04/16] scsi: core: Introduce a mechanism for reordering requests in the error handler
2023-08-24 16:52 ` Bart Van Assche
@ 2023-08-24 23:53 ` Damien Le Moal
2023-08-25 1:00 ` Bart Van Assche
0 siblings, 1 reply; 31+ messages in thread
From: Damien Le Moal @ 2023-08-24 23:53 UTC (permalink / raw)
To: Bart Van Assche, Hannes Reinecke, Jens Axboe
Cc: linux-block, linux-scsi, Martin K . Petersen, Christoph Hellwig,
Ming Lei, James E.J. Bottomley, Jaegeuk Kim
On 8/25/23 01:52, Bart Van Assche wrote:
> On 8/24/23 09:44, Hannes Reinecke wrote:
>> On 8/24/23 16:47, Bart Van Assche wrote:
>>> Thanks for the feedback. I agree that it would be great to have zone
>>> append
>>> support in F2FS. However, I do not agree that switching from regular
>>> writes
>>> to zone append in F2FS would remove the need for sorting SCSI commands
>>> by LBA in the SCSI error handler. Even if F2FS would submit zoned writes
>>> then the following mechanisms could still cause reordering of the zoned
>>> write after these have been translated into regular writes:
>>> * The SCSI protocol allows SCSI devices, including UFS devices, to
>>> respond
>>> with a unit attention or the SCSI BUSY status at any time. If multiple
>>> write commands are pending and some of the pending SCSI writes are not
>>> executed because of a unit attention or because of another reason, this
>>> causes command reordering.
>>
>> Yes. But the important thing to remember is that with 'zone append' the
>> resulting LBA will be returned on completion, they will _not_ be
>> specified in the submission. So any command reordering doesn't affect
>> the zone append commands as they heven't been written yet.
>>
>>> * Although the link between the UFS controller and the UFS device is
>>> pretty
>>> reliable, there is a non-zero chance that a SCSI command is lost. If this
>>> happens the SCSI timeout and error handlers are activated. This can cause
>>> reordering of write commands.
>>>
>> Again, reordering is not an issue with zone append. With zone append you
>> specify in which zone the command should land, and upon completion the
>> LBA where the data is written will be returned.
>>
>> So if there is an error the command has not been written, consequently
>> there is no LBA to worry about, and you can reorder at will.
>
> Hi Hannes,
>
> I agree that reordering is not an issue for NVMe zone append commands.
> It is an issue however with SCSI devices because there is no zone append
> command in the SCSI command set. The sd_zbc.c code translates zone
> appends (REQ_OP_ZONE_APPEND) into regular WRITE commands. If these WRITE
> commands are reordered, the ZBC standard requires that these commands
> fail with an UNALIGNED WRITE error. So I think for SCSI devices what you
> wrote is wrong.
I think that Hannes point was that if you ensure that the rejected regular write
commands used to emulate zone append when requeued go through the sd driver
again when resubmitted, they will be changed again to emulate the original zone
append using the latest wp location, which is assumed correct. And that does not
depend on the ordering. So requeuing these regular writes does not need sorting.
It can be in any order. The constraint is of course that they must be re-preped
from the original REQ_OP_ZONE_APPEND every time they are requeued.
--
Damien Le Moal
Western Digital Research
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [PATCH v11 04/16] scsi: core: Introduce a mechanism for reordering requests in the error handler
2023-08-24 23:53 ` Damien Le Moal
@ 2023-08-25 1:00 ` Bart Van Assche
0 siblings, 0 replies; 31+ messages in thread
From: Bart Van Assche @ 2023-08-25 1:00 UTC (permalink / raw)
To: Damien Le Moal, Hannes Reinecke, Jens Axboe
Cc: linux-block, linux-scsi, Martin K . Petersen, Christoph Hellwig,
Ming Lei, James E.J. Bottomley, Jaegeuk Kim
On 8/24/23 16:53, Damien Le Moal wrote:
> I think that Hannes point was that if you ensure that the rejected regular write
> commands used to emulate zone append when requeued go through the sd driver
> again when resubmitted, they will be changed again to emulate the original zone
> append using the latest wp location, which is assumed correct. And that does not
> depend on the ordering. So requeuing these regular writes does not need sorting.
> It can be in any order. The constraint is of course that they must be re-preped
> from the original REQ_OP_ZONE_APPEND every time they are requeued.
Hi Damien,
Although this is a possible approach, I do not agree that this is a better
approach. Compared to the patch series that I posted, the disadvantages of
this approach are as follows:
* It relies on the zone append emulation. Although that emulation is not too
complicated, it adds more code in the hot path. The spin lock in the zone
append emulation is expected to cause performance issues at the performance
levels supported by UFS devices. Current UFS devices already support more
than 300 K IOPS and future UFS devices are expected to support even higher
performance.
* It would be challenging to derive the write pointer in the error handler
without submitting a REPORT ZONES command to the storage device. This
means that the error handler becomes more complicated.
In summary: the feedback is appreciated but I do not agree that the zone
append approach is a better approach for UFS devices than the approach of
this patch series.
Thanks,
Bart.
^ permalink raw reply [flat|nested] 31+ messages in thread
* [PATCH v11 05/16] scsi: core: Add unit tests for scsi_call_prepare_resubmit()
2023-08-22 19:16 [PATCH v11 00/16] Improve write performance for zoned UFS devices Bart Van Assche
` (3 preceding siblings ...)
2023-08-22 19:16 ` [PATCH v11 04/16] scsi: core: Introduce a mechanism for reordering requests in the error handler Bart Van Assche
@ 2023-08-22 19:17 ` Bart Van Assche
2023-08-22 19:17 ` [PATCH v11 06/16] scsi: sd: Sort commands by LBA before resubmitting Bart Van Assche
` (10 subsequent siblings)
15 siblings, 0 replies; 31+ messages in thread
From: Bart Van Assche @ 2023-08-22 19:17 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, linux-scsi, Martin K . Petersen, Christoph Hellwig,
Bart Van Assche, Damien Le Moal, Ming Lei, James E.J. Bottomley
Triggering all code paths in scsi_call_prepare_resubmit() via manual
testing is difficult. Hence add unit tests for this function.
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Damien Le Moal <dlemoal@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
drivers/scsi/Kconfig | 2 +
drivers/scsi/Kconfig.kunit | 4 +
drivers/scsi/Makefile | 2 +
drivers/scsi/Makefile.kunit | 1 +
drivers/scsi/scsi_error_test.c | 207 +++++++++++++++++++++++++++++++++
5 files changed, 216 insertions(+)
create mode 100644 drivers/scsi/Kconfig.kunit
create mode 100644 drivers/scsi/Makefile.kunit
create mode 100644 drivers/scsi/scsi_error_test.c
diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
index 4962ce989113..fc288f8fb800 100644
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -232,6 +232,8 @@ config SCSI_SCAN_ASYNC
Note that this setting also affects whether resuming from
system suspend will be performed asynchronously.
+source "drivers/scsi/Kconfig.kunit"
+
menu "SCSI Transports"
depends on SCSI
diff --git a/drivers/scsi/Kconfig.kunit b/drivers/scsi/Kconfig.kunit
new file mode 100644
index 000000000000..90984a6ec7cc
--- /dev/null
+++ b/drivers/scsi/Kconfig.kunit
@@ -0,0 +1,4 @@
+config SCSI_ERROR_TEST
+ tristate "scsi_error.c unit tests" if !KUNIT_ALL_TESTS
+ depends on SCSI && KUNIT
+ default KUNIT_ALL_TESTS
diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile
index f055bfd54a68..1c5c3afb6c6e 100644
--- a/drivers/scsi/Makefile
+++ b/drivers/scsi/Makefile
@@ -168,6 +168,8 @@ scsi_mod-$(CONFIG_PM) += scsi_pm.o
scsi_mod-$(CONFIG_SCSI_DH) += scsi_dh.o
scsi_mod-$(CONFIG_BLK_DEV_BSG) += scsi_bsg.o
+include $(srctree)/drivers/scsi/Makefile.kunit
+
hv_storvsc-y := storvsc_drv.o
sd_mod-objs := sd.o
diff --git a/drivers/scsi/Makefile.kunit b/drivers/scsi/Makefile.kunit
new file mode 100644
index 000000000000..3e98053b2709
--- /dev/null
+++ b/drivers/scsi/Makefile.kunit
@@ -0,0 +1 @@
+obj-$(CONFIG_SCSI_ERROR_TEST) += scsi_error_test.o
diff --git a/drivers/scsi/scsi_error_test.c b/drivers/scsi/scsi_error_test.c
new file mode 100644
index 000000000000..be0d25e1fb57
--- /dev/null
+++ b/drivers/scsi/scsi_error_test.c
@@ -0,0 +1,207 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Google LLC
+ */
+#include <kunit/test.h>
+#include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_driver.h>
+#include "scsi_priv.h"
+
+static struct kunit *kunit_test;
+
+static bool uld_needs_prepare_resubmit(struct scsi_cmnd *cmd)
+{
+ struct request *rq = scsi_cmd_to_rq(cmd);
+
+ return !rq->q->limits.use_zone_write_lock &&
+ blk_rq_is_seq_zoned_write(rq);
+}
+
+static void uld_prepare_resubmit(struct list_head *cmd_list)
+{
+ /* This function must not be called. */
+ KUNIT_EXPECT_TRUE(kunit_test, false);
+}
+
+/*
+ * Verify that .eh_prepare_resubmit() is not called if use_zone_write_lock is
+ * true.
+ */
+static void test_prepare_resubmit1(struct kunit *test)
+{
+ static struct gendisk disk;
+ static struct request_queue q = {
+ .disk = &disk,
+ .limits = {
+ .driver_preserves_write_order = false,
+ .use_zone_write_lock = true,
+ .zoned = BLK_ZONED_HM,
+ }
+ };
+ static struct scsi_driver uld = {
+ .eh_needs_prepare_resubmit = uld_needs_prepare_resubmit,
+ .eh_prepare_resubmit = uld_prepare_resubmit,
+ };
+ static struct scsi_device dev = {
+ .request_queue = &q,
+ .sdev_gendev.driver = &uld.gendrv,
+ };
+ static struct rq_and_cmd {
+ struct request rq;
+ struct scsi_cmnd cmd;
+ } cmd1, cmd2;
+ LIST_HEAD(cmd_list);
+
+ BUILD_BUG_ON(scsi_cmd_to_rq(&cmd1.cmd) != &cmd1.rq);
+
+ disk.queue = &q;
+ cmd1 = (struct rq_and_cmd){
+ .rq = {
+ .q = &q,
+ .cmd_flags = REQ_OP_WRITE,
+ .__sector = 2,
+ },
+ .cmd.device = &dev,
+ };
+ cmd2 = cmd1;
+ cmd2.rq.__sector = 1;
+ list_add_tail(&cmd1.cmd.eh_entry, &cmd_list);
+ list_add_tail(&cmd2.cmd.eh_entry, &cmd_list);
+
+ KUNIT_EXPECT_EQ(test, list_count_nodes(&cmd_list), 2);
+ kunit_test = test;
+ scsi_call_prepare_resubmit(&cmd_list);
+ kunit_test = NULL;
+ KUNIT_EXPECT_EQ(test, list_count_nodes(&cmd_list), 2);
+ KUNIT_EXPECT_PTR_EQ(test, cmd_list.next, &cmd1.cmd.eh_entry);
+ KUNIT_EXPECT_PTR_EQ(test, cmd_list.next->next, &cmd2.cmd.eh_entry);
+}
+
+static struct scsi_driver *uld1, *uld2, *uld3;
+
+static void uld1_prepare_resubmit(struct list_head *cmd_list)
+{
+ struct scsi_cmnd *cmd;
+
+ KUNIT_EXPECT_EQ(kunit_test, list_count_nodes(cmd_list), 2);
+ list_for_each_entry(cmd, cmd_list, eh_entry)
+ KUNIT_EXPECT_PTR_EQ(kunit_test, scsi_cmd_to_driver(cmd), uld1);
+}
+
+static void uld2_prepare_resubmit(struct list_head *cmd_list)
+{
+ struct scsi_cmnd *cmd;
+
+ KUNIT_EXPECT_EQ(kunit_test, list_count_nodes(cmd_list), 2);
+ list_for_each_entry(cmd, cmd_list, eh_entry)
+ KUNIT_EXPECT_PTR_EQ(kunit_test, scsi_cmd_to_driver(cmd), uld2);
+}
+
+static void test_prepare_resubmit2(struct kunit *test)
+{
+ static struct gendisk disk;
+ static struct request_queue q = {
+ .disk = &disk,
+ .limits = {
+ .driver_preserves_write_order = true,
+ .use_zone_write_lock = false,
+ .zoned = BLK_ZONED_HM,
+ }
+ };
+ static struct rq_and_cmd {
+ struct request rq;
+ struct scsi_cmnd cmd;
+ } cmd1, cmd2, cmd3, cmd4, cmd5, cmd6;
+ static struct scsi_device dev1, dev2, dev3;
+ struct scsi_driver *uld;
+ LIST_HEAD(cmd_list);
+
+ BUILD_BUG_ON(scsi_cmd_to_rq(&cmd1.cmd) != &cmd1.rq);
+
+ uld = kzalloc(3 * sizeof(*uld), GFP_KERNEL);
+ uld1 = &uld[0];
+ uld1->eh_needs_prepare_resubmit = uld_needs_prepare_resubmit;
+ uld1->eh_prepare_resubmit = uld1_prepare_resubmit;
+ uld2 = &uld[1];
+ uld2->eh_needs_prepare_resubmit = uld_needs_prepare_resubmit;
+ uld2->eh_prepare_resubmit = uld2_prepare_resubmit;
+ uld3 = &uld[2];
+ disk.queue = &q;
+ dev1.sdev_gendev.driver = &uld1->gendrv;
+ dev1.request_queue = &q;
+ dev2.sdev_gendev.driver = &uld2->gendrv;
+ dev2.request_queue = &q;
+ dev3.sdev_gendev.driver = &uld3->gendrv;
+ dev3.request_queue = &q;
+ cmd1 = (struct rq_and_cmd){
+ .rq = {
+ .q = &q,
+ .cmd_flags = REQ_OP_WRITE,
+ .__sector = 3,
+ },
+ .cmd.device = &dev1,
+ };
+ cmd2 = cmd1;
+ cmd2.rq.__sector = 4;
+ cmd3 = (struct rq_and_cmd){
+ .rq = {
+ .q = &q,
+ .cmd_flags = REQ_OP_WRITE,
+ .__sector = 1,
+ },
+ .cmd.device = &dev2,
+ };
+ cmd4 = cmd3;
+ cmd4.rq.__sector = 2,
+ cmd5 = (struct rq_and_cmd){
+ .rq = {
+ .q = &q,
+ .cmd_flags = REQ_OP_WRITE,
+ .__sector = 5,
+ },
+ .cmd.device = &dev3,
+ };
+ cmd6 = cmd5;
+ cmd6.rq.__sector = 6;
+ list_add_tail(&cmd3.cmd.eh_entry, &cmd_list);
+ list_add_tail(&cmd1.cmd.eh_entry, &cmd_list);
+ list_add_tail(&cmd2.cmd.eh_entry, &cmd_list);
+ list_add_tail(&cmd5.cmd.eh_entry, &cmd_list);
+ list_add_tail(&cmd6.cmd.eh_entry, &cmd_list);
+ list_add_tail(&cmd4.cmd.eh_entry, &cmd_list);
+
+ KUNIT_EXPECT_EQ(test, list_count_nodes(&cmd_list), 6);
+ kunit_test = test;
+ scsi_call_prepare_resubmit(&cmd_list);
+ kunit_test = NULL;
+ KUNIT_EXPECT_EQ(test, list_count_nodes(&cmd_list), 6);
+ KUNIT_EXPECT_TRUE(test, uld1 < uld2);
+ KUNIT_EXPECT_TRUE(test, uld2 < uld3);
+ KUNIT_EXPECT_PTR_EQ(test, cmd_list.next, &cmd1.cmd.eh_entry);
+ KUNIT_EXPECT_PTR_EQ(test, cmd_list.next->next, &cmd2.cmd.eh_entry);
+ KUNIT_EXPECT_PTR_EQ(test, cmd_list.next->next->next,
+ &cmd3.cmd.eh_entry);
+ KUNIT_EXPECT_PTR_EQ(test, cmd_list.next->next->next->next,
+ &cmd4.cmd.eh_entry);
+ KUNIT_EXPECT_PTR_EQ(test, cmd_list.next->next->next->next->next,
+ &cmd5.cmd.eh_entry);
+ KUNIT_EXPECT_PTR_EQ(test, cmd_list.next->next->next->next->next->next,
+ &cmd6.cmd.eh_entry);
+ kfree(uld);
+}
+
+static struct kunit_case prepare_resubmit_test_cases[] = {
+ KUNIT_CASE(test_prepare_resubmit1),
+ KUNIT_CASE(test_prepare_resubmit2),
+ {}
+};
+
+static struct kunit_suite prepare_resubmit_test_suite = {
+ .name = "prepare_resubmit",
+ .test_cases = prepare_resubmit_test_cases,
+};
+kunit_test_suite(prepare_resubmit_test_suite);
+
+MODULE_DESCRIPTION("scsi_call_prepare_resubmit() unit tests");
+MODULE_AUTHOR("Bart Van Assche");
+MODULE_LICENSE("GPL");
^ permalink raw reply related [flat|nested] 31+ messages in thread* [PATCH v11 06/16] scsi: sd: Sort commands by LBA before resubmitting
2023-08-22 19:16 [PATCH v11 00/16] Improve write performance for zoned UFS devices Bart Van Assche
` (4 preceding siblings ...)
2023-08-22 19:17 ` [PATCH v11 05/16] scsi: core: Add unit tests for scsi_call_prepare_resubmit() Bart Van Assche
@ 2023-08-22 19:17 ` Bart Van Assche
2023-08-22 19:17 ` [PATCH v11 07/16] scsi: core: Retry unaligned zoned writes Bart Van Assche
` (9 subsequent siblings)
15 siblings, 0 replies; 31+ messages in thread
From: Bart Van Assche @ 2023-08-22 19:17 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, linux-scsi, Martin K . Petersen, Christoph Hellwig,
Bart Van Assche, Damien Le Moal, Ming Lei, James E.J. Bottomley
Sort SCSI commands by LBA before the SCSI error handler resubmits
these commands. This is necessary when resubmitting zoned writes
(REQ_OP_WRITE) if multiple writes have been queued for a single zone.
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Damien Le Moal <dlemoal@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
drivers/scsi/sd.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 45 insertions(+)
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 3c668cfb146d..d4feac5de17a 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -47,6 +47,7 @@
#include <linux/blkpg.h>
#include <linux/blk-pm.h>
#include <linux/delay.h>
+#include <linux/list_sort.h>
#include <linux/major.h>
#include <linux/mutex.h>
#include <linux/string_helpers.h>
@@ -117,6 +118,8 @@ static void sd_uninit_command(struct scsi_cmnd *SCpnt);
static int sd_done(struct scsi_cmnd *);
static void sd_eh_reset(struct scsi_cmnd *);
static int sd_eh_action(struct scsi_cmnd *, int);
+static bool sd_needs_prepare_resubmit(struct scsi_cmnd *cmd);
+static void sd_prepare_resubmit(struct list_head *cmd_list);
static void sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer);
static void scsi_disk_release(struct device *cdev);
@@ -617,6 +620,8 @@ static struct scsi_driver sd_template = {
.done = sd_done,
.eh_action = sd_eh_action,
.eh_reset = sd_eh_reset,
+ .eh_needs_prepare_resubmit = sd_needs_prepare_resubmit,
+ .eh_prepare_resubmit = sd_prepare_resubmit,
};
/*
@@ -2018,6 +2023,46 @@ static int sd_eh_action(struct scsi_cmnd *scmd, int eh_disp)
return eh_disp;
}
+static bool sd_needs_prepare_resubmit(struct scsi_cmnd *cmd)
+{
+ struct request *rq = scsi_cmd_to_rq(cmd);
+
+ return !rq->q->limits.use_zone_write_lock &&
+ blk_rq_is_seq_zoned_write(rq);
+}
+
+static int sd_cmp_sector(void *priv, const struct list_head *_a,
+ const struct list_head *_b)
+{
+ struct scsi_cmnd *a = list_entry(_a, typeof(*a), eh_entry);
+ struct scsi_cmnd *b = list_entry(_b, typeof(*b), eh_entry);
+ struct request *rq_a = scsi_cmd_to_rq(a);
+ struct request *rq_b = scsi_cmd_to_rq(b);
+ bool use_zwl_a = rq_a->q->limits.use_zone_write_lock;
+ bool use_zwl_b = rq_b->q->limits.use_zone_write_lock;
+
+ /*
+ * Order the commands that need zone write locking after the commands
+ * that do not need zone write locking. Order the commands that do not
+ * need zone write locking by LBA. Do not reorder the commands that
+ * need zone write locking. See also the comment above the list_sort()
+ * definition.
+ */
+ if (use_zwl_a || use_zwl_b)
+ return use_zwl_a > use_zwl_b;
+ return blk_rq_pos(rq_a) > blk_rq_pos(rq_b);
+}
+
+static void sd_prepare_resubmit(struct list_head *cmd_list)
+{
+ /*
+ * Sort pending SCSI commands in starting sector order. This is
+ * important if one of the SCSI devices associated with @shost is a
+ * zoned block device for which zone write locking is disabled.
+ */
+ list_sort(NULL, cmd_list, sd_cmp_sector);
+}
+
static unsigned int sd_completed_bytes(struct scsi_cmnd *scmd)
{
struct request *req = scsi_cmd_to_rq(scmd);
^ permalink raw reply related [flat|nested] 31+ messages in thread* [PATCH v11 07/16] scsi: core: Retry unaligned zoned writes
2023-08-22 19:16 [PATCH v11 00/16] Improve write performance for zoned UFS devices Bart Van Assche
` (5 preceding siblings ...)
2023-08-22 19:17 ` [PATCH v11 06/16] scsi: sd: Sort commands by LBA before resubmitting Bart Van Assche
@ 2023-08-22 19:17 ` Bart Van Assche
2023-08-22 19:17 ` [PATCH v11 08/16] scsi: sd_zbc: Only require an I/O scheduler if needed Bart Van Assche
` (8 subsequent siblings)
15 siblings, 0 replies; 31+ messages in thread
From: Bart Van Assche @ 2023-08-22 19:17 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, linux-scsi, Martin K . Petersen, Christoph Hellwig,
Bart Van Assche, Damien Le Moal, Ming Lei, James E.J. Bottomley
If zoned writes (REQ_OP_WRITE) for a sequential write required zone have
a starting LBA that differs from the write pointer, e.g. because zoned
writes have been reordered, then the storage device will respond with an
UNALIGNED WRITE COMMAND error. Send commands that failed with an
unaligned write error to the SCSI error handler if zone write locking is
disabled. The SCSI error handler will sort SCSI commands per LBA before
resubmitting these.
If zone write locking is disabled, increase the number of retries for
write commands sent to a sequential zone to the maximum number of
outstanding commands because in the worst case the number of times
reordered zoned writes have to be retried is (number of outstanding
writes per sequential zone) - 1.
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
drivers/scsi/scsi_error.c | 16 ++++++++++++++++
drivers/scsi/scsi_lib.c | 1 +
drivers/scsi/sd.c | 6 ++++++
include/scsi/scsi.h | 1 +
4 files changed, 24 insertions(+)
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index c4d817f044a0..e393d78b921b 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -699,6 +699,22 @@ enum scsi_disposition scsi_check_sense(struct scsi_cmnd *scmd)
fallthrough;
case ILLEGAL_REQUEST:
+ /*
+ * Unaligned write command. This may indicate that zoned writes
+ * have been received by the device in the wrong order. If zone
+ * write locking is disabled, retry after all pending commands
+ * have completed.
+ */
+ if (sshdr.asc == 0x21 && sshdr.ascq == 0x04 &&
+ !req->q->limits.use_zone_write_lock &&
+ blk_rq_is_seq_zoned_write(req)) {
+ SCSI_LOG_ERROR_RECOVERY(3,
+ sdev_printk(KERN_INFO, scmd->device,
+ "Retrying unaligned write at LBA %#llx.\n",
+ scsi_get_lba(scmd)));
+ return NEEDS_DELAYED_RETRY;
+ }
+
if (sshdr.asc == 0x20 || /* Invalid command operation code */
sshdr.asc == 0x21 || /* Logical block address out of range */
sshdr.asc == 0x22 || /* Invalid function */
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 59176946ab56..69da8aee13df 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1443,6 +1443,7 @@ static void scsi_complete(struct request *rq)
case ADD_TO_MLQUEUE:
scsi_queue_insert(cmd, SCSI_MLQUEUE_DEVICE_BUSY);
break;
+ case NEEDS_DELAYED_RETRY:
default:
scsi_eh_scmd_add(cmd);
break;
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index d4feac5de17a..e96014caa34c 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -1240,6 +1240,12 @@ static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd)
cmd->transfersize = sdp->sector_size;
cmd->underflow = nr_blocks << 9;
cmd->allowed = sdkp->max_retries;
+ /*
+ * Increase the number of allowed retries for zoned writes if zone
+ * write locking is disabled.
+ */
+ if (!rq->q->limits.use_zone_write_lock && blk_rq_is_seq_zoned_write(rq))
+ cmd->allowed += rq->q->nr_requests;
cmd->sdb.length = nr_blocks * sdp->sector_size;
SCSI_LOG_HLQUEUE(1,
diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h
index ec093594ba53..6600db046227 100644
--- a/include/scsi/scsi.h
+++ b/include/scsi/scsi.h
@@ -93,6 +93,7 @@ static inline int scsi_status_is_check_condition(int status)
* Internal return values.
*/
enum scsi_disposition {
+ NEEDS_DELAYED_RETRY = 0x2000,
NEEDS_RETRY = 0x2001,
SUCCESS = 0x2002,
FAILED = 0x2003,
^ permalink raw reply related [flat|nested] 31+ messages in thread* [PATCH v11 08/16] scsi: sd_zbc: Only require an I/O scheduler if needed
2023-08-22 19:16 [PATCH v11 00/16] Improve write performance for zoned UFS devices Bart Van Assche
` (6 preceding siblings ...)
2023-08-22 19:17 ` [PATCH v11 07/16] scsi: core: Retry unaligned zoned writes Bart Van Assche
@ 2023-08-22 19:17 ` Bart Van Assche
2023-08-22 19:17 ` [PATCH v11 09/16] scsi: scsi_debug: Add the preserves_write_order module parameter Bart Van Assche
` (7 subsequent siblings)
15 siblings, 0 replies; 31+ messages in thread
From: Bart Van Assche @ 2023-08-22 19:17 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, linux-scsi, Martin K . Petersen, Christoph Hellwig,
Bart Van Assche, Damien Le Moal, Ming Lei, James E.J. Bottomley
An I/O scheduler that serializes zoned writes is only needed if the SCSI
LLD does not preserve the write order. Hence only set
ELEVATOR_F_ZBD_SEQ_WRITE if the LLD does not preserve the write order.
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
drivers/scsi/sd_zbc.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index a25215507668..718b31bed878 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -955,7 +955,9 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE])
/* The drive satisfies the kernel restrictions: set it up */
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
- blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE);
+ if (!q->limits.driver_preserves_write_order)
+ blk_queue_required_elevator_features(q,
+ ELEVATOR_F_ZBD_SEQ_WRITE);
if (sdkp->zones_max_open == U32_MAX)
disk_set_max_open_zones(disk, 0);
else
^ permalink raw reply related [flat|nested] 31+ messages in thread* [PATCH v11 09/16] scsi: scsi_debug: Add the preserves_write_order module parameter
2023-08-22 19:16 [PATCH v11 00/16] Improve write performance for zoned UFS devices Bart Van Assche
` (7 preceding siblings ...)
2023-08-22 19:17 ` [PATCH v11 08/16] scsi: sd_zbc: Only require an I/O scheduler if needed Bart Van Assche
@ 2023-08-22 19:17 ` Bart Van Assche
2023-08-22 19:17 ` [PATCH v11 10/16] scsi: scsi_debug: Support injecting unaligned write errors Bart Van Assche
` (6 subsequent siblings)
15 siblings, 0 replies; 31+ messages in thread
From: Bart Van Assche @ 2023-08-22 19:17 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, linux-scsi, Martin K . Petersen, Christoph Hellwig,
Bart Van Assche, Douglas Gilbert, Damien Le Moal, Ming Lei,
James E.J. Bottomley
Zone write locking is not used for zoned devices if the block driver
reports that it preserves the order of write commands. Make it easier to
test not using zone write locking by adding support for setting the
driver_preserves_write_order flag.
Acked-by: Douglas Gilbert <dgilbert@interlog.com>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Damien Le Moal <dlemoal@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
drivers/scsi/scsi_debug.c | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c
index 9c0af50501f9..1ea4925d2c2f 100644
--- a/drivers/scsi/scsi_debug.c
+++ b/drivers/scsi/scsi_debug.c
@@ -832,6 +832,7 @@ static int dix_reads;
static int dif_errors;
/* ZBC global data */
+static bool sdeb_preserves_write_order;
static bool sdeb_zbc_in_use; /* true for host-aware and host-managed disks */
static int sdeb_zbc_zone_cap_mb;
static int sdeb_zbc_zone_size_mb;
@@ -5138,9 +5139,13 @@ static struct sdebug_dev_info *find_build_dev_info(struct scsi_device *sdev)
static int scsi_debug_slave_alloc(struct scsi_device *sdp)
{
+ struct request_queue *q = sdp->request_queue;
+
if (sdebug_verbose)
pr_info("slave_alloc <%u %u %u %llu>\n",
sdp->host->host_no, sdp->channel, sdp->id, sdp->lun);
+ if (sdeb_preserves_write_order)
+ q->limits.driver_preserves_write_order = true;
return 0;
}
@@ -5755,6 +5760,8 @@ module_param_named(statistics, sdebug_statistics, bool, S_IRUGO | S_IWUSR);
module_param_named(strict, sdebug_strict, bool, S_IRUGO | S_IWUSR);
module_param_named(submit_queues, submit_queues, int, S_IRUGO);
module_param_named(poll_queues, poll_queues, int, S_IRUGO);
+module_param_named(preserves_write_order, sdeb_preserves_write_order, bool,
+ S_IRUGO);
module_param_named(tur_ms_to_ready, sdeb_tur_ms_to_ready, int, S_IRUGO);
module_param_named(unmap_alignment, sdebug_unmap_alignment, int, S_IRUGO);
module_param_named(unmap_granularity, sdebug_unmap_granularity, int, S_IRUGO);
@@ -5812,6 +5819,8 @@ MODULE_PARM_DESC(ndelay, "response delay in nanoseconds (def=0 -> ignore)");
MODULE_PARM_DESC(no_lun_0, "no LU number 0 (def=0 -> have lun 0)");
MODULE_PARM_DESC(no_rwlock, "don't protect user data reads+writes (def=0)");
MODULE_PARM_DESC(no_uld, "stop ULD (e.g. sd driver) attaching (def=0))");
+MODULE_PARM_DESC(preserves_write_order,
+ "Whether or not to inform the block layer that this driver preserves the order of WRITE commands (def=0)");
MODULE_PARM_DESC(num_parts, "number of partitions(def=0)");
MODULE_PARM_DESC(num_tgts, "number of targets per host to simulate(def=1)");
MODULE_PARM_DESC(opt_blks, "optimal transfer length in blocks (def=1024)");
^ permalink raw reply related [flat|nested] 31+ messages in thread* [PATCH v11 10/16] scsi: scsi_debug: Support injecting unaligned write errors
2023-08-22 19:16 [PATCH v11 00/16] Improve write performance for zoned UFS devices Bart Van Assche
` (8 preceding siblings ...)
2023-08-22 19:17 ` [PATCH v11 09/16] scsi: scsi_debug: Add the preserves_write_order module parameter Bart Van Assche
@ 2023-08-22 19:17 ` Bart Van Assche
2023-08-22 19:17 ` [PATCH v11 11/16] scsi: ufs: hisi: Rework the code that disables auto-hibernation Bart Van Assche
` (5 subsequent siblings)
15 siblings, 0 replies; 31+ messages in thread
From: Bart Van Assche @ 2023-08-22 19:17 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, linux-scsi, Martin K . Petersen, Christoph Hellwig,
Bart Van Assche, Douglas Gilbert, Damien Le Moal, Ming Lei,
James E.J. Bottomley
Allow user space software, e.g. a blktests test, to inject unaligned
write errors.
Acked-by: Douglas Gilbert <dgilbert@interlog.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
drivers/scsi/scsi_debug.c | 12 +++++++++++-
1 file changed, 11 insertions(+), 1 deletion(-)
diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c
index 1ea4925d2c2f..164e82c218ff 100644
--- a/drivers/scsi/scsi_debug.c
+++ b/drivers/scsi/scsi_debug.c
@@ -181,6 +181,7 @@ static const char *sdebug_version_date = "20210520";
#define SDEBUG_OPT_NO_CDB_NOISE 0x4000
#define SDEBUG_OPT_HOST_BUSY 0x8000
#define SDEBUG_OPT_CMD_ABORT 0x10000
+#define SDEBUG_OPT_UNALIGNED_WRITE 0x20000
#define SDEBUG_OPT_ALL_NOISE (SDEBUG_OPT_NOISE | SDEBUG_OPT_Q_NOISE | \
SDEBUG_OPT_RESET_NOISE)
#define SDEBUG_OPT_ALL_INJECTING (SDEBUG_OPT_RECOVERED_ERR | \
@@ -188,7 +189,8 @@ static const char *sdebug_version_date = "20210520";
SDEBUG_OPT_DIF_ERR | SDEBUG_OPT_DIX_ERR | \
SDEBUG_OPT_SHORT_TRANSFER | \
SDEBUG_OPT_HOST_BUSY | \
- SDEBUG_OPT_CMD_ABORT)
+ SDEBUG_OPT_CMD_ABORT | \
+ SDEBUG_OPT_UNALIGNED_WRITE)
#define SDEBUG_OPT_RECOV_DIF_DIX (SDEBUG_OPT_RECOVERED_ERR | \
SDEBUG_OPT_DIF_ERR | SDEBUG_OPT_DIX_ERR)
@@ -3587,6 +3589,14 @@ static int resp_write_dt0(struct scsi_cmnd *scp, struct sdebug_dev_info *devip)
struct sdeb_store_info *sip = devip2sip(devip, true);
u8 *cmd = scp->cmnd;
+ if (unlikely(sdebug_opts & SDEBUG_OPT_UNALIGNED_WRITE &&
+ atomic_read(&sdeb_inject_pending))) {
+ atomic_set(&sdeb_inject_pending, 0);
+ mk_sense_buffer(scp, ILLEGAL_REQUEST, LBA_OUT_OF_RANGE,
+ UNALIGNED_WRITE_ASCQ);
+ return check_condition_result;
+ }
+
switch (cmd[0]) {
case WRITE_16:
ei_lba = 0;
^ permalink raw reply related [flat|nested] 31+ messages in thread* [PATCH v11 11/16] scsi: ufs: hisi: Rework the code that disables auto-hibernation
2023-08-22 19:16 [PATCH v11 00/16] Improve write performance for zoned UFS devices Bart Van Assche
` (9 preceding siblings ...)
2023-08-22 19:17 ` [PATCH v11 10/16] scsi: scsi_debug: Support injecting unaligned write errors Bart Van Assche
@ 2023-08-22 19:17 ` Bart Van Assche
2023-08-22 19:17 ` [PATCH v11 12/16] scsi: ufs: Rename ufshcd_auto_hibern8_enable() and make it static Bart Van Assche
` (4 subsequent siblings)
15 siblings, 0 replies; 31+ messages in thread
From: Bart Van Assche @ 2023-08-22 19:17 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, linux-scsi, Martin K . Petersen, Christoph Hellwig,
Bart Van Assche, Bao D . Nguyen, Can Guo, Avri Altman,
James E.J. Bottomley, Adrian Hunter, Krzysztof Kozlowski
The host driver link startup callback is called indirectly by
ufshcd_probe_hba(). That function applies the auto-hibernation
settings by writing hba->ahit into the auto-hibernation control
register. Simplify the code for disabling auto-hibernation by
setting hba->ahit instead of writing into the auto-hibernation
control register. This patch is part of an effort to move all
auto-hibernation register changes into the UFSHCI driver core.
Reviewed-by: Bao D. Nguyen <quic_nguyenb@quicinc.com>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Can Guo <quic_cang@quicinc.com>
Cc: Avri Altman <avri.altman@wdc.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
drivers/ufs/host/ufs-hisi.c | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/drivers/ufs/host/ufs-hisi.c b/drivers/ufs/host/ufs-hisi.c
index 5b3060cd0ab8..f2ec687121bb 100644
--- a/drivers/ufs/host/ufs-hisi.c
+++ b/drivers/ufs/host/ufs-hisi.c
@@ -142,7 +142,6 @@ static int ufs_hisi_link_startup_pre_change(struct ufs_hba *hba)
struct ufs_hisi_host *host = ufshcd_get_variant(hba);
int err;
uint32_t value;
- uint32_t reg;
/* Unipro VS_mphy_disable */
ufshcd_dme_set(hba, UIC_ARG_MIB_SEL(0xD0C1, 0x0), 0x1);
@@ -232,9 +231,7 @@ static int ufs_hisi_link_startup_pre_change(struct ufs_hba *hba)
ufshcd_writel(hba, UFS_HCLKDIV_NORMAL_VALUE, UFS_REG_HCLKDIV);
/* disable auto H8 */
- reg = ufshcd_readl(hba, REG_AUTO_HIBERNATE_IDLE_TIMER);
- reg = reg & (~UFS_AHIT_AH8ITV_MASK);
- ufshcd_writel(hba, reg, REG_AUTO_HIBERNATE_IDLE_TIMER);
+ hba->ahit = 0;
/* Unipro PA_Local_TX_LCC_Enable */
ufshcd_disable_host_tx_lcc(hba);
^ permalink raw reply related [flat|nested] 31+ messages in thread* [PATCH v11 12/16] scsi: ufs: Rename ufshcd_auto_hibern8_enable() and make it static
2023-08-22 19:16 [PATCH v11 00/16] Improve write performance for zoned UFS devices Bart Van Assche
` (10 preceding siblings ...)
2023-08-22 19:17 ` [PATCH v11 11/16] scsi: ufs: hisi: Rework the code that disables auto-hibernation Bart Van Assche
@ 2023-08-22 19:17 ` Bart Van Assche
2023-08-22 19:17 ` [PATCH v11 13/16] scsi: ufs: Change the return type of ufshcd_auto_hibern8_update() Bart Van Assche
` (3 subsequent siblings)
15 siblings, 0 replies; 31+ messages in thread
From: Bart Van Assche @ 2023-08-22 19:17 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, linux-scsi, Martin K . Petersen, Christoph Hellwig,
Bart Van Assche, Bao D . Nguyen, Can Guo, Avri Altman,
James E.J. Bottomley, Stanley Chu, Bean Huo, Asutosh Das,
Arthur Simchaev, Manivannan Sadhasivam, Po-Wen Kao, Eric Biggers,
Keoseong Park, Daniil Lunev
Rename ufshcd_auto_hibern8_enable() into ufshcd_configure_auto_hibern8()
since this function can enable or disable auto-hibernation. Since
ufshcd_auto_hibern8_enable() is only used inside the UFSHCI driver core,
declare it static. Additionally, move the definition of this function to
just before its first caller.
Suggested-by: Bao D. Nguyen <quic_nguyenb@quicinc.com>
Reviewed-by: Bao D. Nguyen <quic_nguyenb@quicinc.com>
Reviewed-by: Can Guo <quic_cang@quicinc.com>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Avri Altman <avri.altman@wdc.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
drivers/ufs/core/ufshcd.c | 24 +++++++++++-------------
include/ufs/ufshcd.h | 1 -
2 files changed, 11 insertions(+), 14 deletions(-)
diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index 129446775796..f1bba459b46f 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -4337,6 +4337,14 @@ int ufshcd_uic_hibern8_exit(struct ufs_hba *hba)
}
EXPORT_SYMBOL_GPL(ufshcd_uic_hibern8_exit);
+static void ufshcd_configure_auto_hibern8(struct ufs_hba *hba)
+{
+ if (!ufshcd_is_auto_hibern8_supported(hba))
+ return;
+
+ ufshcd_writel(hba, hba->ahit, REG_AUTO_HIBERNATE_IDLE_TIMER);
+}
+
void ufshcd_auto_hibern8_update(struct ufs_hba *hba, u32 ahit)
{
unsigned long flags;
@@ -4356,21 +4364,13 @@ void ufshcd_auto_hibern8_update(struct ufs_hba *hba, u32 ahit)
!pm_runtime_suspended(&hba->ufs_device_wlun->sdev_gendev)) {
ufshcd_rpm_get_sync(hba);
ufshcd_hold(hba);
- ufshcd_auto_hibern8_enable(hba);
+ ufshcd_configure_auto_hibern8(hba);
ufshcd_release(hba);
ufshcd_rpm_put_sync(hba);
}
}
EXPORT_SYMBOL_GPL(ufshcd_auto_hibern8_update);
-void ufshcd_auto_hibern8_enable(struct ufs_hba *hba)
-{
- if (!ufshcd_is_auto_hibern8_supported(hba))
- return;
-
- ufshcd_writel(hba, hba->ahit, REG_AUTO_HIBERNATE_IDLE_TIMER);
-}
-
/**
* ufshcd_init_pwr_info - setting the POR (power on reset)
* values in hba power info
@@ -8815,8 +8815,7 @@ static int ufshcd_probe_hba(struct ufs_hba *hba, bool init_dev_params)
if (hba->ee_usr_mask)
ufshcd_write_ee_control(hba);
- /* Enable Auto-Hibernate if configured */
- ufshcd_auto_hibern8_enable(hba);
+ ufshcd_configure_auto_hibern8(hba);
ufshpb_toggle_state(hba, HPB_RESET, HPB_PRESENT);
out:
@@ -9809,8 +9808,7 @@ static int __ufshcd_wl_resume(struct ufs_hba *hba, enum ufs_pm_op pm_op)
cancel_delayed_work(&hba->rpm_dev_flush_recheck_work);
}
- /* Enable Auto-Hibernate if configured */
- ufshcd_auto_hibern8_enable(hba);
+ ufshcd_configure_auto_hibern8(hba);
ufshpb_resume(hba);
goto out;
diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h
index 6dc11fa0ebb1..040d66d99912 100644
--- a/include/ufs/ufshcd.h
+++ b/include/ufs/ufshcd.h
@@ -1363,7 +1363,6 @@ static inline int ufshcd_disable_host_tx_lcc(struct ufs_hba *hba)
return ufshcd_dme_set(hba, UIC_ARG_MIB(PA_LOCAL_TX_LCC_ENABLE), 0);
}
-void ufshcd_auto_hibern8_enable(struct ufs_hba *hba);
void ufshcd_auto_hibern8_update(struct ufs_hba *hba, u32 ahit);
void ufshcd_fixup_dev_quirks(struct ufs_hba *hba,
const struct ufs_dev_quirk *fixups);
^ permalink raw reply related [flat|nested] 31+ messages in thread* [PATCH v11 13/16] scsi: ufs: Change the return type of ufshcd_auto_hibern8_update()
2023-08-22 19:16 [PATCH v11 00/16] Improve write performance for zoned UFS devices Bart Van Assche
` (11 preceding siblings ...)
2023-08-22 19:17 ` [PATCH v11 12/16] scsi: ufs: Rename ufshcd_auto_hibern8_enable() and make it static Bart Van Assche
@ 2023-08-22 19:17 ` Bart Van Assche
2023-08-22 19:17 ` [PATCH v11 14/16] scsi: ufs: Simplify ufshcd_auto_hibern8_update() Bart Van Assche
` (2 subsequent siblings)
15 siblings, 0 replies; 31+ messages in thread
From: Bart Van Assche @ 2023-08-22 19:17 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, linux-scsi, Martin K . Petersen, Christoph Hellwig,
Bart Van Assche, Bao D . Nguyen, Can Guo, Peter Wang, Avri Altman,
James E.J. Bottomley, Matthias Brugger, Bean Huo, Jinyoung Choi,
Lu Hongfei, Daniil Lunev, Stanley Chu, Manivannan Sadhasivam,
Asutosh Das, Arthur Simchaev, zhanghui, Po-Wen Kao, Eric Biggers,
Keoseong Park
A later patch will introduce an error path in ufshcd_auto_hibern8_update().
Change the return type of that function before introducing calls to that
function in the host drivers such that the host drivers only have to be
modified once.
Reviewed-by: Bao D. Nguyen <quic_nguyenb@quicinc.com>
Reviewed-by: Can Guo <quic_cang@quicinc.com>
Reviewed-by: Peter Wang <peter.wang@mediatek.com>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Avri Altman <avri.altman@wdc.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
drivers/ufs/core/ufs-sysfs.c | 2 +-
drivers/ufs/core/ufshcd-priv.h | 1 -
drivers/ufs/core/ufshcd.c | 6 ++++--
include/ufs/ufshcd.h | 2 +-
4 files changed, 6 insertions(+), 5 deletions(-)
diff --git a/drivers/ufs/core/ufs-sysfs.c b/drivers/ufs/core/ufs-sysfs.c
index 6c72075750dd..a693dea1bd18 100644
--- a/drivers/ufs/core/ufs-sysfs.c
+++ b/drivers/ufs/core/ufs-sysfs.c
@@ -203,7 +203,7 @@ static ssize_t auto_hibern8_store(struct device *dev,
goto out;
}
- ufshcd_auto_hibern8_update(hba, ufshcd_us_to_ahit(timer));
+ ret = ufshcd_auto_hibern8_update(hba, ufshcd_us_to_ahit(timer));
out:
up(&hba->host_sem);
diff --git a/drivers/ufs/core/ufshcd-priv.h b/drivers/ufs/core/ufshcd-priv.h
index 0f3bd943b58b..a2b74fbc2056 100644
--- a/drivers/ufs/core/ufshcd-priv.h
+++ b/drivers/ufs/core/ufshcd-priv.h
@@ -60,7 +60,6 @@ int ufshcd_query_attr(struct ufs_hba *hba, enum query_opcode opcode,
enum attr_idn idn, u8 index, u8 selector, u32 *attr_val);
int ufshcd_query_flag(struct ufs_hba *hba, enum query_opcode opcode,
enum flag_idn idn, u8 index, bool *flag_res);
-void ufshcd_auto_hibern8_update(struct ufs_hba *hba, u32 ahit);
void ufshcd_compl_one_cqe(struct ufs_hba *hba, int task_tag,
struct cq_entry *cqe);
int ufshcd_mcq_init(struct ufs_hba *hba);
diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index f1bba459b46f..a08924972b20 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -4345,13 +4345,13 @@ static void ufshcd_configure_auto_hibern8(struct ufs_hba *hba)
ufshcd_writel(hba, hba->ahit, REG_AUTO_HIBERNATE_IDLE_TIMER);
}
-void ufshcd_auto_hibern8_update(struct ufs_hba *hba, u32 ahit)
+int ufshcd_auto_hibern8_update(struct ufs_hba *hba, u32 ahit)
{
unsigned long flags;
bool update = false;
if (!ufshcd_is_auto_hibern8_supported(hba))
- return;
+ return 0;
spin_lock_irqsave(hba->host->host_lock, flags);
if (hba->ahit != ahit) {
@@ -4368,6 +4368,8 @@ void ufshcd_auto_hibern8_update(struct ufs_hba *hba, u32 ahit)
ufshcd_release(hba);
ufshcd_rpm_put_sync(hba);
}
+
+ return 0;
}
EXPORT_SYMBOL_GPL(ufshcd_auto_hibern8_update);
diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h
index 040d66d99912..7ae071a6811c 100644
--- a/include/ufs/ufshcd.h
+++ b/include/ufs/ufshcd.h
@@ -1363,7 +1363,7 @@ static inline int ufshcd_disable_host_tx_lcc(struct ufs_hba *hba)
return ufshcd_dme_set(hba, UIC_ARG_MIB(PA_LOCAL_TX_LCC_ENABLE), 0);
}
-void ufshcd_auto_hibern8_update(struct ufs_hba *hba, u32 ahit);
+int ufshcd_auto_hibern8_update(struct ufs_hba *hba, u32 ahit);
void ufshcd_fixup_dev_quirks(struct ufs_hba *hba,
const struct ufs_dev_quirk *fixups);
#define SD_ASCII_STD true
^ permalink raw reply related [flat|nested] 31+ messages in thread* [PATCH v11 14/16] scsi: ufs: Simplify ufshcd_auto_hibern8_update()
2023-08-22 19:16 [PATCH v11 00/16] Improve write performance for zoned UFS devices Bart Van Assche
` (12 preceding siblings ...)
2023-08-22 19:17 ` [PATCH v11 13/16] scsi: ufs: Change the return type of ufshcd_auto_hibern8_update() Bart Van Assche
@ 2023-08-22 19:17 ` Bart Van Assche
2023-08-22 19:17 ` [PATCH v11 15/16] scsi: ufs: Forbid auto-hibernation without I/O scheduler Bart Van Assche
2023-08-22 19:17 ` [PATCH v11 16/16] scsi: ufs: Inform the block layer about write ordering Bart Van Assche
15 siblings, 0 replies; 31+ messages in thread
From: Bart Van Assche @ 2023-08-22 19:17 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, linux-scsi, Martin K . Petersen, Christoph Hellwig,
Bart Van Assche, Bao D . Nguyen, Can Guo, Avri Altman,
James E.J. Bottomley, Stanley Chu, Bean Huo, Asutosh Das,
Arthur Simchaev
Calls to ufshcd_auto_hibern8_update() are already serialized: this
function is either called if user space software is not running
(preparing to suspend) or from a single sysfs store callback function.
Kernfs serializes sysfs .store() callbacks.
No functionality is changed. This patch makes the next patch in this
series easier to read.
Reviewed-by: Bao D. Nguyen <quic_nguyenb@quicinc.com>
Reviewed-by: Can Guo <quic_cang@quicinc.com>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Avri Altman <avri.altman@wdc.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
drivers/ufs/core/ufshcd.c | 16 ++++------------
1 file changed, 4 insertions(+), 12 deletions(-)
diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index a08924972b20..68bf8e94eee6 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -4347,21 +4347,13 @@ static void ufshcd_configure_auto_hibern8(struct ufs_hba *hba)
int ufshcd_auto_hibern8_update(struct ufs_hba *hba, u32 ahit)
{
- unsigned long flags;
- bool update = false;
+ const u32 cur_ahit = READ_ONCE(hba->ahit);
- if (!ufshcd_is_auto_hibern8_supported(hba))
+ if (!ufshcd_is_auto_hibern8_supported(hba) || cur_ahit == ahit)
return 0;
- spin_lock_irqsave(hba->host->host_lock, flags);
- if (hba->ahit != ahit) {
- hba->ahit = ahit;
- update = true;
- }
- spin_unlock_irqrestore(hba->host->host_lock, flags);
-
- if (update &&
- !pm_runtime_suspended(&hba->ufs_device_wlun->sdev_gendev)) {
+ WRITE_ONCE(hba->ahit, ahit);
+ if (!pm_runtime_suspended(&hba->ufs_device_wlun->sdev_gendev)) {
ufshcd_rpm_get_sync(hba);
ufshcd_hold(hba);
ufshcd_configure_auto_hibern8(hba);
^ permalink raw reply related [flat|nested] 31+ messages in thread* [PATCH v11 15/16] scsi: ufs: Forbid auto-hibernation without I/O scheduler
2023-08-22 19:16 [PATCH v11 00/16] Improve write performance for zoned UFS devices Bart Van Assche
` (13 preceding siblings ...)
2023-08-22 19:17 ` [PATCH v11 14/16] scsi: ufs: Simplify ufshcd_auto_hibern8_update() Bart Van Assche
@ 2023-08-22 19:17 ` Bart Van Assche
2023-08-22 19:17 ` [PATCH v11 16/16] scsi: ufs: Inform the block layer about write ordering Bart Van Assche
15 siblings, 0 replies; 31+ messages in thread
From: Bart Van Assche @ 2023-08-22 19:17 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, linux-scsi, Martin K . Petersen, Christoph Hellwig,
Bart Van Assche, Bao D . Nguyen, Can Guo, Avri Altman,
James E.J. Bottomley, Stanley Chu, Bean Huo, Asutosh Das,
Arthur Simchaev
UFSHCI 3.0 controllers do not preserve the write order if auto-hibernation
is enabled. If the write order is not preserved, an I/O scheduler is
required to serialize zoned writes. Hence do not allow auto-hibernation
to be enabled without I/O scheduler if a zoned logical unit is present
and if the controller is operating in legacy mode. This patch has been
tested with the following shell script:
show_ah8() {
echo -n "auto_hibern8: "
adb shell "cat /sys/devices/platform/13200000.ufs/auto_hibern8"
}
set_ah8() {
local rc
adb shell "echo $1 > /sys/devices/platform/13200000.ufs/auto_hibern8"
rc=$?
show_ah8
return $rc
}
set_iosched() {
adb shell "echo $1 >/sys/class/block/$zoned_bdev/queue/scheduler &&
echo -n 'I/O scheduler: ' &&
cat /sys/class/block/sde/queue/scheduler"
}
adb root
zoned_bdev=$(adb shell grep -lvw 0 /sys/class/block/sd*/queue/chunk_sectors |&
sed 's|/sys/class/block/||g;s|/queue/chunk_sectors||g')
[ -n "$zoned_bdev" ]
show_ah8
set_ah8 0
set_iosched none
if set_ah8 150000; then
echo "Error: enabled AH8 without I/O scheduler"
fi
set_iosched mq-deadline
set_ah8 150000
Reviewed-by: Bao D. Nguyen <quic_nguyenb@quicinc.com>
Reviewed-by: Can Guo <quic_cang@quicinc.com>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Avri Altman <avri.altman@wdc.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
drivers/ufs/core/ufshcd.c | 58 +++++++++++++++++++++++++++++++++++++++
1 file changed, 58 insertions(+)
diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index 68bf8e94eee6..c28a362b5b99 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -4337,6 +4337,30 @@ int ufshcd_uic_hibern8_exit(struct ufs_hba *hba)
}
EXPORT_SYMBOL_GPL(ufshcd_uic_hibern8_exit);
+static int ufshcd_update_preserves_write_order(struct ufs_hba *hba,
+ bool preserves_write_order)
+{
+ struct scsi_device *sdev;
+
+ if (!preserves_write_order) {
+ shost_for_each_device(sdev, hba->host) {
+ struct request_queue *q = sdev->request_queue;
+
+ /*
+ * Refuse to enable auto-hibernation if no I/O scheduler
+ * is present. This code does not check whether the
+ * attached I/O scheduler serializes zoned writes
+ * (ELEVATOR_F_ZBD_SEQ_WRITE) because this cannot be
+ * checked from outside the block layer core.
+ */
+ if (blk_queue_is_zoned(q) && !q->elevator)
+ return -EPERM;
+ }
+ }
+
+ return 0;
+}
+
static void ufshcd_configure_auto_hibern8(struct ufs_hba *hba)
{
if (!ufshcd_is_auto_hibern8_supported(hba))
@@ -4345,13 +4369,40 @@ static void ufshcd_configure_auto_hibern8(struct ufs_hba *hba)
ufshcd_writel(hba, hba->ahit, REG_AUTO_HIBERNATE_IDLE_TIMER);
}
+/**
+ * ufshcd_auto_hibern8_update() - Modify the auto-hibernation control register
+ * @hba: per-adapter instance
+ * @ahit: New auto-hibernate settings. Includes the scale and the value of the
+ * auto-hibernation timer. See also the UFSHCI_AHIBERN8_TIMER_MASK and
+ * UFSHCI_AHIBERN8_SCALE_MASK constants.
+ *
+ * A note: UFSHCI controllers do not preserve the command order in legacy mode
+ * if auto-hibernation is enabled. If the command order is not preserved, an
+ * I/O scheduler that serializes zoned writes (mq-deadline) is required if a
+ * zoned logical unit is present. Enabling auto-hibernation without attaching
+ * the mq-deadline scheduler first may cause unaligned write errors for the
+ * zoned logical unit if a zoned logical unit is present.
+ */
int ufshcd_auto_hibern8_update(struct ufs_hba *hba, u32 ahit)
{
const u32 cur_ahit = READ_ONCE(hba->ahit);
+ bool prev_state, new_state;
+ int ret;
if (!ufshcd_is_auto_hibern8_supported(hba) || cur_ahit == ahit)
return 0;
+ prev_state = FIELD_GET(UFSHCI_AHIBERN8_TIMER_MASK, cur_ahit);
+ new_state = FIELD_GET(UFSHCI_AHIBERN8_TIMER_MASK, ahit);
+
+ if (!is_mcq_enabled(hba) && !prev_state && new_state) {
+ /*
+ * Auto-hibernation will be enabled for legacy UFSHCI mode.
+ */
+ ret = ufshcd_update_preserves_write_order(hba, false);
+ if (ret)
+ return ret;
+ }
WRITE_ONCE(hba->ahit, ahit);
if (!pm_runtime_suspended(&hba->ufs_device_wlun->sdev_gendev)) {
ufshcd_rpm_get_sync(hba);
@@ -4360,6 +4411,13 @@ int ufshcd_auto_hibern8_update(struct ufs_hba *hba, u32 ahit)
ufshcd_release(hba);
ufshcd_rpm_put_sync(hba);
}
+ if (!is_mcq_enabled(hba) && prev_state && !new_state) {
+ /*
+ * Auto-hibernation has been disabled.
+ */
+ ret = ufshcd_update_preserves_write_order(hba, true);
+ WARN_ON_ONCE(ret);
+ }
return 0;
}
^ permalink raw reply related [flat|nested] 31+ messages in thread* [PATCH v11 16/16] scsi: ufs: Inform the block layer about write ordering
2023-08-22 19:16 [PATCH v11 00/16] Improve write performance for zoned UFS devices Bart Van Assche
` (14 preceding siblings ...)
2023-08-22 19:17 ` [PATCH v11 15/16] scsi: ufs: Forbid auto-hibernation without I/O scheduler Bart Van Assche
@ 2023-08-22 19:17 ` Bart Van Assche
15 siblings, 0 replies; 31+ messages in thread
From: Bart Van Assche @ 2023-08-22 19:17 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, linux-scsi, Martin K . Petersen, Christoph Hellwig,
Bart Van Assche, Bao D . Nguyen, Can Guo, Avri Altman,
James E.J. Bottomley, Stanley Chu, Bean Huo, Asutosh Das,
Arthur Simchaev
From the UFSHCI 4.0 specification, about the legacy (single queue) mode:
"The host controller always process transfer requests in-order according
to the order submitted to the list. In case of multiple commands with
single doorbell register ringing (batch mode), The dispatch order for
these transfer requests by host controller will base on their index in
the List. A transfer request with lower index value will be executed
before a transfer request with higher index value."
From the UFSHCI 4.0 specification, about the MCQ mode:
"Command Submission
1. Host SW writes an Entry to SQ
2. Host SW updates SQ doorbell tail pointer
Command Processing
3. After fetching the Entry, Host Controller updates SQ doorbell head
pointer
4. Host controller sends COMMAND UPIU to UFS device"
In other words, for both legacy and MCQ mode, UFS controllers are
required to forward commands to the UFS device in the order these
commands have been received from the host.
Notes:
- For legacy mode this is only correct if the host submits one
command at a time. The UFS driver does this.
- Also in legacy mode, the command order is not preserved if
auto-hibernation is enabled in the UFS controller. Hence, enable
zone write locking if auto-hibernation is enabled.
This patch improves performance as follows on my test setup:
- With the mq-deadline scheduler: 2.5x more IOPS for small writes.
- When not using an I/O scheduler compared to using mq-deadline with
zone locking: 4x more IOPS for small writes.
Reviewed-by: Bao D. Nguyen <quic_nguyenb@quicinc.com>
Reviewed-by: Can Guo <quic_cang@quicinc.com>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Avri Altman <avri.altman@wdc.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
drivers/ufs/core/ufshcd.c | 22 ++++++++++++++++++++--
1 file changed, 20 insertions(+), 2 deletions(-)
diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index c28a362b5b99..a685058d4943 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -4357,6 +4357,19 @@ static int ufshcd_update_preserves_write_order(struct ufs_hba *hba,
return -EPERM;
}
}
+ shost_for_each_device(sdev, hba->host)
+ blk_freeze_queue_start(sdev->request_queue);
+ shost_for_each_device(sdev, hba->host) {
+ struct request_queue *q = sdev->request_queue;
+
+ blk_mq_freeze_queue_wait(q);
+ q->limits.driver_preserves_write_order = preserves_write_order;
+ blk_queue_required_elevator_features(q,
+ preserves_write_order ? 0 : ELEVATOR_F_ZBD_SEQ_WRITE);
+ if (q->disk)
+ disk_set_zoned(q->disk, q->limits.zoned);
+ blk_mq_unfreeze_queue(q);
+ }
return 0;
}
@@ -4397,7 +4410,8 @@ int ufshcd_auto_hibern8_update(struct ufs_hba *hba, u32 ahit)
if (!is_mcq_enabled(hba) && !prev_state && new_state) {
/*
- * Auto-hibernation will be enabled for legacy UFSHCI mode.
+ * Auto-hibernation will be enabled for legacy UFSHCI mode. Tell
+ * the block layer that write requests may be reordered.
*/
ret = ufshcd_update_preserves_write_order(hba, false);
if (ret)
@@ -4413,7 +4427,8 @@ int ufshcd_auto_hibern8_update(struct ufs_hba *hba, u32 ahit)
}
if (!is_mcq_enabled(hba) && prev_state && !new_state) {
/*
- * Auto-hibernation has been disabled.
+ * Auto-hibernation has been disabled. Tell the block layer that
+ * the order of write requests is preserved.
*/
ret = ufshcd_update_preserves_write_order(hba, true);
WARN_ON_ONCE(ret);
@@ -5191,6 +5206,9 @@ static int ufshcd_slave_configure(struct scsi_device *sdev)
ufshcd_hpb_configure(hba, sdev);
+ q->limits.driver_preserves_write_order =
+ !ufshcd_is_auto_hibern8_supported(hba) ||
+ FIELD_GET(UFSHCI_AHIBERN8_TIMER_MASK, hba->ahit) == 0;
blk_queue_update_dma_pad(q, PRDT_DATA_BYTE_COUNT_PAD - 1);
if (hba->quirks & UFSHCD_QUIRK_4KB_DMA_ALIGNMENT)
blk_queue_update_dma_alignment(q, SZ_4K - 1);
^ permalink raw reply related [flat|nested] 31+ messages in thread