public inbox for linux-block@vger.kernel.org
* [PATCH v2 0/4] Improve I/O priority support in mq-deadline for zoned writes
@ 2023-12-18 21:13 Bart Van Assche
  2023-12-18 21:13 ` [PATCH v2 1/4] block/mq-deadline: Rename dd_rq_ioclass() and change its return type Bart Van Assche
                   ` (3 more replies)
  0 siblings, 4 replies; 12+ messages in thread
From: Bart Van Assche @ 2023-12-18 21:13 UTC (permalink / raw)
  To: Jens Axboe
  Cc: linux-block, Damien Le Moal, Christoph Hellwig, Bart Van Assche

Hi Jens,

This patch series prevents the mq-deadline I/O scheduler from submitting
zoned writes out of order when different writes for the same zone have
different I/O priorities. Please consider these patches for the next merge
window.

Thanks,

Bart.

Changes compared to v1:
 - Changed the approach from ignoring the I/O priority of zoned writes to using
   the I/O priority of pending zoned writes.
 - Included two unit tests.

Bart Van Assche (4):
  block/mq-deadline: Rename dd_rq_ioclass() and change its return type
  block/mq-deadline: Introduce dd_bio_ioprio()
  block/mq-deadline: Introduce deadline_first_rq_past_pos()
  block/mq-deadline: Prevent zoned write reordering due to I/O
    prioritization

 block/Kconfig.iosched    |   5 ++
 block/mq-deadline.c      | 135 +++++++++++++++++++++++-------
 block/mq-deadline_test.c | 175 +++++++++++++++++++++++++++++++++++++++
 include/linux/blk-mq.h   |  17 ++++
 4 files changed, 300 insertions(+), 32 deletions(-)
 create mode 100644 block/mq-deadline_test.c


^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH v2 1/4] block/mq-deadline: Rename dd_rq_ioclass() and change its return type
  2023-12-18 21:13 [PATCH v2 0/4] Improve I/O priority support in mq-deadline for zoned writes Bart Van Assche
@ 2023-12-18 21:13 ` Bart Van Assche
  2023-12-18 21:13 ` [PATCH v2 2/4] block/mq-deadline: Introduce dd_bio_ioprio() Bart Van Assche
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 12+ messages in thread
From: Bart Van Assche @ 2023-12-18 21:13 UTC (permalink / raw)
  To: Jens Axboe
  Cc: linux-block, Damien Le Moal, Christoph Hellwig, Bart Van Assche

All dd_rq_ioclass() callers convert the return value into type enum
dd_prio. Move this conversion into dd_rq_ioclass(), rename that function
into dd_rq_ioprio() and move its definition below deadline_from_pos().
Introduce an additional caller. No functionality is changed by this patch.

Cc: Damien Le Moal <dlemoal@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
 block/mq-deadline.c | 35 ++++++++++++++---------------------
 1 file changed, 14 insertions(+), 21 deletions(-)

diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index f958e79277b8..8e5f71775cf1 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -119,15 +119,6 @@ deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq)
 	return &per_prio->sort_list[rq_data_dir(rq)];
 }
 
-/*
- * Returns the I/O priority class (IOPRIO_CLASS_*) that has been assigned to a
- * request.
- */
-static u8 dd_rq_ioclass(struct request *rq)
-{
-	return IOPRIO_PRIO_CLASS(req_get_ioprio(rq));
-}
-
 /*
  * get the request before `rq' in sector-sorted order
  */
@@ -190,6 +181,15 @@ static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio,
 	return res;
 }
 
+/*
+ * Return the I/O priority (enum dd_prio) that has been assigned to a
+ * request.
+ */
+static enum dd_prio dd_rq_ioprio(struct request *rq)
+{
+	return ioprio_class_to_prio[IOPRIO_PRIO_CLASS(req_get_ioprio(rq))];
+}
+
 static void
 deadline_add_rq_rb(struct dd_per_prio *per_prio, struct request *rq)
 {
@@ -228,8 +228,7 @@ static void dd_request_merged(struct request_queue *q, struct request *req,
 			      enum elv_merge type)
 {
 	struct deadline_data *dd = q->elevator->elevator_data;
-	const u8 ioprio_class = dd_rq_ioclass(req);
-	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
+	const enum dd_prio prio = dd_rq_ioprio(req);
 	struct dd_per_prio *per_prio = &dd->per_prio[prio];
 
 	/*
@@ -248,8 +247,7 @@ static void dd_merged_requests(struct request_queue *q, struct request *req,
 			       struct request *next)
 {
 	struct deadline_data *dd = q->elevator->elevator_data;
-	const u8 ioprio_class = dd_rq_ioclass(next);
-	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
+	const enum dd_prio prio = dd_rq_ioprio(next);
 
 	lockdep_assert_held(&dd->lock);
 
@@ -447,7 +445,6 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
 	struct request *rq, *next_rq;
 	enum dd_data_dir data_dir;
 	enum dd_prio prio;
-	u8 ioprio_class;
 
 	lockdep_assert_held(&dd->lock);
 
@@ -545,8 +542,7 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
 	dd->batching++;
 	deadline_move_request(dd, per_prio, rq);
 done:
-	ioprio_class = dd_rq_ioclass(rq);
-	prio = ioprio_class_to_prio[ioprio_class];
+	prio = dd_rq_ioprio(rq);
 	dd->per_prio[prio].latest_pos[data_dir] = blk_rq_pos(rq);
 	dd->per_prio[prio].stats.dispatched++;
 	/*
@@ -798,8 +794,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 	struct request_queue *q = hctx->queue;
 	struct deadline_data *dd = q->elevator->elevator_data;
 	const enum dd_data_dir data_dir = rq_data_dir(rq);
-	u16 ioprio = req_get_ioprio(rq);
-	u8 ioprio_class = IOPRIO_PRIO_CLASS(ioprio);
 	struct dd_per_prio *per_prio;
 	enum dd_prio prio;
 
@@ -811,7 +805,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 	 */
 	blk_req_zone_write_unlock(rq);
 
-	prio = ioprio_class_to_prio[ioprio_class];
+	prio = dd_rq_ioprio(rq);
 	per_prio = &dd->per_prio[prio];
 	if (!rq->elv.priv[0]) {
 		per_prio->stats.inserted++;
@@ -920,8 +914,7 @@ static void dd_finish_request(struct request *rq)
 {
 	struct request_queue *q = rq->q;
 	struct deadline_data *dd = q->elevator->elevator_data;
-	const u8 ioprio_class = dd_rq_ioclass(rq);
-	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
+	const enum dd_prio prio = dd_rq_ioprio(rq);
 	struct dd_per_prio *per_prio = &dd->per_prio[prio];
 
 	/*

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH v2 2/4] block/mq-deadline: Introduce dd_bio_ioprio()
  2023-12-18 21:13 [PATCH v2 0/4] Improve I/O priority support in mq-deadline for zoned writes Bart Van Assche
  2023-12-18 21:13 ` [PATCH v2 1/4] block/mq-deadline: Rename dd_rq_ioclass() and change its return type Bart Van Assche
@ 2023-12-18 21:13 ` Bart Van Assche
  2023-12-18 21:13 ` [PATCH v2 3/4] block/mq-deadline: Introduce deadline_first_rq_past_pos() Bart Van Assche
  2023-12-18 21:13 ` [PATCH v2 4/4] block/mq-deadline: Prevent zoned write reordering due to I/O prioritization Bart Van Assche
  3 siblings, 0 replies; 12+ messages in thread
From: Bart Van Assche @ 2023-12-18 21:13 UTC (permalink / raw)
  To: Jens Axboe
  Cc: linux-block, Damien Le Moal, Christoph Hellwig, Bart Van Assche

Prepare for ignoring the I/O priority of certain bios.

Cc: Damien Le Moal <dlemoal@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
 block/mq-deadline.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 8e5f71775cf1..a5341a37c840 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -190,6 +190,11 @@ static enum dd_prio dd_rq_ioprio(struct request *rq)
 	return ioprio_class_to_prio[IOPRIO_PRIO_CLASS(req_get_ioprio(rq))];
 }
 
+static enum dd_prio dd_bio_ioprio(struct bio *bio)
+{
+	return ioprio_class_to_prio[IOPRIO_PRIO_CLASS(bio->bi_ioprio)];
+}
+
 static void
 deadline_add_rq_rb(struct dd_per_prio *per_prio, struct request *rq)
 {
@@ -740,8 +745,7 @@ static int dd_request_merge(struct request_queue *q, struct request **rq,
 			    struct bio *bio)
 {
 	struct deadline_data *dd = q->elevator->elevator_data;
-	const u8 ioprio_class = IOPRIO_PRIO_CLASS(bio->bi_ioprio);
-	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
+	const enum dd_prio prio = dd_bio_ioprio(bio);
 	struct dd_per_prio *per_prio = &dd->per_prio[prio];
 	sector_t sector = bio_end_sector(bio);
 	struct request *__rq;

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH v2 3/4] block/mq-deadline: Introduce deadline_first_rq_past_pos()
  2023-12-18 21:13 [PATCH v2 0/4] Improve I/O priority support in mq-deadline for zoned writes Bart Van Assche
  2023-12-18 21:13 ` [PATCH v2 1/4] block/mq-deadline: Rename dd_rq_ioclass() and change its return type Bart Van Assche
  2023-12-18 21:13 ` [PATCH v2 2/4] block/mq-deadline: Introduce dd_bio_ioprio() Bart Van Assche
@ 2023-12-18 21:13 ` Bart Van Assche
  2023-12-18 21:13 ` [PATCH v2 4/4] block/mq-deadline: Prevent zoned write reordering due to I/O prioritization Bart Van Assche
  3 siblings, 0 replies; 12+ messages in thread
From: Bart Van Assche @ 2023-12-18 21:13 UTC (permalink / raw)
  To: Jens Axboe
  Cc: linux-block, Damien Le Moal, Christoph Hellwig, Bart Van Assche

Prepare for introducing a second caller.

Cc: Damien Le Moal <dlemoal@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
 block/mq-deadline.c | 35 ++++++++++++++++++++++++-----------
 1 file changed, 24 insertions(+), 11 deletions(-)

diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index a5341a37c840..c0f92cc729ca 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -147,6 +147,28 @@ deadline_latter_request(struct request *rq)
 	return NULL;
 }
 
+/*
+ * Return the first request with the given priority and data direction for
+ * which blk_rq_pos() >= @pos.
+ */
+static struct request *deadline_first_rq_past_pos(struct dd_per_prio *per_prio,
+					enum dd_data_dir data_dir, sector_t pos)
+{
+	struct rb_node *node = per_prio->sort_list[data_dir].rb_node;
+	struct request *rq, *res = NULL;
+
+	while (node) {
+		rq = rb_entry_rq(node);
+		if (blk_rq_pos(rq) >= pos) {
+			res = rq;
+			node = node->rb_left;
+		} else {
+			node = node->rb_right;
+		}
+	}
+	return res;
+}
+
 /*
  * Return the first request for which blk_rq_pos() >= @pos. For zoned devices,
  * return the first request after the start of the zone containing @pos.
@@ -155,7 +177,7 @@ static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio,
 				enum dd_data_dir data_dir, sector_t pos)
 {
 	struct rb_node *node = per_prio->sort_list[data_dir].rb_node;
-	struct request *rq, *res = NULL;
+	struct request *rq;
 
 	if (!node)
 		return NULL;
@@ -169,16 +191,7 @@ static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio,
 	if (blk_rq_is_seq_zoned_write(rq))
 		pos = round_down(pos, rq->q->limits.chunk_sectors);
 
-	while (node) {
-		rq = rb_entry_rq(node);
-		if (blk_rq_pos(rq) >= pos) {
-			res = rq;
-			node = node->rb_left;
-		} else {
-			node = node->rb_right;
-		}
-	}
-	return res;
+	return deadline_first_rq_past_pos(per_prio, data_dir, pos);
 }
 
 /*

^ permalink raw reply related	[flat|nested] 12+ messages in thread
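
As an aside, the tree walk performed by deadline_first_rq_past_pos() (remember
the current node as a candidate whenever its position is >= @pos, then keep
descending left in search of a smaller qualifying key) can be sketched outside
the kernel with a plain binary search tree. The struct and names below are
illustrative stand-ins, not kernel API:

```c
#include <stddef.h>

/* Toy BST node keyed by start sector; a stand-in for the kernel's
 * rb-tree of requests sorted by blk_rq_pos(). */
struct toy_rq {
	unsigned long long pos;			/* blk_rq_pos() analogue */
	struct toy_rq *left, *right;
};

/* Same walk as deadline_first_rq_past_pos(): whenever a node's key is
 * >= pos, remember it as the best candidate so far and descend left in
 * case a smaller qualifying key exists; otherwise descend right. */
struct toy_rq *toy_first_rq_past_pos(struct toy_rq *node,
				     unsigned long long pos)
{
	struct toy_rq *res = NULL;

	while (node) {
		if (node->pos >= pos) {
			res = node;		/* candidate */
			node = node->left;	/* try to find a smaller one */
		} else {
			node = node->right;	/* whole subtree is below pos */
		}
	}
	return res;
}
```

For a tree holding positions {10, 20, 30}, a search for pos 15 lands on the
node at 20, and a search past the largest key returns NULL, matching the
`res = NULL` fall-through in the patch.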

* [PATCH v2 4/4] block/mq-deadline: Prevent zoned write reordering due to I/O prioritization
  2023-12-18 21:13 [PATCH v2 0/4] Improve I/O priority support in mq-deadline for zoned writes Bart Van Assche
                   ` (2 preceding siblings ...)
  2023-12-18 21:13 ` [PATCH v2 3/4] block/mq-deadline: Introduce deadline_first_rq_past_pos() Bart Van Assche
@ 2023-12-18 21:13 ` Bart Van Assche
  2023-12-19 12:10   ` Christoph Hellwig
  3 siblings, 1 reply; 12+ messages in thread
From: Bart Van Assche @ 2023-12-18 21:13 UTC (permalink / raw)
  To: Jens Axboe
  Cc: linux-block, Damien Le Moal, Christoph Hellwig, Bart Van Assche

Assigning I/O priorities with the ioprio cgroup policy may cause
different I/O priorities to be assigned to write requests for the same
zone. Prevent this from causing unaligned write errors by inserting
zoned writes for the same zone into the same priority queue as prior
zoned writes for that zone.

Cc: Damien Le Moal <dlemoal@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
 block/Kconfig.iosched    |   5 ++
 block/mq-deadline.c      |  81 +++++++++++++++---
 block/mq-deadline_test.c | 175 +++++++++++++++++++++++++++++++++++++++
 include/linux/blk-mq.h   |  17 ++++
 4 files changed, 268 insertions(+), 10 deletions(-)
 create mode 100644 block/mq-deadline_test.c

diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 27f11320b8d1..3e2fcbdbac65 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -7,6 +7,11 @@ config MQ_IOSCHED_DEADLINE
 	help
 	  MQ version of the deadline IO scheduler.
 
+config MQ_IOSCHED_DEADLINE_TEST
+	tristate "MQ deadline unit tests" if !KUNIT_ALL_TESTS
+	depends on MQ_IOSCHED_DEADLINE && KUNIT
+	default KUNIT_ALL_TESTS
+
 config MQ_IOSCHED_KYBER
 	tristate "Kyber I/O scheduler"
 	default y
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index c0f92cc729ca..d1d54cac4c37 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -46,6 +46,8 @@ enum dd_data_dir {
 enum { DD_DIR_COUNT = 2 };
 
 enum dd_prio {
+	DD_INVALID_PRIO	= -1,
+	DD_PRIO_MIN	= 0,
 	DD_RT_PRIO	= 0,
 	DD_BE_PRIO	= 1,
 	DD_IDLE_PRIO	= 2,
@@ -113,6 +115,12 @@ static const enum dd_prio ioprio_class_to_prio[] = {
 	[IOPRIO_CLASS_IDLE]	= DD_IDLE_PRIO,
 };
 
+static const u8 prio_to_ioprio_class[] = {
+	[DD_RT_PRIO]	= IOPRIO_CLASS_RT,
+	[DD_BE_PRIO]	= IOPRIO_CLASS_BE,
+	[DD_IDLE_PRIO]	= IOPRIO_CLASS_IDLE,
+};
+
 static inline struct rb_root *
 deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq)
 {
@@ -194,18 +202,67 @@ static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio,
 	return deadline_first_rq_past_pos(per_prio, data_dir, pos);
 }
 
+/*
+ * If any sequential write requests are pending for the zone containing @pos,
+ * return the I/O priority for these write requests.
+ */
+static enum dd_prio dd_zone_prio(struct deadline_data *dd,
+				 struct block_device *bdev, sector_t pos)
+{
+#ifdef CONFIG_BLK_DEV_ZONED
+	struct gendisk *disk = bdev->bd_disk;
+	const unsigned int zno = disk_zone_no(disk, pos);
+	enum dd_prio prio;
+
+	pos -= bdev_offset_from_zone_start(bdev, pos);
+	for (prio = DD_PRIO_MIN; prio <= DD_PRIO_MAX; prio++) {
+		struct dd_per_prio *per_prio = &dd->per_prio[prio];
+		struct request *rq;
+
+		rq = deadline_first_rq_past_pos(per_prio, DD_WRITE, pos);
+		while (rq && blk_rq_zone_no(rq) == zno) {
+			struct rb_node *node;
+
+			if (blk_rq_is_seq_zoned_write(rq))
+				return prio;
+			node = rb_next(&rq->rb_node);
+			if (!node)
+				break;
+			rq = rb_entry_rq(node);
+		}
+	}
+#endif
+	return DD_INVALID_PRIO;
+}
+
 /*
 * Return the I/O priority (enum dd_prio) that has been assigned to a
 * request.
  */
-static enum dd_prio dd_rq_ioprio(struct request *rq)
+static enum dd_prio dd_rq_ioprio(struct deadline_data *dd, struct request *rq)
 {
-	return ioprio_class_to_prio[IOPRIO_PRIO_CLASS(req_get_ioprio(rq))];
+	enum dd_prio prio;
+
+	if (!blk_rq_is_seq_zoned_write(rq) || !rq->bio)
+		return ioprio_class_to_prio[IOPRIO_PRIO_CLASS(
+			req_get_ioprio(rq))];
+	prio = dd_zone_prio(dd, rq->q->disk->part0, blk_rq_pos(rq));
+	if (prio == DD_INVALID_PRIO)
+		return ioprio_class_to_prio[IOPRIO_PRIO_CLASS(
+			req_get_ioprio(rq))];
+	return prio;
 }
 
-static enum dd_prio dd_bio_ioprio(struct bio *bio)
+static enum dd_prio dd_bio_ioprio(struct deadline_data *dd, struct bio *bio)
 {
-	return ioprio_class_to_prio[IOPRIO_PRIO_CLASS(bio->bi_ioprio)];
+	enum dd_prio prio;
+
+	if (!blk_bio_is_seq_zoned_write(bio))
+		return ioprio_class_to_prio[IOPRIO_PRIO_CLASS(bio->bi_ioprio)];
+	prio = dd_zone_prio(dd, bio->bi_bdev, bio->bi_iter.bi_sector);
+	if (prio == DD_INVALID_PRIO)
+		return ioprio_class_to_prio[IOPRIO_PRIO_CLASS(bio->bi_ioprio)];
+	return prio;
 }
 
 static void
@@ -246,7 +303,7 @@ static void dd_request_merged(struct request_queue *q, struct request *req,
 			      enum elv_merge type)
 {
 	struct deadline_data *dd = q->elevator->elevator_data;
-	const enum dd_prio prio = dd_rq_ioprio(req);
+	const enum dd_prio prio = dd_rq_ioprio(dd, req);
 	struct dd_per_prio *per_prio = &dd->per_prio[prio];
 
 	/*
@@ -265,7 +322,7 @@ static void dd_merged_requests(struct request_queue *q, struct request *req,
 			       struct request *next)
 {
 	struct deadline_data *dd = q->elevator->elevator_data;
-	const enum dd_prio prio = dd_rq_ioprio(next);
+	const enum dd_prio prio = dd_rq_ioprio(dd, next);
 
 	lockdep_assert_held(&dd->lock);
 
@@ -560,7 +617,7 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
 	dd->batching++;
 	deadline_move_request(dd, per_prio, rq);
 done:
-	prio = dd_rq_ioprio(rq);
+	prio = dd_rq_ioprio(dd, rq);
 	dd->per_prio[prio].latest_pos[data_dir] = blk_rq_pos(rq);
 	dd->per_prio[prio].stats.dispatched++;
 	/*
@@ -758,7 +815,7 @@ static int dd_request_merge(struct request_queue *q, struct request **rq,
 			    struct bio *bio)
 {
 	struct deadline_data *dd = q->elevator->elevator_data;
-	const enum dd_prio prio = dd_bio_ioprio(bio);
+	const enum dd_prio prio = dd_bio_ioprio(dd, bio);
 	struct dd_per_prio *per_prio = &dd->per_prio[prio];
 	sector_t sector = bio_end_sector(bio);
 	struct request *__rq;
@@ -822,7 +879,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 	 */
 	blk_req_zone_write_unlock(rq);
 
-	prio = dd_rq_ioprio(rq);
+	prio = dd_rq_ioprio(dd, rq);
 	per_prio = &dd->per_prio[prio];
 	if (!rq->elv.priv[0]) {
 		per_prio->stats.inserted++;
@@ -931,7 +988,7 @@ static void dd_finish_request(struct request *rq)
 {
 	struct request_queue *q = rq->q;
 	struct deadline_data *dd = q->elevator->elevator_data;
-	const enum dd_prio prio = dd_rq_ioprio(rq);
+	const enum dd_prio prio = dd_rq_ioprio(dd, rq);
 	struct dd_per_prio *per_prio = &dd->per_prio[prio];
 
 	/*
@@ -1282,6 +1339,10 @@ static struct elevator_type mq_deadline = {
 };
 MODULE_ALIAS("mq-deadline-iosched");
 
+#ifdef CONFIG_MQ_IOSCHED_DEADLINE_TEST
+#include "mq-deadline_test.c"
+#endif
+
 static int __init deadline_init(void)
 {
 	return elv_register(&mq_deadline);
diff --git a/block/mq-deadline_test.c b/block/mq-deadline_test.c
new file mode 100644
index 000000000000..72bec6fd5f7a
--- /dev/null
+++ b/block/mq-deadline_test.c
@@ -0,0 +1,175 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Google LLC
+ */
+#include <kunit/test.h>
+#include <linux/cleanup.h>
+
+static void test_ioprio(struct kunit *test)
+{
+	static struct block_device bdev;
+	static struct gendisk disk = { .part0 = &bdev };
+	static struct request_queue queue = { .disk = &disk };
+	static struct blk_mq_hw_ctx hctx = { .queue = &queue };
+	static struct bio bio1 = { .bi_bdev = &bdev,
+				   .bi_opf = REQ_OP_WRITE,
+				   .bi_ioprio = IOPRIO_CLASS_IDLE
+						<< IOPRIO_CLASS_SHIFT };
+	static struct request rq1 = { .q = &queue,
+				      .cmd_flags = REQ_OP_WRITE,
+				      .__sector = 1,
+				      .__data_len = 1,
+				      .bio = &bio1,
+				      .mq_hctx = &hctx,
+				      .ioprio = IOPRIO_CLASS_IDLE
+						<< IOPRIO_CLASS_SHIFT };
+	static struct bio bio2 = { .bi_bdev = &bdev,
+				   .bi_opf = REQ_OP_WRITE,
+				   .bi_ioprio = IOPRIO_CLASS_BE
+						<< IOPRIO_CLASS_SHIFT };
+	static struct request rq2 = { .q = &queue,
+				      .cmd_flags = REQ_OP_WRITE,
+				      .__sector = 3,
+				      .__data_len = 1,
+				      .bio = &bio2,
+				      .mq_hctx = &hctx,
+				      .ioprio = IOPRIO_CLASS_BE
+						<< IOPRIO_CLASS_SHIFT };
+	static struct bio bio3 = { .bi_bdev = &bdev,
+				   .bi_opf = REQ_OP_WRITE,
+				   .bi_ioprio = IOPRIO_CLASS_RT
+						<< IOPRIO_CLASS_SHIFT };
+	static struct request rq3 = { .q = &queue,
+				      .cmd_flags = REQ_OP_WRITE,
+				      .__sector = 5,
+				      .__data_len = 1,
+				      .bio = &bio3,
+				      .mq_hctx = &hctx,
+				      .ioprio = IOPRIO_CLASS_RT
+						<< IOPRIO_CLASS_SHIFT };
+	struct request *rq;
+	static LIST_HEAD(rq_list);
+
+	bdev.bd_disk = &disk;
+	bdev.bd_queue = &queue;
+	disk.queue = &queue;
+
+	dd_init_sched(&queue, &mq_deadline);
+	dd_prepare_request(&rq1);
+	dd_prepare_request(&rq2);
+	dd_prepare_request(&rq3);
+	list_add_tail(&rq1.queuelist, &rq_list);
+	list_add_tail(&rq2.queuelist, &rq_list);
+	list_add_tail(&rq3.queuelist, &rq_list);
+	dd_insert_requests(&hctx, &rq_list, false);
+	rq = dd_dispatch_request(&hctx);
+	KUNIT_EXPECT_PTR_EQ(test, rq, &rq3);
+	dd_finish_request(rq);
+	rq = dd_dispatch_request(&hctx);
+	KUNIT_EXPECT_PTR_EQ(test, rq, &rq2);
+	dd_finish_request(rq);
+	rq = dd_dispatch_request(&hctx);
+	KUNIT_EXPECT_PTR_EQ(test, rq, &rq1);
+	dd_finish_request(rq);
+	dd_exit_sched(queue.elevator);
+}
+
+/*
+ * Test that the write order is preserved if a higher I/O priority is assigned
+ * to higher LBAs. This test fails if dd_zone_prio() always returns
+ * DD_INVALID_PRIO.
+ */
+static void test_zone_prio(struct kunit *test)
+{
+	static struct block_device bdev;
+	static unsigned long seq_zones_wlock[1];
+	static struct gendisk disk = { .conv_zones_bitmap = NULL,
+				       .seq_zones_wlock = seq_zones_wlock,
+				       .part0 = &bdev };
+	static struct request_queue queue = {
+		.disk = &disk,
+		.limits = { .zoned = BLK_ZONED_HM, .chunk_sectors = 16 }
+	};
+	static struct blk_mq_hw_ctx hctx = { .queue = &queue };
+	static struct bio bio1 = { .bi_bdev = &bdev,
+				   .bi_opf = REQ_OP_WRITE,
+				   .bi_ioprio = IOPRIO_CLASS_IDLE
+						<< IOPRIO_CLASS_SHIFT };
+	static struct request rq1 = { .q = &queue,
+				      .cmd_flags = REQ_OP_WRITE,
+				      .__sector = 1,
+				      .__data_len = 1,
+				      .bio = &bio1,
+				      .mq_hctx = &hctx,
+				      .ioprio = IOPRIO_CLASS_IDLE
+						<< IOPRIO_CLASS_SHIFT };
+	static struct bio bio2 = { .bi_bdev = &bdev,
+				   .bi_opf = REQ_OP_WRITE,
+				   .bi_ioprio = IOPRIO_CLASS_BE
+						<< IOPRIO_CLASS_SHIFT };
+	static struct request rq2 = { .q = &queue,
+				      .cmd_flags = REQ_OP_WRITE,
+				      .__sector = 3,
+				      .__data_len = 1,
+				      .bio = &bio2,
+				      .mq_hctx = &hctx,
+				      .ioprio = IOPRIO_CLASS_BE
+						<< IOPRIO_CLASS_SHIFT };
+	static struct bio bio3 = { .bi_bdev = &bdev,
+				   .bi_opf = REQ_OP_WRITE,
+				   .bi_ioprio = IOPRIO_CLASS_RT
+						<< IOPRIO_CLASS_SHIFT };
+	static struct request rq3 = { .q = &queue,
+				      .cmd_flags = REQ_OP_WRITE,
+				      .__sector = 5,
+				      .__data_len = 1,
+				      .bio = &bio3,
+				      .mq_hctx = &hctx,
+				      .ioprio = IOPRIO_CLASS_RT
+						<< IOPRIO_CLASS_SHIFT };
+	struct request *rq;
+	static LIST_HEAD(rq_list);
+
+	bdev.bd_disk = &disk;
+	bdev.bd_queue = &queue;
+	disk.queue = &queue;
+
+	KUNIT_EXPECT_TRUE(test, blk_rq_is_seq_zoned_write(&rq1));
+	KUNIT_EXPECT_TRUE(test, blk_rq_is_seq_zoned_write(&rq2));
+	KUNIT_EXPECT_TRUE(test, blk_rq_is_seq_zoned_write(&rq3));
+
+	dd_init_sched(&queue, &mq_deadline);
+	dd_prepare_request(&rq1);
+	dd_prepare_request(&rq2);
+	dd_prepare_request(&rq3);
+	list_add_tail(&rq1.queuelist, &rq_list);
+	list_add_tail(&rq2.queuelist, &rq_list);
+	list_add_tail(&rq3.queuelist, &rq_list);
+	dd_insert_requests(&hctx, &rq_list, false);
+	rq = dd_dispatch_request(&hctx);
+	KUNIT_EXPECT_PTR_EQ(test, rq, &rq1);
+	dd_finish_request(rq);
+	rq = dd_dispatch_request(&hctx);
+	KUNIT_EXPECT_PTR_EQ(test, rq, &rq2);
+	dd_finish_request(rq);
+	rq = dd_dispatch_request(&hctx);
+	KUNIT_EXPECT_PTR_EQ(test, rq, &rq3);
+	dd_finish_request(rq);
+	dd_exit_sched(queue.elevator);
+}
+
+static struct kunit_case mq_deadline_test_cases[] = {
+	KUNIT_CASE(test_ioprio),
+	KUNIT_CASE(test_zone_prio),
+	{}
+};
+
+static struct kunit_suite mq_deadline_test_suite = {
+	.name = "mq-deadline",
+	.test_cases = mq_deadline_test_cases,
+};
+kunit_test_suite(mq_deadline_test_suite);
+
+MODULE_DESCRIPTION("mq-deadline unit tests");
+MODULE_AUTHOR("Bart Van Assche");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 1ab3081c82ed..e7fa81170b7c 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -1149,6 +1149,18 @@ static inline unsigned int blk_rq_zone_no(struct request *rq)
 	return disk_zone_no(rq->q->disk, blk_rq_pos(rq));
 }
 
+/**
+ * blk_bio_is_seq_zoned_write() - Check if @bio requires write serialization.
+ * @bio: Bio to examine.
+ *
+ * Note: REQ_OP_ZONE_APPEND bios do not require serialization.
+ */
+static inline bool blk_bio_is_seq_zoned_write(struct bio *bio)
+{
+	return disk_zone_is_seq(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector) &&
+		op_needs_zoned_write_locking(bio_op(bio));
+}
+
 static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
 {
 	return disk_zone_is_seq(rq->q->disk, blk_rq_pos(rq));
@@ -1196,6 +1208,11 @@ static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
 	return !blk_req_zone_is_write_locked(rq);
 }
 #else /* CONFIG_BLK_DEV_ZONED */
+static inline bool blk_bio_is_seq_zoned_write(struct bio *bio)
+{
+	return false;
+}
+
 static inline bool blk_rq_is_seq_zoned_write(struct request *rq)
 {
 	return false;

^ permalink raw reply related	[flat|nested] 12+ messages in thread
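
The zone-priority lookup added by this patch can be modeled outside the
kernel: for the zone containing @pos, scan the per-priority queues from
highest priority (DD_RT_PRIO == 0) to lowest and report the first priority
that has a pending sequential write in that zone. The sketch below uses
sorted arrays as a toy stand-in for the per-priority rb-trees; all names are
illustrative, not kernel API:

```c
#include <stddef.h>

#define TOY_NPRIO	3		/* RT, BE, IDLE */
#define TOY_INVALID_PRIO (-1)		/* DD_INVALID_PRIO analogue */

/* Toy scheduler state: per-priority arrays of pending sequential-write
 * start sectors, sorted ascending, plus the zone size in sectors
 * (mirroring q->limits.chunk_sectors). */
struct toy_sched {
	const unsigned long long *writes[TOY_NPRIO];
	int nwrites[TOY_NPRIO];
	unsigned long long zone_sectors;
};

/* Like dd_zone_prio(): return the highest priority (lowest index) with
 * a pending write in the zone containing @pos, or TOY_INVALID_PRIO. */
int toy_zone_prio(const struct toy_sched *s, unsigned long long pos)
{
	unsigned long long zstart = pos - pos % s->zone_sectors;
	unsigned long long zend = zstart + s->zone_sectors;

	for (int prio = 0; prio < TOY_NPRIO; prio++) {
		for (int i = 0; i < s->nwrites[prio]; i++) {
			unsigned long long w = s->writes[prio][i];

			if (w >= zend)
				break;	/* sorted: rest are past this zone */
			if (w >= zstart)
				return prio;	/* write pending in zone */
		}
	}
	return TOY_INVALID_PRIO;
}
```

In the patch, dd_bio_ioprio() falls back to the bio's own priority class only
when this lookup reports that no sequential writes are pending for the zone.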

* Re: [PATCH v2 4/4] block/mq-deadline: Prevent zoned write reordering due to I/O prioritization
  2023-12-18 21:13 ` [PATCH v2 4/4] block/mq-deadline: Prevent zoned write reordering due to I/O prioritization Bart Van Assche
@ 2023-12-19 12:10   ` Christoph Hellwig
  2023-12-19 17:42     ` Bart Van Assche
  0 siblings, 1 reply; 12+ messages in thread
From: Christoph Hellwig @ 2023-12-19 12:10 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: Jens Axboe, linux-block, Damien Le Moal, Christoph Hellwig

On Mon, Dec 18, 2023 at 01:13:42PM -0800, Bart Van Assche wrote:
> Assigning I/O priorities with the ioprio cgroup policy may cause
> different I/O priorities to be assigned to write requests for the same
> zone. Prevent that this causes unaligned write errors by adding zoned
> writes for the same zone in the same priority queue as prior zoned
> writes.

I still think this is fundamentally the wrong thing to do.  If you set
different priorities, you want I/O to be reordered, so ignoring that
is a bad thing.


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 4/4] block/mq-deadline: Prevent zoned write reordering due to I/O prioritization
  2023-12-19 12:10   ` Christoph Hellwig
@ 2023-12-19 17:42     ` Bart Van Assche
  2023-12-20  0:05       ` Damien Le Moal
  0 siblings, 1 reply; 12+ messages in thread
From: Bart Van Assche @ 2023-12-19 17:42 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Jens Axboe, linux-block, Damien Le Moal

On 12/19/23 04:10, Christoph Hellwig wrote:
> On Mon, Dec 18, 2023 at 01:13:42PM -0800, Bart Van Assche wrote:
>> Assigning I/O priorities with the ioprio cgroup policy may cause
>> different I/O priorities to be assigned to write requests for the same
>> zone. Prevent that this causes unaligned write errors by adding zoned
>> writes for the same zone in the same priority queue as prior zoned
>> writes.
> 
> I still think this is fundamentally the wrong thing to do.  If you set
> different priorities, you want I/O to be reordered, so ignoring that
> is a bad thing.

Hi Christoph,

How about not setting the I/O priority of sequential zoned writes as in
the (untested) patch below?

Thanks,

Bart.


[PATCH] block: Do not set the I/O priority for sequential zoned writes

---
  block/blk-mq.c         |  7 +++++++
  include/linux/blk-mq.h | 17 +++++++++++++++++
  2 files changed, 24 insertions(+)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index c11c97afa0bc..668888103a47 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2922,6 +2922,13 @@ static bool blk_mq_can_use_cached_rq(struct request *rq, struct blk_plug *plug,

  static void bio_set_ioprio(struct bio *bio)
  {
+	/*
+	 * Do not set the I/O priority of sequential zoned write bios because
+	 * this could lead to reordering and hence to unaligned write errors.
+	 */
+	if (blk_bio_is_seq_zoned_write(bio))
+		return;
+
  	/* Nobody set ioprio so far? Initialize it based on task's nice value */
  	if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE)
  		bio->bi_ioprio = get_current_ioprio();
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 1ab3081c82ed..e7fa81170b7c 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -1149,6 +1149,18 @@ static inline unsigned int blk_rq_zone_no(struct request *rq)
  	return disk_zone_no(rq->q->disk, blk_rq_pos(rq));
  }

+/**
+ * blk_bio_is_seq_zoned_write() - Check if @bio requires write serialization.
+ * @bio: Bio to examine.
+ *
+ * Note: REQ_OP_ZONE_APPEND bios do not require serialization.
+ */
+static inline bool blk_bio_is_seq_zoned_write(struct bio *bio)
+{
+	return disk_zone_is_seq(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector) &&
+		op_needs_zoned_write_locking(bio_op(bio));
+}
+
  static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
  {
  	return disk_zone_is_seq(rq->q->disk, blk_rq_pos(rq));
@@ -1196,6 +1208,11 @@ static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
  	return !blk_req_zone_is_write_locked(rq);
  }
  #else /* CONFIG_BLK_DEV_ZONED */
+static inline bool blk_bio_is_seq_zoned_write(struct bio *bio)
+{
+	return false;
+}
+
  static inline bool blk_rq_is_seq_zoned_write(struct request *rq)
  {
  	return false;


^ permalink raw reply related	[flat|nested] 12+ messages in thread
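
For readers following the IOPRIO_CLASS_NONE check in bio_set_ioprio() and the
`IOPRIO_CLASS_* << IOPRIO_CLASS_SHIFT` initializers in the KUnit tests: an
ioprio value packs the priority class in the top bits and a per-class level in
the low bits. The constants below mirror include/uapi/linux/ioprio.h as of
this thread; treat them as illustrative and check the header for the
authoritative definitions:

```c
/* Illustrative copies of the uapi ioprio encoding helpers. */
#define IOPRIO_CLASS_SHIFT	13
#define IOPRIO_PRIO_MASK	((1 << IOPRIO_CLASS_SHIFT) - 1)

#define IOPRIO_PRIO_CLASS(ioprio)	((ioprio) >> IOPRIO_CLASS_SHIFT)
#define IOPRIO_PRIO_DATA(ioprio)	((ioprio) & IOPRIO_PRIO_MASK)
#define IOPRIO_PRIO_VALUE(class, data)	\
	(((class) << IOPRIO_CLASS_SHIFT) | (data))

enum {
	IOPRIO_CLASS_NONE,	/* unset; bio_set_ioprio() fills it in */
	IOPRIO_CLASS_RT,	/* realtime: served first */
	IOPRIO_CLASS_BE,	/* best effort: the default */
	IOPRIO_CLASS_IDLE,	/* served only when the disk is idle */
};
```

For example, `IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4)` builds a best-effort
level-4 priority and `IOPRIO_PRIO_CLASS()` recovers the class, which is what
bio_set_ioprio() compares against IOPRIO_CLASS_NONE before assigning the
task's priority.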

* Re: [PATCH v2 4/4] block/mq-deadline: Prevent zoned write reordering due to I/O prioritization
  2023-12-19 17:42     ` Bart Van Assche
@ 2023-12-20  0:05       ` Damien Le Moal
  2023-12-20  0:48         ` Bart Van Assche
  0 siblings, 1 reply; 12+ messages in thread
From: Damien Le Moal @ 2023-12-20  0:05 UTC (permalink / raw)
  To: Bart Van Assche, Christoph Hellwig; +Cc: Jens Axboe, linux-block

On 12/20/23 02:42, Bart Van Assche wrote:
> On 12/19/23 04:10, Christoph Hellwig wrote:
>> On Mon, Dec 18, 2023 at 01:13:42PM -0800, Bart Van Assche wrote:
>>> Assigning I/O priorities with the ioprio cgroup policy may cause
>>> different I/O priorities to be assigned to write requests for the same
>>> zone. Prevent that this causes unaligned write errors by adding zoned
>>> writes for the same zone in the same priority queue as prior zoned
>>> writes.
>>
>> I still think this is fundamentally the wrong thing to do.  If you set
>> different priorities, you want I/O to be reordered, so ignoring that
>> is a bad thing.
> 
> Hi Christoph,
> 
> How about not setting the I/O priority of sequential zoned writes as in
> the (untested) patch below?
> 
> Thanks,
> 
> Bart.
> 
> 
> [PATCH] block: Do not set the I/O priority for sequential zoned writes
> 
> ---
>   block/blk-mq.c         |  7 +++++++
>   include/linux/blk-mq.h | 17 +++++++++++++++++
>   2 files changed, 24 insertions(+)
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index c11c97afa0bc..668888103a47 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -2922,6 +2922,13 @@ static bool blk_mq_can_use_cached_rq(struct request *rq, struct blk_plug *plug,
> 
>   static void bio_set_ioprio(struct bio *bio)
>   {
> +	/*
> +	 * Do not set the I/O priority of sequential zoned write bios because
> +	 * this could lead to reordering and hence to unaligned write errors.
> +	 */
> +	if (blk_bio_is_seq_zoned_write(bio))
> +		return;

That is not acceptable as it would ignore priorities passed for async direct
I/Os through aio->aio_reqprio. That is a perfectly acceptable use case and we
should not ignore it.

> +
>   	/* Nobody set ioprio so far? Initialize it based on task's nice value */
>   	if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE)
>   		bio->bi_ioprio = get_current_ioprio();
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index 1ab3081c82ed..e7fa81170b7c 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -1149,6 +1149,18 @@ static inline unsigned int blk_rq_zone_no(struct request *rq)
>   	return disk_zone_no(rq->q->disk, blk_rq_pos(rq));
>   }
> 
> +/**
> + * blk_bio_is_seq_zoned_write() - Check if @bio requires write serialization.
> + * @bio: Bio to examine.
> + *
> + * Note: REQ_OP_ZONE_APPEND bios do not require serialization.
> + */
> +static inline bool blk_bio_is_seq_zoned_write(struct bio *bio)
> +{
> +	return disk_zone_is_seq(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector) &&
> +		op_needs_zoned_write_locking(bio_op(bio));
> +}
> +
>   static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
>   {
>   	return disk_zone_is_seq(rq->q->disk, blk_rq_pos(rq));
> @@ -1196,6 +1208,11 @@ static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
>   	return !blk_req_zone_is_write_locked(rq);
>   }
>   #else /* CONFIG_BLK_DEV_ZONED */
> +static inline bool blk_bio_is_seq_zoned_write(struct bio *bio)
> +{
> +	return false;
> +}
> +
>   static inline bool blk_rq_is_seq_zoned_write(struct request *rq)
>   {
>   	return false;
> 

-- 
Damien Le Moal
Western Digital Research


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 4/4] block/mq-deadline: Prevent zoned write reordering due to I/O prioritization
  2023-12-20  0:05       ` Damien Le Moal
@ 2023-12-20  0:48         ` Bart Van Assche
  2023-12-20  1:28           ` Damien Le Moal
  0 siblings, 1 reply; 12+ messages in thread
From: Bart Van Assche @ 2023-12-20  0:48 UTC (permalink / raw)
  To: Damien Le Moal, Christoph Hellwig; +Cc: Jens Axboe, linux-block

On 12/19/23 16:05, Damien Le Moal wrote:
> On 12/20/23 02:42, Bart Van Assche wrote:
>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>> index c11c97afa0bc..668888103a47 100644
>> --- a/block/blk-mq.c
>> +++ b/block/blk-mq.c
>> @@ -2922,6 +2922,13 @@ static bool blk_mq_can_use_cached_rq(struct request *rq, struct blk_plug *plug,
>>
>>    static void bio_set_ioprio(struct bio *bio)
>>    {
>> +	/*
>> +	 * Do not set the I/O priority of sequential zoned write bios because
>> +	 * this could lead to reordering and hence to unaligned write errors.
>> +	 */
>> +	if (blk_bio_is_seq_zoned_write(bio))
>> +		return;
> 
> That is not acceptable as that will ignore priorities passed for async direct
> IOs through aio->aio_reqprio. That one is a perfectly acceptable use case and we
> should not ignore it.

Hi Damien,

What you wrote is wrong. bio_set_ioprio() applies the I/O priority set
by ionice or by the blk-ioprio cgroup policy. The above patch does not
affect the priorities set via aio_reqprio. aio_reqprio is still copied
into ki_ioprio, and ki_ioprio is still copied into bi_ioprio by the direct
I/O code.

Bart.


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 4/4] block/mq-deadline: Prevent zoned write reordering due to I/O prioritization
  2023-12-20  0:48         ` Bart Van Assche
@ 2023-12-20  1:28           ` Damien Le Moal
  2023-12-20  3:53             ` Christoph Hellwig
  0 siblings, 1 reply; 12+ messages in thread
From: Damien Le Moal @ 2023-12-20  1:28 UTC (permalink / raw)
  To: Bart Van Assche, Christoph Hellwig; +Cc: Jens Axboe, linux-block

On 12/20/23 09:48, Bart Van Assche wrote:
> On 12/19/23 16:05, Damien Le Moal wrote:
>> On 12/20/23 02:42, Bart Van Assche wrote:
>>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>>> index c11c97afa0bc..668888103a47 100644
>>> --- a/block/blk-mq.c
>>> +++ b/block/blk-mq.c
>>> @@ -2922,6 +2922,13 @@ static bool blk_mq_can_use_cached_rq(struct request *rq, struct blk_plug *plug,
>>>
>>>    static void bio_set_ioprio(struct bio *bio)
>>>    {
>>> +	/*
>>> +	 * Do not set the I/O priority of sequential zoned write bios because
>>> +	 * this could lead to reordering and hence to unaligned write errors.
>>> +	 */
>>> +	if (blk_bio_is_seq_zoned_write(bio))
>>> +		return;
>>
>> That is not acceptable as that will ignore priorities passed for async direct
>> IOs through aio->aio_reqprio. That one is a perfectly acceptable use case and we
>> should not ignore it.
> 
> Hi Damien,
> 
> What you wrote is wrong. bio_set_ioprio() applies the I/O priority set
> by ionice or by the blk-ioprio cgroup policy. The above patch does not
> affect the priorities set via aio_reqprio. aio_reqprio is still copied
> in ki_ioprio and ki_ioprio is still copied into bi_ioprio by the direct
> I/O code.

OK. But your patch will still end up with IO priorities being ignored for
legitimate use cases that do not lead to mixed priorities, e.g. applications
using the raw device directly and doing writes to a zone without mixing
priorities, either with AIO, ionice or cgroups.

The issue is when a user mixes different IO priorities for writes to the same
zone, and as I said before, since doing that is nonsensical, getting the IOs to
fail is fine by me. The user will then be aware that this should not be done.

f2fs has a problem with that though, as that leads to write errors and the FS
going read-only (I guess). btrfs will not have this issue because it uses zone
append. Need to check dm-zoned as there may be an issue there.

So what about what I proposed in an earlier email: introduce a bio flag "ignore
ioprio" that causes bio_set_ioprio() to not set any IO priority, and have f2fs
set that flag for any zone write BIO it issues? That will solve your f2fs issue
without messing up good use cases.

-- 
Damien Le Moal
Western Digital Research


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 4/4] block/mq-deadline: Prevent zoned write reordering due to I/O prioritization
  2023-12-20  1:28           ` Damien Le Moal
@ 2023-12-20  3:53             ` Christoph Hellwig
  2023-12-20  4:40               ` Damien Le Moal
  0 siblings, 1 reply; 12+ messages in thread
From: Christoph Hellwig @ 2023-12-20  3:53 UTC (permalink / raw)
  To: Damien Le Moal
  Cc: Bart Van Assche, Christoph Hellwig, Jens Axboe, linux-block

On Wed, Dec 20, 2023 at 10:28:37AM +0900, Damien Le Moal wrote:
> zone, and as I said before, since doing that is nonsensical, getting the IOs to
> fail is fine by me. The user will then be aware that this should not be done.
> 
> f2fs has a problem with that though as that leads to write errors and FS going
> read-only (I guess). btrfs will not have this issue because it uses zone append.
> Need to check dm-zoned as their may be an issue there.
> 
> So what about what I proposed in an earlier email: introduce a bio flag "ignore
> ioprio" that causes bio_set_ioprio() to not set any IO priority and have f2fs
> set that flag for any zone write BIO it issues ? That will solve your f2fs issue
> without messing up good use cases.

How can this even be a problem for f2fs upstream, where f2fs must only
have a single write per zone outstanding?  I really don't want crap
in the block layer to work around a known broken model (multiple
outstanding WRITE commands per zone) that, because it is so obviously
broken, isn't even merged upstream.



^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 4/4] block/mq-deadline: Prevent zoned write reordering due to I/O prioritization
  2023-12-20  3:53             ` Christoph Hellwig
@ 2023-12-20  4:40               ` Damien Le Moal
  0 siblings, 0 replies; 12+ messages in thread
From: Damien Le Moal @ 2023-12-20  4:40 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Bart Van Assche, Jens Axboe, linux-block

On 12/20/23 12:53, Christoph Hellwig wrote:
> On Wed, Dec 20, 2023 at 10:28:37AM +0900, Damien Le Moal wrote:
>> zone, and as I said before, since doing that is nonsensical, getting the IOs to
>> fail is fine by me. The user will then be aware that this should not be done.
>>
>> f2fs has a problem with that though as that leads to write errors and FS going
>> read-only (I guess). btrfs will not have this issue because it uses zone append.
>> Need to check dm-zoned as there may be an issue there.
>>
>> So what about what I proposed in an earlier email: introduce a bio flag "ignore
>> ioprio" that causes bio_set_ioprio() to not set any IO priority and have f2fs
>> set that flag for any zone write BIO it issues? That will solve your f2fs issue
>> without messing up good use cases.
> 
> How can this even be a problem for f2fs upstream, where f2fs must only
> have a single write per zone outstanding?  I really don't want crap
> in the block layer to work around a known broken model (multiple
> outstanding WRITE commands per zone) that, because it is so obviously
> broken, isn't even merged upstream.

The only constraint at the BIO level for writing to a zone is "issue the write
BIOs sequentially". So multiple write BIOs *can* be issued to a zone. The "one
write per zone in flight at any time" rule, implemented with zone write locking,
applies at the request level in the block IO scheduler, so underneath the file
system. So the issue can indeed happen.

But what you said could actually provide a solution: have the FS issue regular
writes one at a time if the writes have priorities.

-- 
Damien Le Moal
Western Digital Research


^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2023-12-20  4:40 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-12-18 21:13 [PATCH v2 0/4] Improve I/O priority support in mq-deadline for zoned writes Bart Van Assche
2023-12-18 21:13 ` [PATCH v2 1/4] block/mq-deadline: Rename dd_rq_ioclass() and change its return type Bart Van Assche
2023-12-18 21:13 ` [PATCH v2 2/4] block/mq-deadline: Introduce dd_bio_ioclass() Bart Van Assche
2023-12-18 21:13 ` [PATCH v2 3/4] block/mq-deadline: Introduce deadline_first_rq_past_pos() Bart Van Assche
2023-12-18 21:13 ` [PATCH v2 4/4] block/mq-deadline: Prevent zoned write reordering due to I/O prioritization Bart Van Assche
2023-12-19 12:10   ` Christoph Hellwig
2023-12-19 17:42     ` Bart Van Assche
2023-12-20  0:05       ` Damien Le Moal
2023-12-20  0:48         ` Bart Van Assche
2023-12-20  1:28           ` Damien Le Moal
2023-12-20  3:53             ` Christoph Hellwig
2023-12-20  4:40               ` Damien Le Moal
