From: Damien Le Moal <dlemoal@kernel.org>
To: linux-block@vger.kernel.org, Jens Axboe <axboe@kernel.dk>,
linux-scsi@vger.kernel.org,
"Martin K . Petersen" <martin.petersen@oracle.com>,
dm-devel@lists.linux.dev, Mike Snitzer <snitzer@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Subject: [PATCH 19/26] block: mq-deadline: Remove support for zone write locking
Date: Fri, 2 Feb 2024 16:30:57 +0900
Message-ID: <20240202073104.2418230-20-dlemoal@kernel.org>
In-Reply-To: <20240202073104.2418230-1-dlemoal@kernel.org>

With the block layer generic plugging of write operations for zoned
block devices, mq-deadline, or any other scheduler, can see at most
one write operation per zone at any time. There are thus no
sequentiality requirements for these writes and therefore no need to
tightly control the dispatching of write requests using zone write
locking. Remove all the code that implements this control in the
mq-deadline scheduler and stop advertising support for the
ELEVATOR_F_ZBD_SEQ_WRITE elevator feature.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
---
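For reviewers, a minimal sketch of what the two dispatch helpers reduce
to once the zone write locking paths are removed. This is reconstructed
from the hunks below for readability and is not verbatim kernel code:

    /*
     * With zone write plugging in the block layer, any queued write is
     * safe to dispatch, so the FIFO path simply returns the oldest
     * request for the given direction.
     */
    static struct request *
    deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
                          enum dd_data_dir data_dir)
    {
            if (list_empty(&per_prio->fifo_list[data_dir]))
                    return NULL;

            return rq_entry_fifo(per_prio->fifo_list[data_dir].next);
    }

    /*
     * Likewise, the sector-ordered path resumes from the last dispatch
     * position without any zone lock checks.
     */
    static struct request *
    deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
                          enum dd_data_dir data_dir)
    {
            return deadline_from_pos(per_prio, data_dir,
                                     per_prio->latest_pos[data_dir]);
    }
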
block/mq-deadline.c | 176 ++------------------------------------------
1 file changed, 6 insertions(+), 170 deletions(-)
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 1b0de4fc3958..42d967849bec 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -90,7 +90,6 @@ struct deadline_data {
struct {
spinlock_t lock;
spinlock_t insert_lock;
- spinlock_t zone_lock;
} ____cacheline_aligned_in_smp;
unsigned long run_state;
@@ -171,8 +170,7 @@ deadline_latter_request(struct request *rq)
}
/*
- * Return the first request for which blk_rq_pos() >= @pos. For zoned devices,
- * return the first request after the start of the zone containing @pos.
+ * Return the first request for which blk_rq_pos() >= @pos.
*/
static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio,
enum dd_data_dir data_dir, sector_t pos)
@@ -184,14 +182,6 @@ static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio,
return NULL;
rq = rb_entry_rq(node);
- /*
- * A zoned write may have been requeued with a starting position that
- * is below that of the most recently dispatched request. Hence, for
- * zoned writes, start searching from the start of a zone.
- */
- if (blk_rq_is_seq_zoned_write(rq))
- pos = round_down(pos, rq->q->limits.chunk_sectors);
-
while (node) {
rq = rb_entry_rq(node);
if (blk_rq_pos(rq) >= pos) {
@@ -322,36 +312,6 @@ static inline bool deadline_check_fifo(struct dd_per_prio *per_prio,
return time_is_before_eq_jiffies((unsigned long)rq->fifo_time);
}
-/*
- * Check if rq has a sequential request preceding it.
- */
-static bool deadline_is_seq_write(struct deadline_data *dd, struct request *rq)
-{
- struct request *prev = deadline_earlier_request(rq);
-
- if (!prev)
- return false;
-
- return blk_rq_pos(prev) + blk_rq_sectors(prev) == blk_rq_pos(rq);
-}
-
-/*
- * Skip all write requests that are sequential from @rq, even if we cross
- * a zone boundary.
- */
-static struct request *deadline_skip_seq_writes(struct deadline_data *dd,
- struct request *rq)
-{
- sector_t pos = blk_rq_pos(rq);
-
- do {
- pos += blk_rq_sectors(rq);
- rq = deadline_latter_request(rq);
- } while (rq && blk_rq_pos(rq) == pos);
-
- return rq;
-}
-
/*
* For the specified data direction, return the next request to
* dispatch using arrival ordered lists.
@@ -360,40 +320,10 @@ static struct request *
deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
enum dd_data_dir data_dir)
{
- struct request *rq, *rb_rq, *next;
- unsigned long flags;
-
if (list_empty(&per_prio->fifo_list[data_dir]))
return NULL;
- rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next);
- if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q))
- return rq;
-
- /*
- * Look for a write request that can be dispatched, that is one with
- * an unlocked target zone. For some HDDs, breaking a sequential
- * write stream can lead to lower throughput, so make sure to preserve
- * sequential write streams, even if that stream crosses into the next
- * zones and these zones are unlocked.
- */
- spin_lock_irqsave(&dd->zone_lock, flags);
- list_for_each_entry_safe(rq, next, &per_prio->fifo_list[DD_WRITE],
- queuelist) {
- /* Check whether a prior request exists for the same zone. */
- rb_rq = deadline_from_pos(per_prio, data_dir, blk_rq_pos(rq));
- if (rb_rq && blk_rq_pos(rb_rq) < blk_rq_pos(rq))
- rq = rb_rq;
- if (blk_req_can_dispatch_to_zone(rq) &&
- (blk_queue_nonrot(rq->q) ||
- !deadline_is_seq_write(dd, rq)))
- goto out;
- }
- rq = NULL;
-out:
- spin_unlock_irqrestore(&dd->zone_lock, flags);
-
- return rq;
+ return rq_entry_fifo(per_prio->fifo_list[data_dir].next);
}
/*
@@ -404,36 +334,8 @@ static struct request *
deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
enum dd_data_dir data_dir)
{
- struct request *rq;
- unsigned long flags;
-
- rq = deadline_from_pos(per_prio, data_dir,
- per_prio->latest_pos[data_dir]);
- if (!rq)
- return NULL;
-
- if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q))
- return rq;
-
- /*
- * Look for a write request that can be dispatched, that is one with
- * an unlocked target zone. For some HDDs, breaking a sequential
- * write stream can lead to lower throughput, so make sure to preserve
- * sequential write streams, even if that stream crosses into the next
- * zones and these zones are unlocked.
- */
- spin_lock_irqsave(&dd->zone_lock, flags);
- while (rq) {
- if (blk_req_can_dispatch_to_zone(rq))
- break;
- if (blk_queue_nonrot(rq->q))
- rq = deadline_latter_request(rq);
- else
- rq = deadline_skip_seq_writes(dd, rq);
- }
- spin_unlock_irqrestore(&dd->zone_lock, flags);
-
- return rq;
+ return deadline_from_pos(per_prio, data_dir,
+ per_prio->latest_pos[data_dir]);
}
/*
@@ -539,10 +441,6 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
rq = next_rq;
}
- /*
- * For a zoned block device, if we only have writes queued and none of
- * them can be dispatched, rq will be NULL.
- */
if (!rq)
return NULL;
@@ -563,10 +461,6 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
prio = ioprio_class_to_prio[ioprio_class];
dd->per_prio[prio].latest_pos[data_dir] = blk_rq_pos(rq);
dd->per_prio[prio].stats.dispatched++;
- /*
- * If the request needs its target zone locked, do it.
- */
- blk_req_zone_write_lock(rq);
rq->rq_flags |= RQF_STARTED;
return rq;
}
@@ -766,7 +660,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
spin_lock_init(&dd->lock);
spin_lock_init(&dd->insert_lock);
- spin_lock_init(&dd->zone_lock);
INIT_LIST_HEAD(&dd->at_head);
INIT_LIST_HEAD(&dd->at_tail);
@@ -879,12 +772,6 @@ static void dd_insert_request(struct request_queue *q, struct request *rq,
lockdep_assert_held(&dd->lock);
- /*
- * This may be a requeue of a write request that has locked its
- * target zone. If it is the case, this releases the zone lock.
- */
- blk_req_zone_write_unlock(rq);
-
prio = ioprio_class_to_prio[ioprio_class];
per_prio = &dd->per_prio[prio];
if (!rq->elv.priv[0]) {
@@ -916,18 +803,6 @@ static void dd_insert_request(struct request_queue *q, struct request *rq,
*/
rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
insert_before = &per_prio->fifo_list[data_dir];
-#ifdef CONFIG_BLK_DEV_ZONED
- /*
- * Insert zoned writes such that requests are sorted by
- * position per zone.
- */
- if (blk_rq_is_seq_zoned_write(rq)) {
- struct request *rq2 = deadline_latter_request(rq);
-
- if (rq2 && blk_rq_zone_no(rq2) == blk_rq_zone_no(rq))
- insert_before = &rq2->queuelist;
- }
-#endif
list_add_tail(&rq->queuelist, insert_before);
}
}
@@ -956,33 +831,8 @@ static void dd_prepare_request(struct request *rq)
rq->elv.priv[0] = NULL;
}
-static bool dd_has_write_work(struct blk_mq_hw_ctx *hctx)
-{
- struct deadline_data *dd = hctx->queue->elevator->elevator_data;
- enum dd_prio p;
-
- for (p = 0; p <= DD_PRIO_MAX; p++)
- if (!list_empty_careful(&dd->per_prio[p].fifo_list[DD_WRITE]))
- return true;
-
- return false;
-}
-
/*
* Callback from inside blk_mq_free_request().
- *
- * For zoned block devices, write unlock the target zone of
- * completed write requests. Do this while holding the zone lock
- * spinlock so that the zone is never unlocked while deadline_fifo_request()
- * or deadline_next_request() are executing. This function is called for
- * all requests, whether or not these requests complete successfully.
- *
- * For a zoned block device, __dd_dispatch_request() may have stopped
- * dispatching requests if all the queued requests are write requests directed
- * at zones that are already locked due to on-going write requests. To ensure
- * write request dispatch progress in this case, mark the queue as needing a
- * restart to ensure that the queue is run again after completion of the
- * request and zones being unlocked.
*/
static void dd_finish_request(struct request *rq)
{
@@ -997,21 +847,8 @@ static void dd_finish_request(struct request *rq)
* called dd_insert_requests(). Skip requests that bypassed I/O
* scheduling. See also blk_mq_request_bypass_insert().
*/
- if (!rq->elv.priv[0])
- return;
-
- atomic_inc(&per_prio->stats.completed);
-
- if (blk_queue_is_zoned(q)) {
- unsigned long flags;
-
- spin_lock_irqsave(&dd->zone_lock, flags);
- blk_req_zone_write_unlock(rq);
- spin_unlock_irqrestore(&dd->zone_lock, flags);
-
- if (dd_has_write_work(rq->mq_hctx))
- blk_mq_sched_mark_restart_hctx(rq->mq_hctx);
- }
+ if (rq->elv.priv[0])
+ atomic_inc(&per_prio->stats.completed);
}
static bool dd_has_work_for_prio(struct dd_per_prio *per_prio)
@@ -1339,7 +1176,6 @@ static struct elevator_type mq_deadline = {
.elevator_attrs = deadline_attrs,
.elevator_name = "mq-deadline",
.elevator_alias = "deadline",
- .elevator_features = ELEVATOR_F_ZBD_SEQ_WRITE,
.elevator_owner = THIS_MODULE,
};
MODULE_ALIAS("mq-deadline-iosched");
--
2.43.0
Thread overview: 107+ messages
2024-02-02 7:30 [PATCH 00/26] Zone write plugging Damien Le Moal
2024-02-02 7:30 ` [PATCH 01/26] block: Restore sector of flush requests Damien Le Moal
2024-02-04 11:55 ` Hannes Reinecke
2024-02-05 17:22 ` Bart Van Assche
2024-02-05 23:42 ` Damien Le Moal
2024-02-02 7:30 ` [PATCH 02/26] block: Remove req_bio_endio() Damien Le Moal
2024-02-04 11:57 ` Hannes Reinecke
2024-02-05 17:28 ` Bart Van Assche
2024-02-05 23:45 ` Damien Le Moal
2024-02-09 6:53 ` Damien Le Moal
2024-02-02 7:30 ` [PATCH 03/26] block: Introduce bio_straddle_zones() and bio_offset_from_zone_start() Damien Le Moal
2024-02-03 4:09 ` Bart Van Assche
2024-02-04 11:58 ` Hannes Reinecke
2024-02-02 7:30 ` [PATCH 04/26] block: Introduce blk_zone_complete_request_bio() Damien Le Moal
2024-02-04 11:59 ` Hannes Reinecke
2024-02-02 7:30 ` [PATCH 05/26] block: Allow using bio_attempt_back_merge() internally Damien Le Moal
2024-02-03 4:11 ` Bart Van Assche
2024-02-04 12:00 ` Hannes Reinecke
2024-02-02 7:30 ` [PATCH 06/26] block: Introduce zone write plugging Damien Le Moal
2024-02-04 3:56 ` Ming Lei
2024-02-04 23:57 ` Damien Le Moal
2024-02-05 2:19 ` Ming Lei
2024-02-05 2:41 ` Damien Le Moal
2024-02-05 3:38 ` Ming Lei
2024-02-05 5:11 ` Christoph Hellwig
2024-02-05 5:37 ` Damien Le Moal
2024-02-05 5:50 ` Christoph Hellwig
2024-02-05 6:14 ` Damien Le Moal
2024-02-05 10:06 ` Ming Lei
2024-02-05 12:20 ` Damien Le Moal
2024-02-05 12:43 ` Damien Le Moal
2024-02-04 12:14 ` Hannes Reinecke
2024-02-05 17:48 ` Bart Van Assche
2024-02-05 23:48 ` Damien Le Moal
2024-02-06 0:52 ` Bart Van Assche
2024-02-02 7:30 ` [PATCH 07/26] block: Allow zero value of max_zone_append_sectors queue limit Damien Le Moal
2024-02-04 12:15 ` Hannes Reinecke
2024-02-02 7:30 ` [PATCH 08/26] block: Implement zone append emulation Damien Le Moal
2024-02-04 12:24 ` Hannes Reinecke
2024-02-05 0:10 ` Damien Le Moal
2024-02-05 17:58 ` Bart Van Assche
2024-02-05 23:57 ` Damien Le Moal
2024-02-02 7:30 ` [PATCH 09/26] block: Allow BIO-based drivers to use blk_revalidate_disk_zones() Damien Le Moal
2024-02-04 12:26 ` Hannes Reinecke
2024-02-02 7:30 ` [PATCH 10/26] dm: Use the block layer zone append emulation Damien Le Moal
2024-02-03 17:58 ` Mike Snitzer
2024-02-05 5:38 ` Damien Le Moal
2024-02-05 20:33 ` Mike Snitzer
2024-02-05 23:40 ` Damien Le Moal
2024-02-06 20:41 ` Mike Snitzer
2024-02-04 12:30 ` Hannes Reinecke
2024-02-02 7:30 ` [PATCH 11/26] scsi: sd: " Damien Le Moal
2024-02-04 12:29 ` Hannes Reinecke
2024-02-06 1:55 ` Martin K. Petersen
2024-02-02 7:30 ` [PATCH 12/26] ublk_drv: Do not request ELEVATOR_F_ZBD_SEQ_WRITE elevator feature Damien Le Moal
2024-02-04 12:31 ` Hannes Reinecke
2024-02-02 7:30 ` [PATCH 13/26] null_blk: " Damien Le Moal
2024-02-04 12:31 ` Hannes Reinecke
2024-02-02 7:30 ` [PATCH 14/26] null_blk: Introduce zone_append_max_sectors attribute Damien Le Moal
2024-02-04 12:32 ` Hannes Reinecke
2024-02-02 7:30 ` [PATCH 15/26] null_blk: Introduce fua attribute Damien Le Moal
2024-02-04 12:33 ` Hannes Reinecke
2024-02-02 7:30 ` [PATCH 16/26] nvmet: zns: Do not reference the gendisk conv_zones_bitmap Damien Le Moal
2024-02-04 12:34 ` Hannes Reinecke
2024-02-02 7:30 ` [PATCH 17/26] block: Remove BLK_STS_ZONE_RESOURCE Damien Le Moal
2024-02-04 12:34 ` Hannes Reinecke
2024-02-02 7:30 ` [PATCH 18/26] block: Simplify blk_revalidate_disk_zones() interface Damien Le Moal
2024-02-04 12:35 ` Hannes Reinecke
2024-02-02 7:30 ` Damien Le Moal [this message]
2024-02-04 12:36 ` [PATCH 19/26] block: mq-deadline: Remove support for zone write locking Hannes Reinecke
2024-02-02 7:30 ` [PATCH 20/26] block: Remove elevator required features Damien Le Moal
2024-02-04 12:36 ` Hannes Reinecke
2024-02-02 7:30 ` [PATCH 21/26] block: Do not check zone type in blk_check_zone_append() Damien Le Moal
2024-02-04 12:37 ` Hannes Reinecke
2024-02-02 7:31 ` [PATCH 22/26] block: Move zone related debugfs attribute to blk-zoned.c Damien Le Moal
2024-02-04 12:38 ` Hannes Reinecke
2024-02-02 7:31 ` [PATCH 23/26] block: Remove zone write locking Damien Le Moal
2024-02-04 12:38 ` Hannes Reinecke
2024-02-02 7:31 ` [PATCH 24/26] block: Do not special-case plugging of zone write operations Damien Le Moal
2024-02-04 12:39 ` Hannes Reinecke
2024-02-02 7:31 ` [PATCH 25/26] block: Reduce zone write plugging memory usage Damien Le Moal
2024-02-04 12:42 ` Hannes Reinecke
2024-02-05 17:51 ` Bart Van Assche
2024-02-05 23:55 ` Damien Le Moal
2024-02-06 21:20 ` Bart Van Assche
2024-02-09 3:58 ` Damien Le Moal
2024-02-09 19:36 ` Bart Van Assche
2024-02-10 0:06 ` Damien Le Moal
2024-02-11 3:40 ` Bart Van Assche
2024-02-12 1:09 ` Damien Le Moal
2024-02-12 18:58 ` Bart Van Assche
2024-02-12 8:23 ` Damien Le Moal
2024-02-12 8:47 ` Damien Le Moal
2024-02-12 18:40 ` Bart Van Assche
2024-02-13 0:05 ` Damien Le Moal
2024-02-02 7:31 ` [PATCH 26/26] block: Add zone_active_wplugs debugfs entry Damien Le Moal
2024-02-04 12:43 ` Hannes Reinecke
2024-02-02 7:37 ` [PATCH 00/26] Zone write plugging Damien Le Moal
2024-02-03 12:11 ` Jens Axboe
2024-02-09 5:28 ` Damien Le Moal
2024-02-05 17:21 ` Bart Van Assche
2024-02-05 23:42 ` Damien Le Moal
2024-02-06 0:57 ` Bart Van Assche
2024-02-05 18:18 ` Bart Van Assche
2024-02-06 0:07 ` Damien Le Moal
2024-02-06 1:25 ` Bart Van Assche
2024-02-09 4:03 ` Damien Le Moal