From: Jens Axboe <axboe@kernel.dk>
To: linux-block@vger.kernel.org
Cc: Jens Axboe <axboe@kernel.dk>
Subject: [PATCH 8/8] block/bfq: use separate insertion lists
Date: Tue, 23 Jan 2024 10:34:20 -0700 [thread overview]
Message-ID: <20240123174021.1967461-9-axboe@kernel.dk> (raw)
In-Reply-To: <20240123174021.1967461-1-axboe@kernel.dk>
Based on the similar patch for mq-deadline, this uses separate
insertion lists so we can defer touching dd->lock until dispatch
time.
This improves the following fio workload:
fio --bs=512 --group_reporting=1 --gtod_reduce=1 --invalidate=1 \
--ioengine=io_uring --norandommap --runtime=60 --rw=randread \
--thread --time_based=1 --buffered=0 --fixedbufs=1 --numjobs=32 \
--iodepth=4 --iodepth_batch_submit=4 --iodepth_batch_complete=4 \
--name=/dev/nvme0n1 --filename=/dev/nvme0n1
from:
/dev/nvme0n1: (groupid=0, jobs=32): err= 0: pid=1113: Fri Jan 19 20:59:26 2024
read: IOPS=567k, BW=277MiB/s (290MB/s)(1820MiB/6575msec)
bw ( KiB/s): min=274824, max=291156, per=100.00%, avg=283930.08, stdev=143.01, samples=416
iops : min=549648, max=582312, avg=567860.31, stdev=286.01, samples=416
cpu : usr=0.18%, sys=86.04%, ctx=866079, majf=0, minf=0
IO depths : 1=0.0%, 2=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0%
submit : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
complete : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
issued rwts: total=3728344,0,0,0 short=0,0,0,0 dropped=0,0,0,0
latency : target=0, window=0, percentile=100.00%, depth=4
with 96% lock contention and 86% sys time, to:
/dev/nvme0n1: (groupid=0, jobs=32): err= 0: pid=8922: Sat Jan 20 11:16:20 2024
read: IOPS=1550k, BW=757MiB/s (794MB/s)(19.6GiB/26471msec)
bw ( KiB/s): min=754668, max=848896, per=100.00%, avg=775459.33, stdev=624.43, samples=1664
iops : min=1509336, max=1697793, avg=1550918.83, stdev=1248.87, samples=1664
cpu : usr=1.34%, sys=14.49%, ctx=9950560, majf=0, minf=0
IO depths : 1=0.0%, 2=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0%
submit : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
complete : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
issued rwts: total=41042924,0,0,0 short=0,0,0,0 dropped=0,0,0,0
latency : target=0, window=0, percentile=100.00%, depth=4
with ~30% lock contention and 14.5% sys time, by applying the lessons
learnt with scaling mq-deadline. Note: this patch still needs to be
split into smaller pieces.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
block/bfq-iosched.c | 70 ++++++++++++++++++++++++++++++++++-----------
block/bfq-iosched.h | 4 +++
2 files changed, 58 insertions(+), 16 deletions(-)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index ea16a0c53082..9bd57baa4b0b 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -5174,6 +5174,10 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
{
struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
+ if (!list_empty_careful(&bfqd->at_head) ||
+ !list_empty_careful(&bfqd->at_tail))
+ return true;
+
/*
* Avoiding lock: a race on bfqd->queued should cause at
* most a call to dispatch for nothing
@@ -5323,12 +5327,44 @@ static inline void bfq_update_dispatch_stats(struct request_queue *q,
bool idle_timer_disabled) {}
#endif /* CONFIG_BFQ_CGROUP_DEBUG */
+static void bfq_insert_request(struct request_queue *q, struct request *rq,
+ blk_insert_t flags, struct list_head *free);
+
+static void __bfq_do_insert(struct request_queue *q, blk_insert_t flags,
+ struct list_head *list, struct list_head *free)
+{
+ while (!list_empty(list)) {
+ struct request *rq;
+
+ rq = list_first_entry(list, struct request, queuelist);
+ list_del_init(&rq->queuelist);
+ bfq_insert_request(q, rq, flags, free);
+ }
+}
+
+static void bfq_do_insert(struct request_queue *q, struct list_head *free)
+{
+ struct bfq_data *bfqd = q->elevator->elevator_data;
+ LIST_HEAD(at_head);
+ LIST_HEAD(at_tail);
+
+ spin_lock(&bfqd->insert_lock);
+ list_splice_init(&bfqd->at_head, &at_head);
+ list_splice_init(&bfqd->at_tail, &at_tail);
+ spin_unlock(&bfqd->insert_lock);
+
+ __bfq_do_insert(q, BLK_MQ_INSERT_AT_HEAD, &at_head, free);
+ __bfq_do_insert(q, 0, &at_tail, free);
+}
+
static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
- struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
+ struct request_queue *q = hctx->queue;
+ struct bfq_data *bfqd = q->elevator->elevator_data;
struct request *rq;
struct bfq_queue *in_serv_queue;
bool waiting_rq, idle_timer_disabled = false;
+ LIST_HEAD(free);
/*
* If someone else is already dispatching, skip this one. This will
@@ -5344,6 +5380,8 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
spin_lock_irq(&bfqd->lock);
+ bfq_do_insert(hctx->queue, &free);
+
in_serv_queue = bfqd->in_service_queue;
waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue);
@@ -5355,6 +5393,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
clear_bit_unlock(BFQ_DISPATCHING, &bfqd->run_state);
spin_unlock_irq(&bfqd->lock);
+ blk_mq_free_requests(&free);
bfq_update_dispatch_stats(hctx->queue, rq,
idle_timer_disabled ? in_serv_queue : NULL,
idle_timer_disabled);
@@ -6276,25 +6315,20 @@ static inline void bfq_update_insert_stats(struct request_queue *q,
static struct bfq_queue *bfq_init_rq(struct request *rq);
static void bfq_insert_request(struct request_queue *q, struct request *rq,
- blk_insert_t flags)
+ blk_insert_t flags, struct list_head *free)
{
struct bfq_data *bfqd = q->elevator->elevator_data;
struct bfq_queue *bfqq;
bool idle_timer_disabled = false;
blk_opf_t cmd_flags;
- LIST_HEAD(free);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
if (!cgroup_subsys_on_dfl(io_cgrp_subsys) && rq->bio)
bfqg_stats_update_legacy_io(q, rq);
#endif
- spin_lock_irq(&bfqd->lock);
bfqq = bfq_init_rq(rq);
- if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
- spin_unlock_irq(&bfqd->lock);
- blk_mq_free_requests(&free);
+ if (blk_mq_sched_try_insert_merge(q, rq, free))
return;
- }
trace_block_rq_insert(rq);
@@ -6324,8 +6358,6 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq,
* merge).
*/
cmd_flags = rq->cmd_flags;
- spin_unlock_irq(&bfqd->lock);
-
bfq_update_insert_stats(q, bfqq, idle_timer_disabled,
cmd_flags);
}
@@ -6334,13 +6366,15 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
struct list_head *list,
blk_insert_t flags)
{
- while (!list_empty(list)) {
- struct request *rq;
+ struct request_queue *q = hctx->queue;
+ struct bfq_data *bfqd = q->elevator->elevator_data;
- rq = list_first_entry(list, struct request, queuelist);
- list_del_init(&rq->queuelist);
- bfq_insert_request(hctx->queue, rq, flags);
- }
+ spin_lock_irq(&bfqd->insert_lock);
+ if (flags & BLK_MQ_INSERT_AT_HEAD)
+ list_splice_init(list, &bfqd->at_head);
+ else
+ list_splice_init(list, &bfqd->at_tail);
+ spin_unlock_irq(&bfqd->insert_lock);
}
static void bfq_update_hw_tag(struct bfq_data *bfqd)
@@ -7250,6 +7284,10 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
spin_unlock_irq(&q->queue_lock);
spin_lock_init(&bfqd->lock);
+ spin_lock_init(&bfqd->insert_lock);
+
+ INIT_LIST_HEAD(&bfqd->at_head);
+ INIT_LIST_HEAD(&bfqd->at_tail);
/*
* Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 56ff69f22163..f44f5d4ec2f4 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -516,10 +516,14 @@ enum {
struct bfq_data {
struct {
spinlock_t lock;
+ spinlock_t insert_lock;
} ____cacheline_aligned_in_smp;
unsigned long run_state;
+ struct list_head at_head;
+ struct list_head at_tail;
+
/* device request queue */
struct request_queue *queue;
/* dispatch queue */
--
2.43.0
next prev parent reply other threads:[~2024-01-23 17:40 UTC|newest]
Thread overview: 30+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-01-23 17:34 [PATCHSET v3] mq-deadline and BFQ scalability improvements Jens Axboe
2024-01-23 17:34 ` [PATCH 1/8] block/mq-deadline: pass in queue directly to dd_insert_request() Jens Axboe
2024-01-24 9:21 ` Johannes Thumshirn
2024-01-23 17:34 ` [PATCH 2/8] block/mq-deadline: serialize request dispatching Jens Axboe
2024-01-23 18:36 ` Bart Van Assche
2024-01-23 19:13 ` Jens Axboe
2024-01-24 9:31 ` Christoph Hellwig
2024-01-24 15:00 ` Jens Axboe
2024-01-24 9:29 ` Johannes Thumshirn
2024-01-23 17:34 ` [PATCH 3/8] block/mq-deadline: skip expensive merge lookups if contended Jens Axboe
2024-01-24 9:31 ` Johannes Thumshirn
2024-01-24 9:32 ` Christoph Hellwig
2024-01-24 15:02 ` Jens Axboe
2024-01-23 17:34 ` [PATCH 4/8] block/mq-deadline: use separate insertion lists Jens Axboe
2024-01-23 18:37 ` Bart Van Assche
2024-01-24 9:42 ` Johannes Thumshirn
2024-01-23 17:34 ` [PATCH 5/8] block/bfq: pass in queue directly to bfq_insert_request() Jens Axboe
2024-01-23 18:38 ` Bart Van Assche
2024-01-24 9:46 ` Johannes Thumshirn
2024-01-23 17:34 ` [PATCH 6/8] block/bfq: serialize request dispatching Jens Axboe
2024-01-23 18:40 ` Bart Van Assche
2024-01-23 19:14 ` Jens Axboe
2024-01-23 17:34 ` [PATCH 7/8] block/bfq: skip expensive merge lookups if contended Jens Axboe
2024-01-23 18:44 ` Bart Van Assche
2024-01-23 19:14 ` Jens Axboe
2024-01-23 17:34 ` Jens Axboe [this message]
2024-01-23 18:47 ` [PATCH 8/8] block/bfq: use separate insertion lists Bart Van Assche
2024-01-23 19:18 ` Jens Axboe
2024-01-23 20:03 ` [PATCHSET v3] mq-deadline and BFQ scalability improvements Oleksandr Natalenko
2024-01-23 22:14 ` Jens Axboe
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240123174021.1967461-9-axboe@kernel.dk \
--to=axboe@kernel.dk \
--cc=linux-block@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).