[PATCH 8/8] block/bfq: use separate insertion lists

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Jens Axboe <axboe@kernel.dk>
To: linux-block@vger.kernel.org
Cc: Jens Axboe <axboe@kernel.dk>
Subject: [PATCH 8/8] block/bfq: use separate insertion lists
Date: Tue, 23 Jan 2024 10:34:20 -0700	[thread overview]
Message-ID: <20240123174021.1967461-9-axboe@kernel.dk> (raw)
In-Reply-To: <20240123174021.1967461-1-axboe@kernel.dk>

Based on the similar patch for mq-deadline, this uses separate
insertion lists so we can defer touching dd->lock until dispatch
time.

This improves the following fio workload:

fio --bs=512 --group_reporting=1 --gtod_reduce=1 --invalidate=1 \
        --ioengine=io_uring --norandommap --runtime=60 --rw=randread \
        --thread --time_based=1 --buffered=0 --fixedbufs=1 --numjobs=32 \
        --iodepth=4 --iodepth_batch_submit=4 --iodepth_batch_complete=4 \
        --name=/dev/nvme0n1 --filename=/dev/nvme0n1

from:

/dev/nvme0n1: (groupid=0, jobs=32): err= 0: pid=1113: Fri Jan 19 20:59:26 2024
  read: IOPS=567k, BW=277MiB/s (290MB/s)(1820MiB/6575msec)
   bw (  KiB/s): min=274824, max=291156, per=100.00%, avg=283930.08, stdev=143.01, samples=416
   iops        : min=549648, max=582312, avg=567860.31, stdev=286.01, samples=416
  cpu          : usr=0.18%, sys=86.04%, ctx=866079, majf=0, minf=0
  IO depths    : 1=0.0%, 2=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     issued rwts: total=3728344,0,0,0 short=0,0,0,0 dropped=0,0,0,0
     latency   : target=0, window=0, percentile=100.00%, depth=4

with 96% lock contention and 86% sys time, to:

/dev/nvme0n1: (groupid=0, jobs=32): err= 0: pid=8922: Sat Jan 20 11:16:20 2024
  read: IOPS=1550k, BW=757MiB/s (794MB/s)(19.6GiB/26471msec)
   bw (  KiB/s): min=754668, max=848896, per=100.00%, avg=775459.33, stdev=624.43, samples=1664
   iops        : min=1509336, max=1697793, avg=1550918.83, stdev=1248.87, samples=1664
  cpu          : usr=1.34%, sys=14.49%, ctx=9950560, majf=0, minf=0
  IO depths    : 1=0.0%, 2=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     issued rwts: total=41042924,0,0,0 short=0,0,0,0 dropped=0,0,0,0
     latency   : target=0, window=0, percentile=100.00%, depth=4

with ~30% lock contention and 14.5% sys time, by applying the lessons
learnt with scaling mq-deadline. Patch needs to be split, but it:

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bfq-iosched.c | 70 ++++++++++++++++++++++++++++++++++-----------
 block/bfq-iosched.h |  4 +++
 2 files changed, 58 insertions(+), 16 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index ea16a0c53082..9bd57baa4b0b 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -5174,6 +5174,10 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
 {
 	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
 
+	if (!list_empty_careful(&bfqd->at_head) ||
+	    !list_empty_careful(&bfqd->at_tail))
+		return true;
+
 	/*
 	 * Avoiding lock: a race on bfqd->queued should cause at
 	 * most a call to dispatch for nothing
@@ -5323,12 +5327,44 @@ static inline void bfq_update_dispatch_stats(struct request_queue *q,
 					     bool idle_timer_disabled) {}
 #endif /* CONFIG_BFQ_CGROUP_DEBUG */
 
+static void bfq_insert_request(struct request_queue *q, struct request *rq,
+			       blk_insert_t flags, struct list_head *free);
+
+static void __bfq_do_insert(struct request_queue *q, blk_insert_t flags,
+			    struct list_head *list, struct list_head *free)
+{
+	while (!list_empty(list)) {
+		struct request *rq;
+
+		rq = list_first_entry(list, struct request, queuelist);
+		list_del_init(&rq->queuelist);
+		bfq_insert_request(q, rq, flags, free);
+	}
+}
+
+static void bfq_do_insert(struct request_queue *q, struct list_head *free)
+{
+	struct bfq_data *bfqd = q->elevator->elevator_data;
+	LIST_HEAD(at_head);
+	LIST_HEAD(at_tail);
+
+	spin_lock(&bfqd->insert_lock);
+	list_splice_init(&bfqd->at_head, &at_head);
+	list_splice_init(&bfqd->at_tail, &at_tail);
+	spin_unlock(&bfqd->insert_lock);
+
+	__bfq_do_insert(q, BLK_MQ_INSERT_AT_HEAD, &at_head, free);
+	__bfq_do_insert(q, 0, &at_tail, free);
+}
+
 static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
 {
-	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
+	struct request_queue *q = hctx->queue;
+	struct bfq_data *bfqd = q->elevator->elevator_data;
 	struct request *rq;
 	struct bfq_queue *in_serv_queue;
 	bool waiting_rq, idle_timer_disabled = false;
+	LIST_HEAD(free);
 
 	/*
 	 * If someone else is already dispatching, skip this one. This will
@@ -5344,6 +5380,8 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
 
 	spin_lock_irq(&bfqd->lock);
 
+	bfq_do_insert(hctx->queue, &free);
+
 	in_serv_queue = bfqd->in_service_queue;
 	waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue);
 
@@ -5355,6 +5393,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
 
 	clear_bit_unlock(BFQ_DISPATCHING, &bfqd->run_state);
 	spin_unlock_irq(&bfqd->lock);
+	blk_mq_free_requests(&free);
 	bfq_update_dispatch_stats(hctx->queue, rq,
 			idle_timer_disabled ? in_serv_queue : NULL,
 				idle_timer_disabled);
@@ -6276,25 +6315,20 @@ static inline void bfq_update_insert_stats(struct request_queue *q,
 static struct bfq_queue *bfq_init_rq(struct request *rq);
 
 static void bfq_insert_request(struct request_queue *q, struct request *rq,
-			       blk_insert_t flags)
+			       blk_insert_t flags, struct list_head *free)
 {
 	struct bfq_data *bfqd = q->elevator->elevator_data;
 	struct bfq_queue *bfqq;
 	bool idle_timer_disabled = false;
 	blk_opf_t cmd_flags;
-	LIST_HEAD(free);
 
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
 	if (!cgroup_subsys_on_dfl(io_cgrp_subsys) && rq->bio)
 		bfqg_stats_update_legacy_io(q, rq);
 #endif
-	spin_lock_irq(&bfqd->lock);
 	bfqq = bfq_init_rq(rq);
-	if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
-		spin_unlock_irq(&bfqd->lock);
-		blk_mq_free_requests(&free);
+	if (blk_mq_sched_try_insert_merge(q, rq, free))
 		return;
-	}
 
 	trace_block_rq_insert(rq);
 
@@ -6324,8 +6358,6 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq,
 	 * merge).
 	 */
 	cmd_flags = rq->cmd_flags;
-	spin_unlock_irq(&bfqd->lock);
-
 	bfq_update_insert_stats(q, bfqq, idle_timer_disabled,
 				cmd_flags);
 }
@@ -6334,13 +6366,15 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
 				struct list_head *list,
 				blk_insert_t flags)
 {
-	while (!list_empty(list)) {
-		struct request *rq;
+	struct request_queue *q = hctx->queue;
+	struct bfq_data *bfqd = q->elevator->elevator_data;
 
-		rq = list_first_entry(list, struct request, queuelist);
-		list_del_init(&rq->queuelist);
-		bfq_insert_request(hctx->queue, rq, flags);
-	}
+	spin_lock_irq(&bfqd->insert_lock);
+	if (flags & BLK_MQ_INSERT_AT_HEAD)
+		list_splice_init(list, &bfqd->at_head);
+	else
+		list_splice_init(list, &bfqd->at_tail);
+	spin_unlock_irq(&bfqd->insert_lock);
 }
 
 static void bfq_update_hw_tag(struct bfq_data *bfqd)
@@ -7250,6 +7284,10 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 	spin_unlock_irq(&q->queue_lock);
 
 	spin_lock_init(&bfqd->lock);
+	spin_lock_init(&bfqd->insert_lock);
+
+	INIT_LIST_HEAD(&bfqd->at_head);
+	INIT_LIST_HEAD(&bfqd->at_tail);
 
 	/*
 	 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 56ff69f22163..f44f5d4ec2f4 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -516,10 +516,14 @@ enum {
 struct bfq_data {
 	struct {
 		spinlock_t lock;
+		spinlock_t insert_lock;
 	} ____cacheline_aligned_in_smp;
 
 	unsigned long run_state;
 
+	struct list_head at_head;
+	struct list_head at_tail;
+
 	/* device request queue */
 	struct request_queue *queue;
 	/* dispatch queue */
-- 
2.43.0

next prev parent reply	other threads:[~2024-01-23 17:40 UTC|newest]

Thread overview: 30+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-01-23 17:34 [PATCHSET v3] mq-deadline and BFQ scalability improvements Jens Axboe
2024-01-23 17:34 ` [PATCH 1/8] block/mq-deadline: pass in queue directly to dd_insert_request() Jens Axboe
2024-01-24  9:21   ` Johannes Thumshirn
2024-01-23 17:34 ` [PATCH 2/8] block/mq-deadline: serialize request dispatching Jens Axboe
2024-01-23 18:36   ` Bart Van Assche
2024-01-23 19:13     ` Jens Axboe
2024-01-24  9:31       ` Christoph Hellwig
2024-01-24 15:00         ` Jens Axboe
2024-01-24  9:29   ` Johannes Thumshirn
2024-01-23 17:34 ` [PATCH 3/8] block/mq-deadline: skip expensive merge lookups if contended Jens Axboe
2024-01-24  9:31   ` Johannes Thumshirn
2024-01-24  9:32   ` Christoph Hellwig
2024-01-24 15:02     ` Jens Axboe
2024-01-23 17:34 ` [PATCH 4/8] block/mq-deadline: use separate insertion lists Jens Axboe
2024-01-23 18:37   ` Bart Van Assche
2024-01-24  9:42   ` Johannes Thumshirn
2024-01-23 17:34 ` [PATCH 5/8] block/bfq: pass in queue directly to bfq_insert_request() Jens Axboe
2024-01-23 18:38   ` Bart Van Assche
2024-01-24  9:46   ` Johannes Thumshirn
2024-01-23 17:34 ` [PATCH 6/8] block/bfq: serialize request dispatching Jens Axboe
2024-01-23 18:40   ` Bart Van Assche
2024-01-23 19:14     ` Jens Axboe
2024-01-23 17:34 ` [PATCH 7/8] block/bfq: skip expensive merge lookups if contended Jens Axboe
2024-01-23 18:44   ` Bart Van Assche
2024-01-23 19:14     ` Jens Axboe
2024-01-23 17:34 ` Jens Axboe [this message]
2024-01-23 18:47   ` [PATCH 8/8] block/bfq: use separate insertion lists Bart Van Assche
2024-01-23 19:18     ` Jens Axboe
2024-01-23 20:03 ` [PATCHSET v3] mq-deadline and BFQ scalability improvements Oleksandr Natalenko
2024-01-23 22:14   ` Jens Axboe

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:ea16a0c5308 dfblob:9bd57baa4b0 dfblob:56ff69f2216
dfblob:f44f5d4ec2f )
 OR (
bs:"[PATCH 8/8] block/bfq: use separate insertion lists" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240123174021.1967461-9-axboe@kernel.dk \
    --to=axboe@kernel.dk \
    --cc=linux-block@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.