[PATCH 3/4] block/mq-deadline: fallback to per-cpu insertion buckets under contention

public inbox for linux-block@vger.kernel.org
 help / color / mirror / Atom feed

From: Jens Axboe <axboe@kernel.dk>
To: linux-block@vger.kernel.org
Cc: bvanassche@acm.org, Jens Axboe <axboe@kernel.dk>
Subject: [PATCH 3/4] block/mq-deadline: fallback to per-cpu insertion buckets under contention
Date: Fri, 19 Jan 2024 09:02:08 -0700	[thread overview]
Message-ID: <20240119160338.1191281-4-axboe@kernel.dk> (raw)
In-Reply-To: <20240119160338.1191281-1-axboe@kernel.dk>

If we attempt to insert a list of requests, but someone else is already
running an insertion, then fall back to queueing that list internally
and let the existing inserter finish the operation. The current inserter
will either see and flush this list, or, if it ends before we're done
doing our bucket insert, we'll flush it and insert ourselves.

This reduces contention on the dd->lock, which protects any request
insertion or dispatch, by having a backup point to insert into which
will either be flushed immediately or by an existing inserter. As the
alternative is to just keep spinning on the dd->lock, it's very easy
to get into a situation where multiple processes are trying to do IO
and all sit and spin on this lock.

With the previous dispatch optimization, this drastically reduces
contention for a sample case of 32 threads doing IO to devices. The
test case looks as follows:

fio --bs=512 --group_reporting=1 --gtod_reduce=1 --invalidate=1 \
	--ioengine=io_uring --norandommap --runtime=60 --rw=randread \
	--thread --time_based=1 --buffered=0 --fixedbufs=1 --numjobs=32 \
	--iodepth=4 --iodepth_batch_submit=4 --iodepth_batch_complete=4 \
	--name=scaletest --filename=/dev/$DEV

Before:

Device		IOPS	sys	contention	diff
====================================================
null_blk	879K	89%	93.6%
nvme0n1		901K	86%	94.5%

and after this and the previous dispatch patch:

Device		IOPS	sys	contention	diff
====================================================
null_blk	2311K	10.3%	21.1%		+257%
nvme0n1		2610K	11.0%	24.6%		+289%

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/mq-deadline.c | 130 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 121 insertions(+), 9 deletions(-)

diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index b579ce282176..cc3155d50e0d 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -81,8 +81,18 @@ struct dd_per_prio {
 
 enum {
 	DD_DISPATCHING	= 0,
+	DD_INSERTING	= 1,
+	DD_BUCKETS	= 2,
 };
 
+#define DD_CPU_BUCKETS		32
+#define DD_CPU_BUCKETS_MASK	(DD_CPU_BUCKETS - 1)
+
+struct dd_bucket_list {
+	struct list_head list;
+	spinlock_t lock;
+} ____cacheline_aligned_in_smp;
+
 struct deadline_data {
 	/*
 	 * run time data
@@ -94,6 +104,8 @@ struct deadline_data {
 
 	unsigned long run_state;
 
+	struct dd_bucket_list bucket_lists[DD_CPU_BUCKETS];
+
 	struct dd_per_prio per_prio[DD_PRIO_COUNT];
 
 	/* Data direction of latest dispatched request. */
@@ -714,7 +726,7 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
 	struct deadline_data *dd;
 	struct elevator_queue *eq;
 	enum dd_prio prio;
-	int ret = -ENOMEM;
+	int i, ret = -ENOMEM;
 
 	eq = elevator_alloc(q, e);
 	if (!eq)
@@ -729,6 +741,11 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
 	spin_lock_init(&dd->lock);
 	spin_lock_init(&dd->zone_lock);
 
+	for (i = 0; i < DD_CPU_BUCKETS; i++) {
+		INIT_LIST_HEAD(&dd->bucket_lists[i].list);
+		spin_lock_init(&dd->bucket_lists[i].lock);
+	}
+
 	for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
 		struct dd_per_prio *per_prio = &dd->per_prio[prio];
 
@@ -878,6 +895,94 @@ static void dd_insert_request(struct request_queue *q, struct request *rq,
 	}
 }
 
+static void dd_dispatch_from_buckets(struct deadline_data *dd,
+				     struct list_head *list)
+{
+	int i;
+
+	if (!test_bit(DD_BUCKETS, &dd->run_state) ||
+	    !test_and_clear_bit(DD_BUCKETS, &dd->run_state))
+		return;
+
+	for (i = 0; i < DD_CPU_BUCKETS; i++) {
+		struct dd_bucket_list *bucket = &dd->bucket_lists[i];
+
+		if (list_empty_careful(&bucket->list))
+			continue;
+		spin_lock(&bucket->lock);
+		list_splice_init(&bucket->list, list);
+		spin_unlock(&bucket->lock);
+	}
+}
+
+/*
+ * If we can grab the dd->lock, then just return and do the insertion as per
+ * usual. If not, add to one of our internal buckets, and afterwards recheck
+ * if we should retry.
+ */
+static bool dd_insert_to_bucket(struct deadline_data *dd,
+				struct list_head *list)
+	__acquires(&dd->lock)
+{
+	struct dd_bucket_list *bucket;
+
+	/*
+	 * If we can grab the lock, proceed as per usual. If not, and insert
+	 * isn't running, force grab the lock and proceed as per usual.
+	 */
+	if (spin_trylock(&dd->lock))
+		return false;
+	if (!test_bit(DD_INSERTING, &dd->run_state)) {
+		spin_lock(&dd->lock);
+		return false;
+	}
+
+	if (!test_bit(DD_BUCKETS, &dd->run_state))
+		set_bit(DD_BUCKETS, &dd->run_state);
+
+	bucket = &dd->bucket_lists[get_cpu() & DD_CPU_BUCKETS_MASK];
+	spin_lock(&bucket->lock);
+	list_splice_init(list, &bucket->list);
+	spin_unlock(&bucket->lock);
+	put_cpu();
+
+	/*
+	 * Insertion still running, we are done.
+	 */
+	if (test_bit(DD_INSERTING, &dd->run_state))
+		return true;
+
+	/*
+	 * We may be too late, play it safe and grab the lock. This will
+	 * flush the above bucket insert as well and insert it.
+	 */
+	spin_lock(&dd->lock);
+	return false;
+}
+
+static void __dd_insert_requests(struct request_queue *q,
+				 struct deadline_data *dd,
+				 struct list_head *list, blk_insert_t flags,
+				 struct list_head *free)
+{
+	set_bit(DD_INSERTING, &dd->run_state);
+	do {
+		while (!list_empty(list)) {
+			struct request *rq;
+
+			rq = list_first_entry(list, struct request, queuelist);
+			list_del_init(&rq->queuelist);
+			dd_insert_request(q, rq, flags, free);
+		}
+
+		dd_dispatch_from_buckets(dd, list);
+		if (list_empty(list))
+			break;
+	} while (1);
+
+	clear_bit(DD_INSERTING, &dd->run_state);
+}
+
 /*
  * Called from blk_mq_insert_request() or blk_mq_dispatch_plug_list().
  */
@@ -889,16 +994,23 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
 	struct deadline_data *dd = q->elevator->elevator_data;
 	LIST_HEAD(free);
 
-	spin_lock(&dd->lock);
-	while (!list_empty(list)) {
-		struct request *rq;
+	/*
+	 * If insertion is busy and we ended up adding to our internal bucket,
+	 * then we're done for now.
+	 */
+	if (dd_insert_to_bucket(dd, list))
+		return;
 
-		rq = list_first_entry(list, struct request, queuelist);
-		list_del_init(&rq->queuelist);
-		dd_insert_request(q, rq, flags, &free);
-	}
-	spin_unlock(&dd->lock);
+	do {
+		__dd_insert_requests(q, dd, list, flags, &free);
 
+		/*
+		 * If buckets is set after inserting was cleared, be safe and do
+		 * another loop as we could be racing with bucket insertion.
+		 */
+	} while (test_bit(DD_BUCKETS, &dd->run_state));
+
+	spin_unlock(&dd->lock);
 	blk_mq_free_requests(&free);
 }
 
-- 
2.43.0

next prev parent reply	other threads:[~2024-01-19 16:03 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-01-19 16:02 [PATCHSET RFC v2 0/4] mq-deadline scalability improvements Jens Axboe
2024-01-19 16:02 ` [PATCH 1/4] block/mq-deadline: pass in queue directly to dd_insert_request() Jens Axboe
2024-01-19 23:35   ` Bart Van Assche
2024-01-19 16:02 ` [PATCH 2/4] block/mq-deadline: serialize request dispatching Jens Axboe
2024-01-19 23:24   ` Bart Van Assche
2024-01-20  0:00     ` Jens Axboe
2024-01-19 16:02 ` Jens Axboe [this message]
2024-01-19 23:16   ` [PATCH 3/4] block/mq-deadline: fallback to per-cpu insertion buckets under contention Bart Van Assche
2024-01-20  0:05     ` Jens Axboe
2024-01-20  0:13       ` Jens Axboe
2024-01-20  0:31       ` Jens Axboe
2024-01-22 23:55       ` Bart Van Assche
2024-01-19 16:02 ` [PATCH 4/4] block/mq-deadline: skip expensive merge lookups if contended Jens Axboe

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:b579ce28217 dfblob:cc3155d50e0 )
 OR (
bs:"[PATCH 3/4] block/mq-deadline: fallback to per-cpu insertion buckets under contention" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240119160338.1191281-4-axboe@kernel.dk \
    --to=axboe@kernel.dk \
    --cc=bvanassche@acm.org \
    --cc=linux-block@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox