* [PATCH] nvme-tcp: align I/O cpu with blk-mq mapping
From: Hannes Reinecke @ 2024-06-18 12:03 UTC
To: Christoph Hellwig; +Cc: Sagi Grimberg, Keith Busch, linux-nvme, Hannes Reinecke
Add a new module parameter 'wq_affinity' to spread the I/O
over all cpus within the blk-mq hctx mapping for the queue.
This avoids bouncing I/O between cpus when we have fewer
hardware queues than cpus.
Signed-off-by: Hannes Reinecke <hare@kernel.org>
---
drivers/nvme/host/tcp.c | 57 ++++++++++++++++++++++++++++++++---------
1 file changed, 45 insertions(+), 12 deletions(-)
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 3be67c98c906..8c675ee50ccf 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -44,6 +44,13 @@ static bool wq_unbound;
module_param(wq_unbound, bool, 0644);
MODULE_PARM_DESC(wq_unbound, "Use unbound workqueue for nvme-tcp IO context (default false)");
+/*
+ * Balance workqueues across all CPUs in the set for a given queue
+ */
+static bool wq_affinity;
+module_param(wq_affinity, bool, 0644);
+MODULE_PARM_DESC(wq_affinity, "Match workqueues to blk-mq cpumask for nvme-tcp IO context (default false)");
+
/*
* TLS handshake timeout
*/
@@ -1550,20 +1557,40 @@ static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
{
struct nvme_tcp_ctrl *ctrl = queue->ctrl;
- int qid = nvme_tcp_queue_id(queue);
+ struct blk_mq_tag_set *set = &ctrl->tag_set;
+ int qid = nvme_tcp_queue_id(queue) - 1;
+ unsigned int *mq_map;
int n = 0;
- if (nvme_tcp_default_queue(queue))
- n = qid - 1;
- else if (nvme_tcp_read_queue(queue))
- n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1;
- else if (nvme_tcp_poll_queue(queue))
+ if (nvme_tcp_default_queue(queue)) {
+ mq_map = set->map[HCTX_TYPE_DEFAULT].mq_map;
+ n = qid;
+ } else if (nvme_tcp_read_queue(queue)) {
+ mq_map = set->map[HCTX_TYPE_READ].mq_map;
+ n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT];
+ } else if (nvme_tcp_poll_queue(queue)) {
+ mq_map = set->map[HCTX_TYPE_POLL].mq_map;
n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
- ctrl->io_queues[HCTX_TYPE_READ] - 1;
+ ctrl->io_queues[HCTX_TYPE_READ];
+ }
if (wq_unbound)
queue->io_cpu = WORK_CPU_UNBOUND;
- else
+ else if (wq_affinity) {
+ int i;
+
+ if (WARN_ON(!mq_map))
+ return;
+ for_each_cpu(i, cpu_online_mask) {
+ if (mq_map[i] == qid) {
+ queue->io_cpu = i;
+ break;
+ }
+ }
+ } else
queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
+
+ dev_dbg(ctrl->ctrl.device, "queue %d: using cpu %d\n",
+ qid, queue->io_cpu);
}
static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid)
@@ -1704,7 +1731,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
queue->sock->sk->sk_allocation = GFP_ATOMIC;
queue->sock->sk->sk_use_task_frag = false;
- nvme_tcp_set_queue_io_cpu(queue);
+ queue->io_cpu = WORK_CPU_UNBOUND;
queue->request = NULL;
queue->data_remaining = 0;
queue->ddgst_remaining = 0;
@@ -1858,9 +1885,10 @@ static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
nvme_tcp_init_recv_ctx(queue);
nvme_tcp_setup_sock_ops(queue);
- if (idx)
+ if (idx) {
+ nvme_tcp_set_queue_io_cpu(queue);
ret = nvmf_connect_io_queue(nctrl, idx);
- else
+ } else
ret = nvmf_connect_admin_queue(nctrl);
if (!ret) {
@@ -2837,8 +2865,13 @@ static int __init nvme_tcp_init_module(void)
BUILD_BUG_ON(sizeof(struct nvme_tcp_icresp_pdu) != 128);
BUILD_BUG_ON(sizeof(struct nvme_tcp_term_pdu) != 24);
- if (wq_unbound)
+ if (wq_unbound) {
+ if (wq_affinity) {
+ pr_err("cannot specify both 'wq_unbound' and 'wq_affinity'\n");
+ return -EINVAL;
+ }
wq_flags |= WQ_UNBOUND;
+ }
nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq", wq_flags, 0);
if (!nvme_tcp_wq)
--
2.35.3
* Re: [PATCH] nvme-tcp: align I/O cpu with blk-mq mapping
From: Christoph Hellwig @ 2024-06-19 5:30 UTC
To: Hannes Reinecke; +Cc: Christoph Hellwig, Sagi Grimberg, Keith Busch, linux-nvme
On Tue, Jun 18, 2024 at 02:03:45PM +0200, Hannes Reinecke wrote:
> Add a new module parameter 'wq_affinity' to spread the I/O
> over all cpus within the blk-mq hctx mapping for the queue.
> This avoids bouncing I/O between cpus when we have less
> hardware queues than cpus.
What is the benefit when setting it? What is the downside? Why do you
think it needs to be conditional?
> + }
> if (wq_unbound)
> queue->io_cpu = WORK_CPU_UNBOUND;
> + else if (wq_affinity) {
Missing curly braces. But this also means the wq_unbound option
is incompatible with your new one.
> + } else
> queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
Overly long line here.
* Re: [PATCH] nvme-tcp: align I/O cpu with blk-mq mapping
From: Sagi Grimberg @ 2024-06-19 8:57 UTC
To: Christoph Hellwig, Hannes Reinecke; +Cc: Keith Busch, linux-nvme
On 19/06/2024 8:30, Christoph Hellwig wrote:
> On Tue, Jun 18, 2024 at 02:03:45PM +0200, Hannes Reinecke wrote:
>> Add a new module parameter 'wq_affinity' to spread the I/O
>> over all cpus within the blk-mq hctx mapping for the queue.
>> This avoids bouncing I/O between cpus when we have less
>> hardware queues than cpus.
> What is the benefit when setting it? What is the downside? Why do you
> think it needs to be conditional?
Not entirely sure.
Hannes,
Can you please show what is the hctx<->cpu mappings before/after this
patch? Please use different queue counts so we can see the pattern.
Can you please show any performance comparisons for low and high
queue counts?
I'm not entirely clear on what exactly "spread the I/O over all cpus" means.
Every nvme-tcp queue has an io context that does network sends and receives
(unless it runs within .queue_rq() context when possible). Does this somehow
mean that a queue io context will now run from multiple cpus at the same time?
* Re: [PATCH] nvme-tcp: align I/O cpu with blk-mq mapping
From: Hannes Reinecke @ 2024-06-19 10:01 UTC
To: Sagi Grimberg, Christoph Hellwig, Hannes Reinecke; +Cc: Keith Busch, linux-nvme
On 6/19/24 10:57, Sagi Grimberg wrote:
>
>
> On 19/06/2024 8:30, Christoph Hellwig wrote:
>> On Tue, Jun 18, 2024 at 02:03:45PM +0200, Hannes Reinecke wrote:
>>> Add a new module parameter 'wq_affinity' to spread the I/O
>>> over all cpus within the blk-mq hctx mapping for the queue.
>>> This avoids bouncing I/O between cpus when we have less
>>> hardware queues than cpus.
>> What is the benefit when setting it? What is the downside? Why do you
>> think it needs to be conditional?
>
> Not entirely sure.
>
Hehe. But I am :-)
> Hannes,
>
> Can you please show what is the hctx<->cpu mappings before/after this
> patch? Please use different queue counts so we can see the pattern.
>
> Can you please show any performance comparisons for low and high
> queue counts?
>
> I'm not entirely clear what exactly "spreads I/O over all CPUs" mean.
> Every nvme-tcp queue has an io context that does network sends and receives
> (unless it goes within .queue_rq() context when possible). Does this
> somehow mean
> that now a queue io context will run from multiple cpus at the same time?
We have one connection per queue, and do a 'queue_work_on()' for
each queue based on the value of 'io_cpu'.
That cpu should be part of the CPU set of the associated hwctx; otherwise
we need to bounce the thread from one cpu to another.
Case in point on this AMD machine I get this blk-mq mapping:
queue 0: 6, 7, 8, 54, 55, 56
queue 1: 9, 10, 11, 57, 58, 59
queue 10: 42, 43, 44, 90, 91, 92
queue 11: 45, 46, 47, 93, 94, 95
queue 12: 12, 13, 14, 60, 61, 62
queue 13: 15, 16, 17, 63, 64, 65
queue 14: 0, 1, 2, 48, 49, 50
queue 15: 3, 4, 5, 51, 52, 53
queue 2: 18, 19, 20, 66, 67, 68
queue 3: 21, 22, 23, 69, 70, 71
queue 4: 24, 25, 26, 72, 73, 74
queue 5: 27, 28, 29, 75, 76, 77
queue 6: 30, 31, 32, 78, 79, 80
queue 7: 33, 34, 35, 81, 82, 83
queue 8: 36, 37, 38, 84, 85, 86
queue 9: 39, 40, 41, 87, 88, 89
- default behaviour:
nvme nvme1: mapped 16/0/0 default/read/poll queues.
nvme nvme1: queue 1: using cpu 0
nvme nvme1: queue 2: using cpu 1
nvme nvme1: queue 3: using cpu 2
nvme nvme1: queue 4: using cpu 3
nvme nvme1: queue 5: using cpu 4
nvme nvme1: queue 6: using cpu 5
nvme nvme1: queue 7: using cpu 6
nvme nvme1: queue 8: using cpu 7
nvme nvme1: queue 9: using cpu 8
nvme nvme1: queue 10: using cpu 9
nvme nvme1: queue 11: using cpu 10
nvme nvme1: queue 12: using cpu 11
nvme nvme1: queue 13: using cpu 12
nvme nvme1: queue 14: using cpu 13
nvme nvme1: queue 15: using cpu 14
nvme nvme1: queue 16: using cpu 15
So just queue 12 is using the correct CPU; all other queues are
scheduled on cpus which are not part of the hwctx map.
- wq_affinity behaviour:
nvme nvme1: queue 1: using cpu 6
nvme nvme1: queue 2: using cpu 9
nvme nvme1: queue 3: using cpu 18
nvme nvme1: queue 4: using cpu 21
nvme nvme1: queue 5: using cpu 24
nvme nvme1: queue 6: using cpu 27
nvme nvme1: queue 7: using cpu 30
nvme nvme1: queue 8: using cpu 33
nvme nvme1: queue 9: using cpu 36
nvme nvme1: queue 10: using cpu 39
nvme nvme1: queue 11: using cpu 42
nvme nvme1: queue 12: using cpu 45
nvme nvme1: queue 13: using cpu 12
nvme nvme1: queue 14: using cpu 15
nvme nvme1: queue 15: using cpu 0
nvme nvme1: queue 16: using cpu 3
All queues are scheduled on a cpu which is part of the
blk-mq hwctx map.
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), GF: Ivo Totev, Andrew McDonald,
Werner Knoblich
* Re: [PATCH] nvme-tcp: align I/O cpu with blk-mq mapping
From: Hannes Reinecke @ 2024-06-19 10:09 UTC
To: Christoph Hellwig, Hannes Reinecke; +Cc: Sagi Grimberg, Keith Busch, linux-nvme
On 6/19/24 07:30, Christoph Hellwig wrote:
> On Tue, Jun 18, 2024 at 02:03:45PM +0200, Hannes Reinecke wrote:
>> Add a new module parameter 'wq_affinity' to spread the I/O
>> over all cpus within the blk-mq hctx mapping for the queue.
>> This avoids bouncing I/O between cpus when we have less
>> hardware queues than cpus.
>
> What is the benefit when setting it? What is the downside? Why do you
> think it needs to be conditional?
>
We could make it the default option, as the current way of just cycling
through cpu_online_mask works only for trivial cpu configurations
(ie not on AMD).
Downsides are none; at worst we end up with the same behaviour we
have now (ie scheduling I/O on a cpu which is not part of the
blk-mq hwctx map).
>> + }
>> if (wq_unbound)
>> queue->io_cpu = WORK_CPU_UNBOUND;
>> + else if (wq_affinity) {
>
> Missing curly braces. But this also means the wq_unbound options
> is incompatible with your new one.
>
Correct, the options are mutually exclusive.
I looked into merging them, but 'wq_unbound' already exists and I wasn't
sure whether one can replace an existing module option.
>> + } else
>> queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
>
> Overly long line here.
>
Yeah, sorry.
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), GF: Ivo Totev, Andrew McDonald,
Werner Knoblich
* Re: [PATCH] nvme-tcp: align I/O cpu with blk-mq mapping
From: Sagi Grimberg @ 2024-06-19 10:14 UTC
To: Hannes Reinecke, Christoph Hellwig, Hannes Reinecke
Cc: Keith Busch, linux-nvme
On 19/06/2024 13:01, Hannes Reinecke wrote:
> On 6/19/24 10:57, Sagi Grimberg wrote:
>>
>>
>> On 19/06/2024 8:30, Christoph Hellwig wrote:
>>> On Tue, Jun 18, 2024 at 02:03:45PM +0200, Hannes Reinecke wrote:
>>>> Add a new module parameter 'wq_affinity' to spread the I/O
>>>> over all cpus within the blk-mq hctx mapping for the queue.
>>>> This avoids bouncing I/O between cpus when we have less
>>>> hardware queues than cpus.
>>> What is the benefit when setting it? What is the downside? Why do you
>>> think it needs to be conditional?
>>
>> Not entirely sure.
>>
>
> Hehe. But I am :-)
>
>> Hannes,
>>
>> Can you please show what is the hctx<->cpu mappings before/after this
>> patch? Please use different queue counts so we can see the pattern.
>>
>> Can you please show any performance comparisons for low and high
>> queue counts?
>>
>> I'm not entirely clear what exactly "spreads I/O over all CPUs" mean.
>> Every nvme-tcp queue has an io context that does network sends and
>> receives
>> (unless it goes within .queue_rq() context when possible). Does this
>> somehow mean
>> that now a queue io context will run from multiple cpus at the same
>> time?
>
> We are having on connection per queue, doing a 'queue_work_on()' for
> each queue, based on the value of 'io_cpu'.
> That cpu should align to the set of CPUs per associated hwctx, otherwise
> we need to bounce the thread from one cpu to another.
> Case in point on this AMD machine I get this blk-mq mapping:
>
> queue 0: 6, 7, 8, 54, 55, 56
> queue 1: 9, 10, 11, 57, 58, 59
> queue 10: 42, 43, 44, 90, 91, 92
> queue 11: 45, 46, 47, 93, 94, 95
> queue 12: 12, 13, 14, 60, 61, 62
> queue 13: 15, 16, 17, 63, 64, 65
> queue 14: 0, 1, 2, 48, 49, 50
> queue 15: 3, 4, 5, 51, 52, 53
> queue 2: 18, 19, 20, 66, 67, 68
> queue 3: 21, 22, 23, 69, 70, 71
> queue 4: 24, 25, 26, 72, 73, 74
> queue 5: 27, 28, 29, 75, 76, 77
> queue 6: 30, 31, 32, 78, 79, 80
> queue 7: 33, 34, 35, 81, 82, 83
> queue 8: 36, 37, 38, 84, 85, 86
> queue 9: 39, 40, 41, 87, 88, 89
>
> - default behaviour:
>
> nvme nvme1: mapped 16/0/0 default/read/poll queues.
> nvme nvme1: queue 1: using cpu 0
> nvme nvme1: queue 2: using cpu 1
> nvme nvme1: queue 3: using cpu 2
> nvme nvme1: queue 4: using cpu 3
> nvme nvme1: queue 5: using cpu 4
> nvme nvme1: queue 6: using cpu 5
> nvme nvme1: queue 7: using cpu 6
> nvme nvme1: queue 8: using cpu 7
> nvme nvme1: queue 9: using cpu 8
> nvme nvme1: queue 10: using cpu 9
> nvme nvme1: queue 11: using cpu 10
> nvme nvme1: queue 12: using cpu 11
> nvme nvme1: queue 13: using cpu 12
> nvme nvme1: queue 14: using cpu 13
> nvme nvme1: queue 15: using cpu 14
> nvme nvme1: queue 16: using cpu 15
>
> So just queue 12 is using the correct CPU; all other queues are
> scheduled on cpus which are not part of the hwctx map.
>
> - wq_affinity behaviour:
>
> nvme nvme1: queue 1: using cpu 6
> nvme nvme1: queue 2: using cpu 9
> nvme nvme1: queue 3: using cpu 18
> nvme nvme1: queue 4: using cpu 21
> nvme nvme1: queue 5: using cpu 24
> nvme nvme1: queue 6: using cpu 27
> nvme nvme1: queue 7: using cpu 30
> nvme nvme1: queue 8: using cpu 33
> nvme nvme1: queue 9: using cpu 36
> nvme nvme1: queue 10: using cpu 39
> nvme nvme1: queue 11: using cpu 42
> nvme nvme1: queue 12: using cpu 45
> nvme nvme1: queue 13: using cpu 12
> nvme nvme1: queue 14: using cpu 15
> nvme nvme1: queue 15: using cpu 0
> nvme nvme1: queue 16: using cpu 3
>
> All queues are scheduled on a cpu which is part of the
> blk-mq hwctx map.
OK, I think I understand what this patch does now, thanks (this should
appear in the patch changelog).
The changelog really needs to improve here, Hannes. The wording "spread
the I/O over all cpus within the blk-mq hctx mapping for the queue" is
very misleading. It is actually addressing an incorrect io_cpu selection
for certain cpu topologies, which leads to running different queue
threads on sibling cpus and may result in suboptimal performance. Please
add performance comparisons of this change. And let's remove the module
parameter; iiuc this change is universally better.