Linux-NVME Archive on lore.kernel.org
* [PATCH 0/4] nvme-tcp: improve scalability
@ 2024-07-03 13:50 Hannes Reinecke
  2024-07-03 13:50 ` [PATCH 1/4] nvme-tcp: per-controller I/O workqueues Hannes Reinecke
                   ` (3 more replies)
  0 siblings, 4 replies; 37+ messages in thread
From: Hannes Reinecke @ 2024-07-03 13:50 UTC (permalink / raw)
  To: Sagi Grimberg; +Cc: Christoph Hellwig, Keith Busch, linux-nvme, Hannes Reinecke

Hi all,

for workloads with a lot of controllers we run into workqueue contention,
where the single workqueue is not able to service requests fast enough,
leading to spurious I/O errors and connection resets under high load.
This patchset improves the situation by introducing per-controller workqueues
and by modifying the 'wq_unbound' module parameter handling to take the
blk-mq CPU mapping into account for better locality.
With this we reduce the spurious I/O errors and improve the overall
performance for highly contended workloads.

Performance comparisons are hard to get as the original code would
abort with I/O timeouts such that fio has a hard time completing...

As usual, comments and reviews are welcome.

Hannes Reinecke (4):
  nvme-tcp: per-controller I/O workqueues
  nvme-tcp: align I/O cpu with blk-mq mapping
  workqueue: introduce helper workqueue_unbound_affinity_scope()
  nvme-tcp: switch to 'cpu' affinity scope for unbound workqueues

 drivers/nvme/host/tcp.c   | 84 +++++++++++++++++++++++++--------------
 include/linux/workqueue.h |  1 +
 kernel/workqueue.c        | 47 +++++++++++++++++-----
 3 files changed, 93 insertions(+), 39 deletions(-)

-- 
2.35.3




* [PATCH 1/4] nvme-tcp: per-controller I/O workqueues
  2024-07-03 13:50 [PATCH 0/4] nvme-tcp: improve scalability Hannes Reinecke
@ 2024-07-03 13:50 ` Hannes Reinecke
  2024-07-03 14:11   ` Sagi Grimberg
  2024-07-04  5:36   ` Christoph Hellwig
  2024-07-03 13:50 ` [PATCH 2/4] nvme-tcp: align I/O cpu with blk-mq mapping Hannes Reinecke
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 37+ messages in thread
From: Hannes Reinecke @ 2024-07-03 13:50 UTC (permalink / raw)
  To: Sagi Grimberg; +Cc: Christoph Hellwig, Keith Busch, linux-nvme, Hannes Reinecke

Implement per-controller I/O workqueues to reduce workqueue contention
during I/O.

Signed-off-by: Hannes Reinecke <hare@kernel.org>
---
 drivers/nvme/host/tcp.c | 35 ++++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 17 deletions(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 5885aa452aa1..d43099c562fc 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -191,6 +191,7 @@ struct nvme_tcp_ctrl {
 	struct sockaddr_storage src_addr;
 	struct nvme_ctrl	ctrl;
 
+	struct workqueue_struct	*io_wq;
 	struct work_struct	err_work;
 	struct delayed_work	connect_work;
 	struct nvme_tcp_request async_req;
@@ -199,7 +200,6 @@ struct nvme_tcp_ctrl {
 
 static LIST_HEAD(nvme_tcp_ctrl_list);
 static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
-static struct workqueue_struct *nvme_tcp_wq;
 static const struct blk_mq_ops nvme_tcp_mq_ops;
 static const struct blk_mq_ops nvme_tcp_admin_mq_ops;
 static int nvme_tcp_try_send(struct nvme_tcp_queue *queue);
@@ -402,7 +402,7 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
 	}
 
 	if (last && nvme_tcp_queue_has_pending(queue))
-		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+		queue_work_on(queue->io_cpu, queue->ctrl->io_wq, &queue->io_work);
 }
 
 static void nvme_tcp_process_req_list(struct nvme_tcp_queue *queue)
@@ -974,7 +974,7 @@ static void nvme_tcp_data_ready(struct sock *sk)
 	queue = sk->sk_user_data;
 	if (likely(queue && queue->rd_enabled) &&
 	    !test_bit(NVME_TCP_Q_POLLING, &queue->flags))
-		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+		queue_work_on(queue->io_cpu, queue->ctrl->io_wq, &queue->io_work);
 	read_unlock_bh(&sk->sk_callback_lock);
 }
 
@@ -986,7 +986,7 @@ static void nvme_tcp_write_space(struct sock *sk)
 	queue = sk->sk_user_data;
 	if (likely(queue && sk_stream_is_writeable(sk))) {
 		clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+		queue_work_on(queue->io_cpu, queue->ctrl->io_wq, &queue->io_work);
 	}
 	read_unlock_bh(&sk->sk_callback_lock);
 }
@@ -1304,7 +1304,7 @@ static void nvme_tcp_io_work(struct work_struct *w)
 
 	} while (!time_after(jiffies, deadline)); /* quota is exhausted */
 
-	queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+	queue_work_on(queue->io_cpu, queue->ctrl->io_wq, &queue->io_work);
 }
 
 static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue)
@@ -2390,6 +2390,8 @@ static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
 
 	nvmf_free_options(nctrl->opts);
 free_ctrl:
+	destroy_workqueue(ctrl->io_wq);
+
 	kfree(ctrl->queues);
 	kfree(ctrl);
 }
@@ -2580,7 +2582,7 @@ static void nvme_tcp_commit_rqs(struct blk_mq_hw_ctx *hctx)
 	struct nvme_tcp_queue *queue = hctx->driver_data;
 
 	if (!llist_empty(&queue->req_list))
-		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+		queue_work_on(queue->io_cpu, queue->ctrl->io_wq, &queue->io_work);
 }
 
 static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
@@ -2712,6 +2714,7 @@ static struct nvme_tcp_ctrl *nvme_tcp_alloc_ctrl(struct device *dev,
 		struct nvmf_ctrl_options *opts)
 {
 	struct nvme_tcp_ctrl *ctrl;
+	unsigned int wq_flags = WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_SYSFS;
 	int ret;
 
 	ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
@@ -2783,6 +2786,15 @@ static struct nvme_tcp_ctrl *nvme_tcp_alloc_ctrl(struct device *dev,
 	if (ret)
 		goto out_kfree_queues;
 
+	if (wq_unbound)
+		wq_flags |= WQ_UNBOUND;
+	ctrl->io_wq = alloc_workqueue("nvme_tcp_wq_%d", wq_flags, 0,
+				      ctrl->ctrl.instance);
+	if (!ctrl->io_wq) {
+		nvme_put_ctrl(&ctrl->ctrl);
+		return ERR_PTR(-ENOMEM);
+	}
+
 	return ctrl;
 out_kfree_queues:
 	kfree(ctrl->queues);
@@ -2848,8 +2860,6 @@ static struct nvmf_transport_ops nvme_tcp_transport = {
 
 static int __init nvme_tcp_init_module(void)
 {
-	unsigned int wq_flags = WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_SYSFS;
-
 	BUILD_BUG_ON(sizeof(struct nvme_tcp_hdr) != 8);
 	BUILD_BUG_ON(sizeof(struct nvme_tcp_cmd_pdu) != 72);
 	BUILD_BUG_ON(sizeof(struct nvme_tcp_data_pdu) != 24);
@@ -2859,13 +2869,6 @@ static int __init nvme_tcp_init_module(void)
 	BUILD_BUG_ON(sizeof(struct nvme_tcp_icresp_pdu) != 128);
 	BUILD_BUG_ON(sizeof(struct nvme_tcp_term_pdu) != 24);
 
-	if (wq_unbound)
-		wq_flags |= WQ_UNBOUND;
-
-	nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq", wq_flags, 0);
-	if (!nvme_tcp_wq)
-		return -ENOMEM;
-
 	nvmf_register_transport(&nvme_tcp_transport);
 	return 0;
 }
@@ -2881,8 +2884,6 @@ static void __exit nvme_tcp_cleanup_module(void)
 		nvme_delete_ctrl(&ctrl->ctrl);
 	mutex_unlock(&nvme_tcp_ctrl_mutex);
 	flush_workqueue(nvme_delete_wq);
-
-	destroy_workqueue(nvme_tcp_wq);
 }
 
 module_init(nvme_tcp_init_module);
-- 
2.35.3




* [PATCH 2/4] nvme-tcp: align I/O cpu with blk-mq mapping
  2024-07-03 13:50 [PATCH 0/4] nvme-tcp: improve scalability Hannes Reinecke
  2024-07-03 13:50 ` [PATCH 1/4] nvme-tcp: per-controller I/O workqueues Hannes Reinecke
@ 2024-07-03 13:50 ` Hannes Reinecke
  2024-07-03 14:19   ` Sagi Grimberg
  2024-07-03 13:50 ` [PATCH 3/4] workqueue: introduce helper workqueue_unbound_affinity_scope() Hannes Reinecke
  2024-07-03 13:50 ` [PATCH 4/4] nvme-tcp: switch to 'cpu' affinity scope for unbound workqueues Hannes Reinecke
  3 siblings, 1 reply; 37+ messages in thread
From: Hannes Reinecke @ 2024-07-03 13:50 UTC (permalink / raw)
  To: Sagi Grimberg; +Cc: Christoph Hellwig, Keith Busch, linux-nvme, Hannes Reinecke

When 'wq_unbound' is selected we should select the
first CPU from a given blk-mq hctx mapping to queue
the tcp workqueue item. With this we can instruct the
workqueue code to keep the I/O affinity and avoid
a performance penalty.

One should switch to the 'cpu' workqueue affinity scope to
take full advantage of this by issuing:

echo cpu > /sys/devices/virtual/workqueue/nvme_tcp_wq_*/affinity_scope

Signed-off-by: Hannes Reinecke <hare@kernel.org>
---
 drivers/nvme/host/tcp.c | 44 +++++++++++++++++++++++++++++------------
 1 file changed, 31 insertions(+), 13 deletions(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index d43099c562fc..df184004a514 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1559,19 +1559,36 @@ static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
 static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
 {
 	struct nvme_tcp_ctrl *ctrl = queue->ctrl;
-	int qid = nvme_tcp_queue_id(queue);
+	struct blk_mq_tag_set *set = &ctrl->tag_set;
+	int qid = nvme_tcp_queue_id(queue) - 1;
+	unsigned int *mq_map = NULL;
 	int n = 0;
 
-	if (nvme_tcp_default_queue(queue))
-		n = qid - 1;
-	else if (nvme_tcp_read_queue(queue))
-		n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1;
-	else if (nvme_tcp_poll_queue(queue))
+	if (nvme_tcp_default_queue(queue)) {
+		mq_map = set->map[HCTX_TYPE_DEFAULT].mq_map;
+		n = qid;
+	} else if (nvme_tcp_read_queue(queue)) {
+		mq_map = set->map[HCTX_TYPE_READ].mq_map;
+		n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT];
+	} else if (nvme_tcp_poll_queue(queue)) {
+		mq_map = set->map[HCTX_TYPE_POLL].mq_map;
 		n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
-				ctrl->io_queues[HCTX_TYPE_READ] - 1;
-	if (wq_unbound)
-		queue->io_cpu = WORK_CPU_UNBOUND;
-	else
+				ctrl->io_queues[HCTX_TYPE_READ];
+	}
+	if (wq_unbound) {
+		int cpu;
+
+		if (WARN_ON(!mq_map))
+			return;
+		for_each_online_cpu(cpu) {
+			if (mq_map[cpu] == qid) {
+				queue->io_cpu = cpu;
+				break;
+			}
+		}
+		dev_dbg(ctrl->ctrl.device, "queue %d: using cpu %d\n",
+			qid, queue->io_cpu);
+	} else
 		queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
 }
 
@@ -1716,7 +1733,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
 
 	queue->sock->sk->sk_allocation = GFP_ATOMIC;
 	queue->sock->sk->sk_use_task_frag = false;
-	nvme_tcp_set_queue_io_cpu(queue);
+	queue->io_cpu = WORK_CPU_UNBOUND;
 	queue->request = NULL;
 	queue->data_remaining = 0;
 	queue->ddgst_remaining = 0;
@@ -1872,9 +1889,10 @@ static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
 	nvme_tcp_init_recv_ctx(queue);
 	nvme_tcp_setup_sock_ops(queue);
 
-	if (idx)
+	if (idx) {
+		nvme_tcp_set_queue_io_cpu(queue);
 		ret = nvmf_connect_io_queue(nctrl, idx);
-	else
+	} else
 		ret = nvmf_connect_admin_queue(nctrl);
 
 	if (!ret) {
-- 
2.35.3




* [PATCH 3/4] workqueue: introduce helper workqueue_unbound_affinity_scope()
  2024-07-03 13:50 [PATCH 0/4] nvme-tcp: improve scalability Hannes Reinecke
  2024-07-03 13:50 ` [PATCH 1/4] nvme-tcp: per-controller I/O workqueues Hannes Reinecke
  2024-07-03 13:50 ` [PATCH 2/4] nvme-tcp: align I/O cpu with blk-mq mapping Hannes Reinecke
@ 2024-07-03 13:50 ` Hannes Reinecke
  2024-07-03 17:31   ` Tejun Heo
  2024-07-03 13:50 ` [PATCH 4/4] nvme-tcp: switch to 'cpu' affinity scope for unbound workqueues Hannes Reinecke
  3 siblings, 1 reply; 37+ messages in thread
From: Hannes Reinecke @ 2024-07-03 13:50 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Christoph Hellwig, Keith Busch, linux-nvme, Hannes Reinecke,
	Tejun Heo

For drivers creating their own workqueue it might be useful to
switch to the 'cpu' unbound affinity scope to keep the locality
and reduce contention. As it's cumbersome to instruct the user
how to switch the affinity scope from userland, introduce a helper
workqueue_unbound_affinity_scope() to let the driver set the
affinity scope directly.

Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Hannes Reinecke <hare@kernel.org>
---
 include/linux/workqueue.h |  1 +
 kernel/workqueue.c        | 47 +++++++++++++++++++++++++++++++--------
 2 files changed, 39 insertions(+), 9 deletions(-)

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index fb3993894536..6c5f3dc614d9 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -541,6 +541,7 @@ void free_workqueue_attrs(struct workqueue_attrs *attrs);
 int apply_workqueue_attrs(struct workqueue_struct *wq,
 			  const struct workqueue_attrs *attrs);
 extern int workqueue_unbound_exclude_cpumask(cpumask_var_t cpumask);
+extern int workqueue_unbound_affinity_scope(struct workqueue_struct *wq, int scope);
 
 extern bool queue_work_on(int cpu, struct workqueue_struct *wq,
 			struct work_struct *work);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 003474c9a77d..53b1ca11fc86 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -7143,26 +7143,55 @@ static ssize_t wq_affn_scope_show(struct device *dev,
 	return written;
 }
 
-static ssize_t wq_affn_scope_store(struct device *dev,
-				   struct device_attribute *attr,
-				   const char *buf, size_t count)
+int workqueue_unbound_affinity_scope(struct workqueue_struct *wq, int scope)
 {
-	struct workqueue_struct *wq = dev_to_wq(dev);
 	struct workqueue_attrs *attrs;
-	int affn, ret = -ENOMEM;
+	int ret = -ENOMEM;
 
-	affn = parse_affn_scope(buf);
-	if (affn < 0)
-		return affn;
+	if (!(wq->flags & WQ_UNBOUND))
+		return -EINVAL;
+
+	switch(scope) {
+	case WQ_AFFN_DFL:
+		/* fallthrough */
+	case WQ_AFFN_CPU:
+		/* fallthrough */
+	case WQ_AFFN_SMT:
+		/* fallthrough */
+	case WQ_AFFN_CACHE:
+		/* fallthrough */
+	case WQ_AFFN_NUMA:
+		/* fallthrough */
+	case WQ_AFFN_SYSTEM:
+		break;
+	default:
+		return -EINVAL;
+	}
 
 	apply_wqattrs_lock();
 	attrs = wq_sysfs_prep_attrs(wq);
 	if (attrs) {
-		attrs->affn_scope = affn;
+		attrs->affn_scope = scope;
 		ret = apply_workqueue_attrs_locked(wq, attrs);
 	}
 	apply_wqattrs_unlock();
 	free_workqueue_attrs(attrs);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(workqueue_unbound_affinity_scope);
+
+static ssize_t wq_affn_scope_store(struct device *dev,
+				   struct device_attribute *attr,
+				   const char *buf, size_t count)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	int affn, ret = -ENOMEM;
+
+	affn = parse_affn_scope(buf);
+	if (affn < 0)
+		return affn;
+
+	ret = workqueue_unbound_affinity_scope(wq, affn);
 	return ret ?: count;
 }
 
-- 
2.35.3




* [PATCH 4/4] nvme-tcp: switch to 'cpu' affinity scope for unbound workqueues
  2024-07-03 13:50 [PATCH 0/4] nvme-tcp: improve scalability Hannes Reinecke
                   ` (2 preceding siblings ...)
  2024-07-03 13:50 ` [PATCH 3/4] workqueue: introduce helper workqueue_unbound_affinity_scope() Hannes Reinecke
@ 2024-07-03 13:50 ` Hannes Reinecke
  2024-07-03 14:22   ` Sagi Grimberg
  3 siblings, 1 reply; 37+ messages in thread
From: Hannes Reinecke @ 2024-07-03 13:50 UTC (permalink / raw)
  To: Sagi Grimberg; +Cc: Christoph Hellwig, Keith Busch, linux-nvme, Hannes Reinecke

We should switch to the 'cpu' affinity scope when using the 'wq_unbound'
parameter as this allows us to keep I/O locality and improve performance.

Signed-off-by: Hannes Reinecke <hare@kernel.org>
---
 drivers/nvme/host/tcp.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index df184004a514..889a5a28e3b7 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -2812,6 +2812,11 @@ static struct nvme_tcp_ctrl *nvme_tcp_alloc_ctrl(struct device *dev,
 		nvme_put_ctrl(&ctrl->ctrl);
 		return ERR_PTR(-ENOMEM);
 	}
+	if (wq_unbound) {
+		ret = workqueue_unbound_affinity_scope(ctrl->io_wq, WQ_AFFN_CPU);
+		if (ret)
+			pr_warn("Failed to set 'cpu' workqueue affinity\n");
+	}
 
 	return ctrl;
 out_kfree_queues:
-- 
2.35.3




* Re: [PATCH 1/4] nvme-tcp: per-controller I/O workqueues
  2024-07-03 13:50 ` [PATCH 1/4] nvme-tcp: per-controller I/O workqueues Hannes Reinecke
@ 2024-07-03 14:11   ` Sagi Grimberg
  2024-07-03 14:46     ` Hannes Reinecke
  2024-07-04  5:36   ` Christoph Hellwig
  1 sibling, 1 reply; 37+ messages in thread
From: Sagi Grimberg @ 2024-07-03 14:11 UTC (permalink / raw)
  To: Hannes Reinecke; +Cc: Christoph Hellwig, Keith Busch, linux-nvme



On 03/07/2024 16:50, Hannes Reinecke wrote:
> Implement per-controller I/O workqueues to reduce workqueue contention
> during I/O.

OK, I wonder what the cost is here. Is it in ALL conditions better than a
single workqueue?



* Re: [PATCH 2/4] nvme-tcp: align I/O cpu with blk-mq mapping
  2024-07-03 13:50 ` [PATCH 2/4] nvme-tcp: align I/O cpu with blk-mq mapping Hannes Reinecke
@ 2024-07-03 14:19   ` Sagi Grimberg
  2024-07-03 14:53     ` Hannes Reinecke
  2024-07-04  5:37     ` Christoph Hellwig
  0 siblings, 2 replies; 37+ messages in thread
From: Sagi Grimberg @ 2024-07-03 14:19 UTC (permalink / raw)
  To: Hannes Reinecke; +Cc: Christoph Hellwig, Keith Busch, linux-nvme



On 03/07/2024 16:50, Hannes Reinecke wrote:
> When 'wq_unbound' is selected we should select the
> the first CPU from a given blk-mq hctx mapping to queue
> the tcp workqueue item. With this we can instruct the
> workqueue code to keep the I/O affinity and avoid
> a performance penalty.

wq_unbound is designed to keep io_cpu UNBOUND; my recollection
was that the person introducing it was trying to make the io_cpu always be
on a specific NUMA node, or a subset of cpus within a numa node. So he uses
that and tinkers with the wq cpumask via sysfs.

I don't see why you are tying this to wq_unbound in the first place.

>
> One should switch to 'cpu' workqueue affinity to
> get full advantage of this by issuing:
>
> echo cpu > /sys/devices/virtual/workqueue/nvme_tcp_wq_*/affinity_scope

Quantify improvement please.



* Re: [PATCH 4/4] nvme-tcp: switch to 'cpu' affinity scope for unbound workqueues
  2024-07-03 13:50 ` [PATCH 4/4] nvme-tcp: switch to 'cpu' affinity scope for unbound workqueues Hannes Reinecke
@ 2024-07-03 14:22   ` Sagi Grimberg
  2024-07-03 15:01     ` Hannes Reinecke
  0 siblings, 1 reply; 37+ messages in thread
From: Sagi Grimberg @ 2024-07-03 14:22 UTC (permalink / raw)
  To: Hannes Reinecke; +Cc: Christoph Hellwig, Keith Busch, linux-nvme



On 03/07/2024 16:50, Hannes Reinecke wrote:
> We should switch to the 'cpu' affinity scope when using the 'wq_unbound'
> parameter as this allows us to keep I/O locality and improve performance.

Can you please describe in more detail why this is better? Locality between what?

While you mention in your cover letter that "comments and reviews are welcome",
the change logs in your patches are not designed to assist your reviewer.



* Re: [PATCH 1/4] nvme-tcp: per-controller I/O workqueues
  2024-07-03 14:11   ` Sagi Grimberg
@ 2024-07-03 14:46     ` Hannes Reinecke
  2024-07-03 15:16       ` Sagi Grimberg
  0 siblings, 1 reply; 37+ messages in thread
From: Hannes Reinecke @ 2024-07-03 14:46 UTC (permalink / raw)
  To: Sagi Grimberg, Hannes Reinecke; +Cc: Christoph Hellwig, Keith Busch, linux-nvme

On 7/3/24 16:11, Sagi Grimberg wrote:
> 
> 
> On 03/07/2024 16:50, Hannes Reinecke wrote:
>> Implement per-controller I/O workqueues to reduce workqueue contention
>> during I/O.
> 
> OK, wonder what is the cost here. Is it in ALL conditions better than a 
> single workqueue?

Well, clearly not on memory-limited systems; a workqueue per controller
takes up more memory than a single one. And it's questionable whether
such a system isn't underprovisioned for nvme anyway.
We will see more scheduler interaction as the scheduler needs to
switch between workqueues, but that was kinda the idea. And I doubt one
can measure it; the overhead of switching between workqueues should be
pretty much identical to the overhead of switching between workqueue items.

I could do some measurements, but really I don't think it'll yield any
surprising results.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich




* Re: [PATCH 2/4] nvme-tcp: align I/O cpu with blk-mq mapping
  2024-07-03 14:19   ` Sagi Grimberg
@ 2024-07-03 14:53     ` Hannes Reinecke
  2024-07-03 15:03       ` Sagi Grimberg
  2024-07-04  5:37     ` Christoph Hellwig
  1 sibling, 1 reply; 37+ messages in thread
From: Hannes Reinecke @ 2024-07-03 14:53 UTC (permalink / raw)
  To: Sagi Grimberg, Hannes Reinecke; +Cc: Christoph Hellwig, Keith Busch, linux-nvme

On 7/3/24 16:19, Sagi Grimberg wrote:
> 
> 
> On 03/07/2024 16:50, Hannes Reinecke wrote:
>> When 'wq_unbound' is selected we should select the
>> the first CPU from a given blk-mq hctx mapping to queue
>> the tcp workqueue item. With this we can instruct the
>> workqueue code to keep the I/O affinity and avoid
>> a performance penalty.
> 
> wq_unbound is designed to keep io_cpu to be UNBOUND, my recollection
> was the the person introducing it was trying to make the io_cpu always be
> on a specific NUMA node, or a subset of cpus within a numa node. So he uses
> that and tinkers with wq cpumask via sysfs.
> 
> I don't see why you are tying this to wq_unbound in the first place.
> 
Because in the default case the workqueue is nailed to a cpu and will
not move from it. I.e. if you call 'queue_work_on()' it _will_ run on
that cpu.
But if something else is running on that CPU (printk logging, say), you
will have to stand in the queue until the scheduler gives you some time.

If the workqueue is unbound the workqueue code is able to switch away
from the cpu if it finds it busy or otherwise unsuitable, leading to
better utilization and avoiding a workqueue stall.
And in the 'unbound' case the 'cpu' argument merely serves as a hint
as to where to place the workqueue item.
At least, that's how I understood the code.
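
Roughly what I mean (illustration only; 'bound_wq' and 'unbound_wq' are
just placeholder names, not actual variables from the patch):

	/* bound (per-cpu) workqueue: io_work is pinned to queue->io_cpu */
	queue_work_on(queue->io_cpu, bound_wq, &queue->io_work);

	/*
	 * WQ_UNBOUND workqueue: queue->io_cpu is only a placement hint;
	 * the unbound worker pool may run io_work on another cpu within
	 * the affinity scope if that cpu is busy.
	 */
	queue_work_on(queue->io_cpu, unbound_wq, &queue->io_work);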

And it makes the 'CPU hogged' messages go away, which is a bonus in 
itself...

>>
>> One should switch to 'cpu' workqueue affinity to
>> get full advantage of this by issuing:
>>
>> echo cpu > /sys/devices/virtual/workqueue/nvme_tcp_wq_*/affinity_scope
> 
> Quantify improvement please.

I'll try to get some numbers.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich




* Re: [PATCH 4/4] nvme-tcp: switch to 'cpu' affinity scope for unbound workqueues
  2024-07-03 14:22   ` Sagi Grimberg
@ 2024-07-03 15:01     ` Hannes Reinecke
  2024-07-03 15:09       ` Sagi Grimberg
  0 siblings, 1 reply; 37+ messages in thread
From: Hannes Reinecke @ 2024-07-03 15:01 UTC (permalink / raw)
  To: Sagi Grimberg, Hannes Reinecke; +Cc: Christoph Hellwig, Keith Busch, linux-nvme

On 7/3/24 16:22, Sagi Grimberg wrote:
> 
> 
> On 03/07/2024 16:50, Hannes Reinecke wrote:
>> We should switch to the 'cpu' affinity scope when using the 'wq_unbound'
>> parameter as this allows us to keep I/O locality and improve performance.
> 
> Can you please describe more why this is better? locality between what?
> 
Well; the default unbound scope is 'cache', which groups the cpu 
according to the cache hierarchy. I want the cpu locality of the 
workqueue items to be preserved as much as possible, so I switched
to 'cpu' here.

I'll get some performance numbers.

> While you mention in your cover letter "comments and reviews are welcome"
> The change logs in your patches are not designed to assist your reviewer.

I spent the last few weeks trying to come up with a solution based on my
original submission, but in the end I gave up as I hadn't been able to
fix the original issue.
This here is a different approach by massaging the 'wq_unbound' 
mechanism, which is not only easier but also has the big advantage that
it actually works :-)
So I did not include a changelog against the previous patchset as this is a
pretty different approach.
Sorry if this is confusing.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich




* Re: [PATCH 2/4] nvme-tcp: align I/O cpu with blk-mq mapping
  2024-07-03 14:53     ` Hannes Reinecke
@ 2024-07-03 15:03       ` Sagi Grimberg
  2024-07-03 15:40         ` Hannes Reinecke
  0 siblings, 1 reply; 37+ messages in thread
From: Sagi Grimberg @ 2024-07-03 15:03 UTC (permalink / raw)
  To: Hannes Reinecke, Hannes Reinecke
  Cc: Christoph Hellwig, Keith Busch, linux-nvme



On 03/07/2024 17:53, Hannes Reinecke wrote:
> On 7/3/24 16:19, Sagi Grimberg wrote:
>>
>>
>> On 03/07/2024 16:50, Hannes Reinecke wrote:
>>> When 'wq_unbound' is selected we should select the
>>> the first CPU from a given blk-mq hctx mapping to queue
>>> the tcp workqueue item. With this we can instruct the
>>> workqueue code to keep the I/O affinity and avoid
>>> a performance penalty.
>>
>> wq_unbound is designed to keep io_cpu to be UNBOUND, my recollection
>> was the the person introducing it was trying to make the io_cpu 
>> always be
>> on a specific NUMA node, or a subset of cpus within a numa node. So 
>> he uses
>> that and tinkers with wq cpumask via sysfs.
>>
>> I don't see why you are tying this to wq_unbound in the first place.
>>
> Because in the default case the workqueue is nailed to a cpu, and will 
> not move from it. IE if you call 'queue_work_on()' it _will_ run on 
> that cpu.
> But if something else is running on that CPU (printk logging, say), 
> you will have to stand in the queue until the scheduler gives you some 
> time.
>
> If the workqueue is unbound the workqueue code is able to switch away 
> from the cpu if it finds it busy or otherwise unsuitable, leading to a 
> better utilization and avoiding a workqueue stall.
> And in the 'unbound' case the 'cpu' argument merely serves as a hint
> where to place the workqueue item.
> At least, that's how I understood the code.

We should make the io_cpu come from the blk-mq hctx mapping by default, and
for every controller it should use a different cpu from the hctx
mapping. That should be the default behavior. In the wq_unbound case, we skip
all of that and make io_cpu = WORK_CPU_UNBOUND, as it was before.

I'm not sure I follow your logic.

>
> And it makes the 'CPU hogged' messages go away, which is a bonus in 
> itself...

Which messages? aren't these messages saying that the work spent too 
much time? why are you describing
the case where the work does not get cpu quota to run?



* Re: [PATCH 4/4] nvme-tcp: switch to 'cpu' affinity scope for unbound workqueues
  2024-07-03 15:01     ` Hannes Reinecke
@ 2024-07-03 15:09       ` Sagi Grimberg
  2024-07-03 15:50         ` Hannes Reinecke
  0 siblings, 1 reply; 37+ messages in thread
From: Sagi Grimberg @ 2024-07-03 15:09 UTC (permalink / raw)
  To: Hannes Reinecke, Hannes Reinecke
  Cc: Christoph Hellwig, Keith Busch, linux-nvme



On 03/07/2024 18:01, Hannes Reinecke wrote:
> On 7/3/24 16:22, Sagi Grimberg wrote:
>>
>>
>> On 03/07/2024 16:50, Hannes Reinecke wrote:
>>> We should switch to the 'cpu' affinity scope when using the 
>>> 'wq_unbound'
>>> parameter as this allows us to keep I/O locality and improve 
>>> performance.
>>
>> Can you please describe more why this is better? locality between what?
>>
> Well; the default unbound scope is 'cache', which groups the cpu 
> according to the cache hierarchy. I want the cpu locality of the 
> workqueue items to be preserved as much as possible, so I switched
> to 'cpu' here.
>
> I'll get some performance numbers.
>
>> While you mention in your cover letter "comments and reviews are 
>> welcome"
>> The change logs in your patches are not designed to assist your 
>> reviewer.
>
> I spent the last few weeks trying to come up with a solution based on my
> original submission, but in the end I gave up as I hadn't been able to
> fix the original issue.

Well, the last submission was a discombobulated set of mostly unrelated 
patches...
What was it that did not work?

> This here is a different approach by massaging the 'wq_unbound' 
> mechanism, which is not only easier but also has the big advantage that
> it actually works :-)
> So I did not include a changlog to the previous patchset as this is a 
> pretty different approach.
> Sorry if this is confusing.

It's just difficult to try and understand what each patch contributes, 
and most of the time the patches
are under-documented. I want to see the improvements added, but I also 
want them to be properly reviewed.



* Re: [PATCH 1/4] nvme-tcp: per-controller I/O workqueues
  2024-07-03 14:46     ` Hannes Reinecke
@ 2024-07-03 15:16       ` Sagi Grimberg
  2024-07-03 17:07         ` Tejun Heo
  0 siblings, 1 reply; 37+ messages in thread
From: Sagi Grimberg @ 2024-07-03 15:16 UTC (permalink / raw)
  To: Hannes Reinecke, Hannes Reinecke
  Cc: Christoph Hellwig, Keith Busch, linux-nvme, Tejun Heo


>>
>>
>> On 03/07/2024 16:50, Hannes Reinecke wrote:
>>> Implement per-controller I/O workqueues to reduce workqueue contention
>>> during I/O.
>>
>> OK, wonder what is the cost here. Is it in ALL conditions better than 
>> a single workqueue?
>
> Well, clearly not on memory-limited systems; a workqueue per 
> controller takes up more memory that a single one. And it's 
> questionable whether
> such a system isn't underprovisioned for nvme anyway.
> We will see a higher scheduler interaction as the scheduler needs to 
> switch between workqueues, but that was kinda the idea. And I doubt one
> can measure it; the overhead between switching workqueues should be 
> pretty much identical to the overhead switching between workqueue items.
>
> I could do some measurements, but really I don't think it'll yield any
> surprising results.

I'm just not used to seeing drivers create non-global workqueues. I've 
seen some filesystems have workqueues per-super, but
it's not a common pattern around the kernel.

Tejun,
Is this a pattern that we should pursue? Do multiple symmetric 
workqueues really work better (faster, with less overhead) than
a single global workqueue?



* Re: [PATCH 2/4] nvme-tcp: align I/O cpu with blk-mq mapping
  2024-07-03 15:03       ` Sagi Grimberg
@ 2024-07-03 15:40         ` Hannes Reinecke
  2024-07-03 19:38           ` Sagi Grimberg
  0 siblings, 1 reply; 37+ messages in thread
From: Hannes Reinecke @ 2024-07-03 15:40 UTC (permalink / raw)
  To: Sagi Grimberg, Hannes Reinecke; +Cc: Christoph Hellwig, Keith Busch, linux-nvme

On 7/3/24 17:03, Sagi Grimberg wrote:
> 
> 
> On 03/07/2024 17:53, Hannes Reinecke wrote:
>> On 7/3/24 16:19, Sagi Grimberg wrote:
>>>
>>>
>>> On 03/07/2024 16:50, Hannes Reinecke wrote:
>>>> When 'wq_unbound' is selected we should select the
>>>> the first CPU from a given blk-mq hctx mapping to queue
>>>> the tcp workqueue item. With this we can instruct the
>>>> workqueue code to keep the I/O affinity and avoid
>>>> a performance penalty.
>>>
>>> wq_unbound is designed to keep io_cpu to be UNBOUND, my recollection
>>> was the the person introducing it was trying to make the io_cpu 
>>> always be on a specific NUMA node, or a subset of cpus within a numa node. So 
>>> he uses that and tinkers with wq cpumask via sysfs.
>>>
>>> I don't see why you are tying this to wq_unbound in the first place.
>>>
>> Because in the default case the workqueue is nailed to a cpu, and will 
>> not move from it. IE if you call 'queue_work_on()' it _will_ run on 
>> that cpu.
>> But if something else is running on that CPU (printk logging, say), 
>> you will have to stand in the queue until the scheduler gives you some 
>> time.
>>
>> If the workqueue is unbound the workqueue code is able to switch away 
>> from the cpu if it finds it busy or otherwise unsuitable, leading to a 
>> better utilization and avoiding a workqueue stall.
>> And in the 'unbound' case the 'cpu' argument merely serves as a hint
>> where to place the workqueue item.
>> At least, that's how I understood the code.
> 
> We should make the io_cpu come from blk-mq hctx mapping by default, and 
> for every controller it should use a different cpu from the hctx 
> mapping. That is the default behavior. in the wq_unbound case, we skip 
> all of that and make io_cpu = WORK_CPU_UNBOUND, as it was before.
> 
> I'm not sure I follow your logic.
> 
Hehe. That's quite simple: there is none :-)
I have been tinkering with that approach over the last few weeks, but got
consistently _worse_ results than with the original implementation.
So I gave up on trying to make that the default.

>>
>> And it makes the 'CPU hogged' messages go away, which is a bonus in 
>> itself...
> 
> Which messages? aren't these messages saying that the work spent too 
> much time? why are you describing the case where the work does not get
> cpu quota to run?

I mean these messages:

workqueue: nvme_tcp_io_work [nvme_tcp] hogged CPU for >10000us 32771 
times, consider switching to WQ_UNBOUND

which I get consistently during testing with the default implementation.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich




* Re: [PATCH 4/4] nvme-tcp: switch to 'cpu' affinity scope for unbound workqueues
  2024-07-03 15:09       ` Sagi Grimberg
@ 2024-07-03 15:50         ` Hannes Reinecke
  2024-07-04  9:11           ` Sagi Grimberg
  0 siblings, 1 reply; 37+ messages in thread
From: Hannes Reinecke @ 2024-07-03 15:50 UTC (permalink / raw)
  To: Sagi Grimberg, Hannes Reinecke; +Cc: Christoph Hellwig, Keith Busch, linux-nvme

On 7/3/24 17:09, Sagi Grimberg wrote:
> 
> 
> On 03/07/2024 18:01, Hannes Reinecke wrote:
>> On 7/3/24 16:22, Sagi Grimberg wrote:
>>>
>>>
>>> On 03/07/2024 16:50, Hannes Reinecke wrote:
>>>> We should switch to the 'cpu' affinity scope when using the 
>>>> 'wq_unbound'
>>>> parameter as this allows us to keep I/O locality and improve 
>>>> performance.
>>>
>>> Can you please describe more why this is better? locality between what?
>>>
>> Well; the default unbound scope is 'cache', which groups the cpu 
>> according to the cache hierarchy. I want the cpu locality of the 
>> workqueue items to be preserved as much as possible, so I switched
>> to 'cpu' here.
>>
>> I'll get some performance numbers.
>>
>>> While you mention in your cover letter "comments and reviews are 
>>> welcome"
>>> The change logs in your patches are not designed to assist your 
>>> reviewer.
>>
>> I spent the last few weeks trying to come up with a solution based on my
>> original submission, but in the end I gave up as I hadn't been able to
>> fix the original issue.
> 
> Well, the last submission was a discombobulated set of mostly unrelated 
> patches...
> What was it that did not work?
> 
>> This here is a different approach by massaging the 'wq_unbound' 
>> mechanism, which is not only easier but also has the big advantage that
>> it actually works :-)
>> So I did not include a changlog to the previous patchset as this is a 
>> pretty different approach.
>> Sorry if this is confusing.
> 
> It's just difficult to try and understand what each patch contributes, 
> and most of the time the patches
> are under-documented. I want to see the improvements added, but I also 
> want them to be properly reviewed.

Sure. So here are some performance numbers:
(One subsystem, two paths, 96 queues)
default:
4k seq read: bw=365MiB/s (383MB/s), 11.4MiB/s-20.5MiB/s (11.0MB/s-21.5MB/s), io=16.0GiB (17.2GB), run=24950-44907msec
4k rand read: bw=307MiB/s (322MB/s), 9830KiB/s-13.8MiB/s (10.1MB/s-14.5MB/s), io=16.0GiB (17.2GB), run=37081-53333msec
4k seq write: bw=550MiB/s (577MB/s), 17.2MiB/s-28.7MiB/s (18.0MB/s-30.1MB/s), io=16.0GiB (17.2GB), run=17859-29786msec
4k rand write: bw=453MiB/s (475MB/s), 14.2MiB/s-21.3MiB/s (14.8MB/s-22.3MB/s), io=16.0GiB (17.2GB), run=24066-36161msec

unbound:
4k seq read: bw=232MiB/s (243MB/s), 6145KiB/s-9249KiB/s (6293kB/s-9471kB/s), io=13.6GiB (14.6GB), run=56685-60074msec
4k rand read: bw=249MiB/s (261MB/s), 6335KiB/s-9713KiB/s (6487kB/s-9946kB/s), io=14.6GiB (15.7GB), run=53976-60019msec
4k seq write: bw=358MiB/s (375MB/s), 11.2MiB/s-13.5MiB/s (11.7MB/s-14.2MB/s), io=16.0GiB (17.2GB), run=37918-45779msec
4k rand write: bw=335MiB/s (351MB/s), 10.5MiB/s-14.7MiB/s (10.0MB/s-15.4MB/s), io=16.0GiB (17.2GB), run=34929-48971msec

unbound + 'cpu' affinity:
4k seq read: bw=249MiB/s (261MB/s), 6003KiB/s-13.6MiB/s (6147kB/s-14.3MB/s), io=14.6GiB (15.7GB), run=37636-60065msec
4k rand read: bw=305MiB/s (320MB/s), 9773KiB/s-13.9MiB/s (10.0MB/s-14.6MB/s), io=16.0GiB (17.2GB), run=36791-53644msec
4k seq write: bw=499MiB/s (523MB/s), 15.6MiB/s-18.0MiB/s (16.3MB/s-19.9MB/s), io=16.0GiB (17.2GB), run=27018-32860msec
4k rand write: bw=536MiB/s (562MB/s), 16.7MiB/s-21.1MiB/s (17.6MB/s-22.1MB/s), io=16.0GiB (17.2GB), run=24305-30588msec

As you can see, with unbound and 'cpu' affinity we are basically on par
with the default implementation (all tests are run with per-controller
workqueues, mind).
Running the same workload with 4 subsystems and 8 paths runs into
I/O timeouts with the default implementation, but succeeds perfectly with
unbound and 'cpu' affinity.
So definitely an improvement there.

I'll try to dig out performance numbers for the current implementation.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich




* Re: [PATCH 1/4] nvme-tcp: per-controller I/O workqueues
  2024-07-03 15:16       ` Sagi Grimberg
@ 2024-07-03 17:07         ` Tejun Heo
  2024-07-03 19:14           ` Sagi Grimberg
  0 siblings, 1 reply; 37+ messages in thread
From: Tejun Heo @ 2024-07-03 17:07 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Hannes Reinecke, Hannes Reinecke, Christoph Hellwig, Keith Busch,
	linux-nvme

Hello,

On Wed, Jul 03, 2024 at 06:16:32PM +0300, Sagi Grimberg wrote:
...
> > > OK, wonder what is the cost here. Is it in ALL conditions better
> > > than a single workqueue?
> > 
> > Well, clearly not on memory-limited systems; a workqueue per controller
> > takes up more memory that a single one. And it's questionable whether
> > such a system isn't underprovisioned for nvme anyway.

Each workqueue does take up some memory but it's not enormous (I think it's
512 + 512 * nr_cpus + some extra + rescuer if MEM_RECLAIM). Each workqueue
is just a frontend to shared backend worker pools, so splitting a workqueue
into multiple that do about the same work usually won't create more workers.

> > We will see a higher scheduler interaction as the scheduler needs to
> > switch between workqueues, but that was kinda the idea. And I doubt one

This isn't necessarily true. The backend worker pools don't care whether you
have one or multiple workqueues. For per-cpu workqueues, the concurrency
management applies across different workqueues. For unbound workqueues,
because concurrency limit is per workqueue, if there are enough concurrent
work items being queued, the concurrent number of running kworkers may go up
but that's just because the total concurrency went up. Whether you have one
or many workqueues, as long as workqueues share the properties, they map to
the same backend worker pools and execute exactly the same way.

> > can measure it; the overhead between switching workqueues should be
> > pretty much identical to the overhead switching between workqueue items.

They are identical.

> > I could do some measurements, but really I don't think it'll yield any
> > surprising results.
> 
> I'm just not used to seeing drivers create non-global workqueues. I've seen
> some filesystems have workqueues per-super, but
> it's not a common pattern around the kernel.
> 
> Tejun,
> Is this a pattern that we should pursue? Do multiple symmetric workqueues
> really work better (faster, with less overhead) than
> a single global workqueues?

Yeah, there's nothing wrong with creating multiple if for the right reasons.
Here are some reasons I can think of:

- Not wanting to share concurrency limit so that one device can't
  interfere with another. Not sharing rescuer may also have *some* benefits
  although I doubt it'd be all that noticeable.

- To get separate flush domains. e.g. If you want to be able to do
  flush_workqueue() on the work items that service one device without
  getting affected by work items from other devices.

- To get different per-device workqueue attributes - e.g. maybe you wanna
  confine workers serving a specific device to a subset of CPUs or give them
  higher priority.

Note that separating workqueues does not necessarily change how things are
executed. e.g. You don't get your own kworkers.

Thanks.

-- 
tejun



* Re: [PATCH 3/4] workqueue: introduce helper workqueue_unbound_affinity_scope()
  2024-07-03 13:50 ` [PATCH 3/4] workqueue: introduce helper workqueue_unbound_affinity_scope() Hannes Reinecke
@ 2024-07-03 17:31   ` Tejun Heo
  2024-07-04  6:04     ` Hannes Reinecke
  0 siblings, 1 reply; 37+ messages in thread
From: Tejun Heo @ 2024-07-03 17:31 UTC (permalink / raw)
  To: Hannes Reinecke; +Cc: Sagi Grimberg, Christoph Hellwig, Keith Busch, linux-nvme

Hello, Hannes.

On Wed, Jul 03, 2024 at 03:50:20PM +0200, Hannes Reinecke wrote:
> For drivers creating their own workqueue it might be useful to
> switch to the 'cpu' unbound affinity scope to keep the locality
> and reduce contention. As it's cumbersome to instruct the user
> how to switch to affinity scope from userland introduce a helper
> workqueue_unbound_affinity_scope() to let the driver set the
> affinity scope directly.

So, there's already apply_workqueue_attrs() to change the unbound workqueue
attributes including affinity scope. It's cumbersome to use because the
attribute structure has to be allocated and then freed. The alloc/free is
for the cpumasks, so if the API is too cumbersome, maybe we can just make
the cpumask part optional? ie. Allow using workqueue_attrs without going
through alloc if cpumasks don't need to be modified. There probably should
be a function to read attrs from a workqueue too.
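
IOW, today the driver side would have to do something like this (untested
sketch, and glossing over which of these helpers are currently exported
to modules):

	struct workqueue_attrs *attrs;
	int ret = -ENOMEM;

	attrs = alloc_workqueue_attrs();
	if (attrs) {
		attrs->affn_scope = WQ_AFFN_CPU;
		/*
		 * Note: this also re-applies the default cpumask, which
		 * is exactly the cumbersome part.
		 */
		ret = apply_workqueue_attrs(wq, attrs);
	}
	free_workqueue_attrs(attrs);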

Thanks.

-- 
tejun



* Re: [PATCH 1/4] nvme-tcp: per-controller I/O workqueues
  2024-07-03 17:07         ` Tejun Heo
@ 2024-07-03 19:14           ` Sagi Grimberg
  2024-07-03 19:17             ` Tejun Heo
  0 siblings, 1 reply; 37+ messages in thread
From: Sagi Grimberg @ 2024-07-03 19:14 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Hannes Reinecke, Hannes Reinecke, Christoph Hellwig, Keith Busch,
	linux-nvme

>> I'm just not used to seeing drivers create non-global workqueues. I've seen
>> some filesystems have workqueues per-super, but
>> it's not a common pattern around the kernel.
>>
>> Tejun,
>> Is this a pattern that we should pursue? Do multiple symmetric workqueues
>> really work better (faster, with less overhead) than
>> a single global workqueues?
> Yeah, there's nothing wrong with creating multiple if for the right reasons.
> Here are some reasons I can think of:
>
> - Not wanting to share concurrency limit so that one one device can't
>    interfere with another. Not sharing rescuer may also have *some* benefits
>    although I doubt it'd be all that noticeable.
>
> - To get separate flush domains. e.g. If you want to be able to do
>    flush_workqueue() on the work items that service one device without
>    getting affected by work items from other devices.
>
> - To get different per-device workqueue attribtes - e.g. maybe you wanna
>    confine workers serving a specific device to a subset of CPUs or give them
>    higher priority.

None of these is the reason claimed for using separate workqueues in
this patch. The claim is that it is more efficient, i.e. has less overhead.

The commit msg is the following:
"Implement per-controller I/O workqueues to reduce workqueue contention 
during I/O."
> Note that separating workqueues does not necessarily change how things are
> executed. e.g. You don't get your own kworkers.

Yes, that is well understood.



* Re: [PATCH 1/4] nvme-tcp: per-controller I/O workqueues
  2024-07-03 19:14           ` Sagi Grimberg
@ 2024-07-03 19:17             ` Tejun Heo
  2024-07-03 19:41               ` Sagi Grimberg
  2024-07-04  7:36               ` Hannes Reinecke
  0 siblings, 2 replies; 37+ messages in thread
From: Tejun Heo @ 2024-07-03 19:17 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Hannes Reinecke, Hannes Reinecke, Christoph Hellwig, Keith Busch,
	linux-nvme

Hello,

On Wed, Jul 03, 2024 at 10:14:14PM +0300, Sagi Grimberg wrote:
...
> None of these reasons are the claimed reason to use separate workqueues in
> this patch. The claim is that it is more efficient, i.e. has less overhead.
> 
> The commit msg is the following:
> "Implement per-controller I/O workqueues to reduce workqueue contention
> during I/O."

Hmm... it's not impossible for the concurrency accounting in pool_workqueues
to show up if the issue rate is *really* high but I'd be surprised if that
actually matters given that the backend pool is shared. Maybe I'm missing
something but I don't see a reason why multiple workqueues would be more
efficient than a shared one.

Thanks.

-- 
tejun



* Re: [PATCH 2/4] nvme-tcp: align I/O cpu with blk-mq mapping
  2024-07-03 15:40         ` Hannes Reinecke
@ 2024-07-03 19:38           ` Sagi Grimberg
  2024-07-03 19:47             ` Sagi Grimberg
  2024-07-04  6:43             ` Hannes Reinecke
  0 siblings, 2 replies; 37+ messages in thread
From: Sagi Grimberg @ 2024-07-03 19:38 UTC (permalink / raw)
  To: Hannes Reinecke, Hannes Reinecke
  Cc: Christoph Hellwig, Keith Busch, linux-nvme



>>
>>
>> On 03/07/2024 17:53, Hannes Reinecke wrote:
>>> On 7/3/24 16:19, Sagi Grimberg wrote:
>>>>
>>>>
>>>> On 03/07/2024 16:50, Hannes Reinecke wrote:
>>>>> When 'wq_unbound' is selected we should select the
>>>>> the first CPU from a given blk-mq hctx mapping to queue
>>>>> the tcp workqueue item. With this we can instruct the
>>>>> workqueue code to keep the I/O affinity and avoid
>>>>> a performance penalty.
>>>>
>>>> wq_unbound is designed to keep io_cpu to be UNBOUND, my recollection
>>>> was the the person introducing it was trying to make the io_cpu 
>>>> always be on a specific NUMA node, or a subset of cpus within a 
>>>> numa node. So he uses that and tinkers with wq cpumask via sysfs.
>>>>
>>>> I don't see why you are tying this to wq_unbound in the first place.
>>>>
>>> Because in the default case the workqueue is nailed to a cpu, and 
>>> will not move from it. IE if you call 'queue_work_on()' it _will_ 
>>> run on that cpu.
>>> But if something else is running on that CPU (printk logging, say), 
>>> you will have to stand in the queue until the scheduler gives you 
>>> some time.
>>>
>>> If the workqueue is unbound the workqueue code is able to switch 
>>> away from the cpu if it finds it busy or otherwise unsuitable, 
>>> leading to a better utilization and avoiding a workqueue stall.
>>> And in the 'unbound' case the 'cpu' argument merely serves as a hint
>>> where to place the workqueue item.
>>> At least, that's how I understood the code.
>>
>> We should make the io_cpu come from blk-mq hctx mapping by default, 
>> and for every controller it should use a different cpu from the hctx 
>> mapping. That is the default behavior. in the wq_unbound case, we 
>> skip all of that and make io_cpu = WORK_CPU_UNBOUND, as it was before.
>>
>> I'm not sure I follow your logic.
>>
> Hehe. That's quite simple: there is none :-)
> I have been tinkering with that approach in the last weeks, but got 
> consistently _worse_ results than with the original implementation.
> So I gave up on trying to make that the default.

What is the "original implementation" ?
What is your target? nvmet?
What is the fio job file you are using?
what is the queue count? controller count?
What was the queue mapping?

Please let's NOT condition any of this on the wq_unbound option at this
point. This modparam was introduced to address
a specific issue. If we see IO timeouts, we should fix them, not tell
people to flip a modparam as a solution.

>
>>>
>>> And it makes the 'CPU hogged' messages go away, which is a bonus in 
>>> itself...
>>
>> Which messages? aren't these messages saying that the work spent too 
>> much time? why are you describing the case where the work does not get
>> cpu quota to run?
>
> I means these messages:
>
> workqueue: nvme_tcp_io_work [nvme_tcp] hogged CPU for >10000us 32771 
> times, consider switching to WQ_UNBOUND

That means that we are spending too much time in io_work. This is a
separate bug. If you look at nvme_tcp_io_work it has
a stop condition after 1 millisecond. However, when we call
nvme_tcp_try_recv() it just keeps receiving from the socket until
the socket receive buffer has no more payload. So in theory nothing
prevents io_work from looping there forever.

This is indeed a bug that we need to address. Probably by setting
rd_desc.count to some limit, decrementing it for every
skb that we consume, and if we reach that limit and there are more skbs
pending, we break and self-requeue.
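
Something along these lines (completely untested; NVME_TCP_RECV_BUDGET is
a made-up name, just to illustrate the idea):

	/* nvme_tcp_try_recv(): use rd_desc.count as an skb budget instead of 1 */
	rd_desc.count = NVME_TCP_RECV_BUDGET;

	/*
	 * nvme_tcp_recv_skb(): one skb consumed; once the budget hits 0
	 * tcp_read_sock() stops looping and io_work can self-requeue.
	 */
	if (desc->count)
		desc->count--;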

If we indeed spend much time processing a single queue in io_work, it is 
possible that we have a starvation problem
that is escalating to the timeouts you are seeing.

>
> which I get consistently during testing with the default implementation.

Hannes, let's please separate this specific issue from the performance
enhancements.
I do not think that we should search for performance enhancements to
address what appears
to be a logical starvation issue.



* Re: [PATCH 1/4] nvme-tcp: per-controller I/O workqueues
  2024-07-03 19:17             ` Tejun Heo
@ 2024-07-03 19:41               ` Sagi Grimberg
  2024-07-04  7:36               ` Hannes Reinecke
  1 sibling, 0 replies; 37+ messages in thread
From: Sagi Grimberg @ 2024-07-03 19:41 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Hannes Reinecke, Hannes Reinecke, Christoph Hellwig, Keith Busch,
	linux-nvme



On 03/07/2024 22:17, Tejun Heo wrote:
> Hello,
>
> On Wed, Jul 03, 2024 at 10:14:14PM +0300, Sagi Grimberg wrote:
> ...
>> None of these reasons are the claimed reason to use separate workqueues in
>> this patch. The claim is that it is more efficient, i.e. has less overhead.
>>
>> The commit msg is the following:
>> "Implement per-controller I/O workqueues to reduce workqueue contention
>> during I/O."
> Hmm... it's not impossible for the concurrency accounting in pool_workqueues
> to show up if the issue rate is *really* high but I'd be surprised if that
> actually matters given that the backend pool is shared. Maybe I'm missing
> something but I don't see a reason why multiple workqueues would be more
> efficient than a shared one.

That was my assumption as well, that there is no real point in using 
multiple workqueues.
And hence my surprise.

Hannes, can you please provide a quantitative improvement that you saw simply
by separating into different workqueues?



* Re: [PATCH 2/4] nvme-tcp: align I/O cpu with blk-mq mapping
  2024-07-03 19:38           ` Sagi Grimberg
@ 2024-07-03 19:47             ` Sagi Grimberg
  2024-07-04  6:43             ` Hannes Reinecke
  1 sibling, 0 replies; 37+ messages in thread
From: Sagi Grimberg @ 2024-07-03 19:47 UTC (permalink / raw)
  To: Hannes Reinecke, Hannes Reinecke
  Cc: Christoph Hellwig, Keith Busch, linux-nvme


>>
>>>>
>>>> And it makes the 'CPU hogged' messages go away, which is a bonus in 
>>>> itself...
>>>
>>> Which messages? aren't these messages saying that the work spent too 
>>> much time? why are you describing the case where the work does not get
>>> cpu quota to run?
>>
>> I means these messages:
>>
>> workqueue: nvme_tcp_io_work [nvme_tcp] hogged CPU for >10000us 32771 
>> times, consider switching to WQ_UNBOUND
>
> That means that we are spending too much time in io_work, This is a 
> separate bug. If you look at nvme_tcp_io_work it has
> a stop condition after 1 millisecond. However, when we call 
> nvme_tcp_try_recv() it just keeps receiving from the socket until
> the socket receive buffer has no more payload. So in theory nothing 
> prevents from the io_work from looping there forever.
>
> This is indeed a bug that we need to address. Probably by setting 
> rd_desc.count to some limit, decrement it for every
> skb that we consume, and if we reach that limit and there are more 
> skbs pending, we break and self-requeue.
>
> If we indeed spend much time processing a single queue in io_work, it 
> is possible that we have a starvation problem
> that is escalating to the timeouts you are seeing.

btw, if this is indeed the root cause of this issue, we can probably
combine the softirq approach with this:
in data_ready, consume a limited number of skbs directly in softirq (say
32k), and if we have more, schedule io_work
to consume the rest, and pace from there in limits of say 512k bytes or
so...

We may actually end up eating the cake and having it too. But let's see
if this is indeed the root cause first.



* Re: [PATCH 1/4] nvme-tcp: per-controller I/O workqueues
  2024-07-03 13:50 ` [PATCH 1/4] nvme-tcp: per-controller I/O workqueues Hannes Reinecke
  2024-07-03 14:11   ` Sagi Grimberg
@ 2024-07-04  5:36   ` Christoph Hellwig
  1 sibling, 0 replies; 37+ messages in thread
From: Christoph Hellwig @ 2024-07-04  5:36 UTC (permalink / raw)
  To: Hannes Reinecke; +Cc: Sagi Grimberg, Christoph Hellwig, Keith Busch, linux-nvme

On Wed, Jul 03, 2024 at 03:50:18PM +0200, Hannes Reinecke wrote:
> Implement per-controller I/O workqueues to reduce workqueue contention
> during I/O.

Gee, I can see that you are implementing it.  But given that workqueues
are scalable contexts I'd really like to see where you hit limits.
Maybe it's totally expected, maybe you found a bug in the workqueues,
or maybe you just did it because you need some of this later.

You'll need to tell the reviewers and also people finding the inevitable
bugs later on why you are doing this.

> +		queue_work_on(queue->io_cpu, queue->ctrl->io_wq, &queue->io_work);

.. and avoid all the overly long lines.  Maybe by adding a helper to
de-duplicate this code?

> +		queue_work_on(queue->io_cpu, queue->ctrl->io_wq, &queue->io_work);

> +		queue_work_on(queue->io_cpu, queue->ctrl->io_wq, &queue->io_work);

> +	queue_work_on(queue->io_cpu, queue->ctrl->io_wq, &queue->io_work);

> +		queue_work_on(queue->io_cpu, queue->ctrl->io_wq, &queue->io_work);
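
I.e. something like this (untested; the name is just a suggestion):

static inline void nvme_tcp_queue_io_work(struct nvme_tcp_queue *queue)
{
	queue_work_on(queue->io_cpu, queue->ctrl->io_wq, &queue->io_work);
}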




* Re: [PATCH 2/4] nvme-tcp: align I/O cpu with blk-mq mapping
  2024-07-03 14:19   ` Sagi Grimberg
  2024-07-03 14:53     ` Hannes Reinecke
@ 2024-07-04  5:37     ` Christoph Hellwig
  2024-07-04  9:13       ` Sagi Grimberg
  1 sibling, 1 reply; 37+ messages in thread
From: Christoph Hellwig @ 2024-07-04  5:37 UTC (permalink / raw)
  To: Sagi Grimberg; +Cc: Hannes Reinecke, Christoph Hellwig, Keith Busch, linux-nvme

On Wed, Jul 03, 2024 at 05:19:39PM +0300, Sagi Grimberg wrote:
>
>
> On 03/07/2024 16:50, Hannes Reinecke wrote:
>> When 'wq_unbound' is selected we should select the
>> the first CPU from a given blk-mq hctx mapping to queue
>> the tcp workqueue item. With this we can instruct the
>> workqueue code to keep the I/O affinity and avoid
>> a performance penalty.
>
> wq_unbound is designed to keep io_cpu UNBOUND; my recollection was that
> the person introducing it was trying to make the io_cpu always be on a
> specific NUMA node, or a subset of cpus within a NUMA node. So he uses
> that and tinkers with the wq cpumask via sysfs.

Honestly, I think we really should kill this unbound module parameter
before it causes more harm.


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/4] workqueue: introduce helper workqueue_unbound_affinity_scope()
  2024-07-03 17:31   ` Tejun Heo
@ 2024-07-04  6:04     ` Hannes Reinecke
  0 siblings, 0 replies; 37+ messages in thread
From: Hannes Reinecke @ 2024-07-04  6:04 UTC (permalink / raw)
  To: Tejun Heo, Hannes Reinecke
  Cc: Sagi Grimberg, Christoph Hellwig, Keith Busch, linux-nvme

On 7/3/24 19:31, Tejun Heo wrote:
> Hello, Hannes.
> 
> On Wed, Jul 03, 2024 at 03:50:20PM +0200, Hannes Reinecke wrote:
>> For drivers creating their own workqueue it might be useful to
>> switch to the 'cpu' unbound affinity scope to keep the locality
>> and reduce contention. As it's cumbersome to instruct the user
>> how to switch the affinity scope from userland, introduce a helper
>> workqueue_unbound_affinity_scope() to let the driver set the
>> affinity scope directly.
> 
> So, there's already apply_workqueue_attrs() to change the unbound workqueue
> attributes including affinity scope. It's cumbersome to use because the
> attribute structure has to be allocated and then freed. The alloc/free is
> for the cpumasks, so if the API is too cumbersome, maybe we can just make
> the cpumask part optional? ie. Allow using workqueue_attrs without going
> through alloc if cpumasks don't need to be modified. There probably should
> be a function to read attrs from a workqueue too.
> 
Sure; it would be ideal if one could allocate the attributes on the
stack and then just call 'apply_workqueue_attrs()', and if the attrs
don't include a cpumask, just leave it unchanged.
Let me see...
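
For reference, a rough sketch of what driver-side usage looks like with
today's API (names per kernel/workqueue.c; whether all of these helpers
are actually callable from a module is part of the open question here,
so treat it as illustrative only):
--
/* Illustrative only: switch an unbound workqueue to the 'cpu' affinity
 * scope via the existing attrs API.  The alloc/free dance is needed
 * because workqueue_attrs embeds cpumasks, which is the cumbersome part
 * discussed above.
 */
static int example_set_cpu_scope(struct workqueue_struct *wq)
{
	struct workqueue_attrs *attrs;
	int ret;

	attrs = alloc_workqueue_attrs();
	if (!attrs)
		return -ENOMEM;

	attrs->affn_scope = WQ_AFFN_CPU;	/* keep work near the queueing CPU */

	cpus_read_lock();	/* if memory serves, apply_workqueue_attrs()
				 * expects the CPU hotplug lock held */
	ret = apply_workqueue_attrs(wq, attrs);
	cpus_read_unlock();

	free_workqueue_attrs(attrs);
	return ret;
}
--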

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich



^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 2/4] nvme-tcp: align I/O cpu with blk-mq mapping
  2024-07-03 19:38           ` Sagi Grimberg
  2024-07-03 19:47             ` Sagi Grimberg
@ 2024-07-04  6:43             ` Hannes Reinecke
  2024-07-04  9:07               ` Sagi Grimberg
  1 sibling, 1 reply; 37+ messages in thread
From: Hannes Reinecke @ 2024-07-04  6:43 UTC (permalink / raw)
  To: Sagi Grimberg, Hannes Reinecke; +Cc: Christoph Hellwig, Keith Busch, linux-nvme

On 7/3/24 21:38, Sagi Grimberg wrote:
[ .. ]
>>>
>>> We should make the io_cpu come from blk-mq hctx mapping by default, 
>>> and for every controller it should use a different cpu from the hctx 
>>> mapping. That is the default behavior. in the wq_unbound case, we 
>>> skip all of that and make io_cpu = WORK_CPU_UNBOUND, as it was before.
>>>
>>> I'm not sure I follow your logic.
>>>
>> Hehe. That's quite simple: there is none :-)
>> I have been tinkering with that approach in the last weeks, but got 
>> consistently _worse_ results than with the original implementation.
>> So I gave up on trying to make that the default.
> 
> What is the "original implementation" ?

nvme-6.10

> What is your target? nvmet?

nvmet with brd backend

> What is the fio job file you are using?

tiobench-example.fio from the fio samples

> what is the queue count? controller count?

96 queues, 4 subsystems, 2 controller each.

> What was the queue mapping?
> 
queue 0-5 maps to cpu 6-11
queue 6-11 maps to cpu 54-59
queue 12-17 maps to cpu 18-23
queue 18-23 maps to cpu 66-71
queue 24-29 maps to cpu 24-29
queue 30-35 maps to cpu 72-77
queue 36-41 maps to cpu 30-35
queue 42-47 maps to cpu 78-83
queue 48-53 maps to cpu 36-41
queue 54-59 maps to cpu 84-89
queue 60-65 maps to cpu 42-47
queue 66-71 maps to cpu 90-95
queue 72-77 maps to cpu 12-17
queue 78-83 maps to cpu 60-65
queue 84-89 maps to cpu 0-5
queue 90-95 maps to cpu 48-53

> Please lets NOT condition any of this on wq_unbound option at this 
> point. This modparam was introduced to address
> a specific issue. If we see IO timeouts, we should fix them, not tell 
> people to filp a modparam as a solution.
> 
Thing is, there is no 'best' solution. The current implementation is
actually quite good in the single subsystem case. Issues start to appear
when doing performance testing with a really high load.
The reason for this is high contention on the per-cpu workqueues, which
are simply overwhelmed by doing I/O _and_ servicing 'normal' OS workload
like writing to disk etc.
Switching to wq_unbound reduces the contention and makes the system
scale better, but that scaling leads to a performance regression for
the single subsystem case.
(See my other mail for performance numbers.)
So what is 'better'?

>>
>>>>
>>>> And it makes the 'CPU hogged' messages go away, which is a bonus in 
>>>> itself...
>>>
>>> Which messages? aren't these messages saying that the work spent too 
>>> much time? why are you describing the case where the work does not get
>>> cpu quota to run?
>>
>> I mean these messages:
>>
>> workqueue: nvme_tcp_io_work [nvme_tcp] hogged CPU for >10000us 32771 
>> times, consider switching to WQ_UNBOUND
> 
> That means that we are spending too much time in io_work, This is a 
> separate bug. If you look at nvme_tcp_io_work it has
> a stop condition after 1 millisecond. However, when we call 
> nvme_tcp_try_recv() it just keeps receiving from the socket until
> the socket receive buffer has no more payload. So in theory nothing 
> prevents from the io_work from looping there forever.
> 
Oh, no. It's not the loop that is the problem; it's the actual sending
which takes too long. In my test runs I've seen about 250 requests
timing out, the majority of which were still pending on the send_list.
So the io_work function wasn't even running to fetch the requests off
the list.

> This is indeed a bug that we need to address. Probably by setting 
> rd_desc.count to some limit, decrement it for every
> skb that we consume, and if we reach that limit and there are more skbs 
> pending, we break and self-requeue.
> 
> If we indeed spend much time processing a single queue in io_work, it is 
> possible that we have a starvation problem
> that is escalating to the timeouts you are seeing.
> 
See above; this is the problem. Most of the requests are still stuck on 
the send_list (with some even still on the req_list) when timeouts 
occur. This means the io_work function is not being scheduled fast 
enough (or often enough) to fetch the requests from the list.

My theory here is that this is due to us using bound workqueues:
each workqueue function has to execute on a given cpu, and we can
only schedule one io_work function per cpu. So if that cpu is busy
(with receiving packets, say, or normal OS tasks) we cannot execute,
and we see starvation.

With wq_unbound we are _not_ tied to a specific cpu, but rather
scheduled in a round-robin fashion. This avoids the starvation, and
hence the I/O timeouts do not occur.
But we need to set the 'cpu' affinity scope for wq_unbound to keep
the cache locality, otherwise the performance _really_ suffers
as we're bouncing threads all over the place.
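
For reference, the only thing the modparam changes in nvme-6.10 is the
flag set the shared workqueue is created with; roughly (paraphrased from
memory, so the exact flags may differ):
--
static int __init nvme_tcp_init_module(void)
{
	unsigned int wq_flags = WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_SYSFS;
	int ret;

	/* wq_unbound only toggles WQ_UNBOUND on the one shared workqueue,
	 * i.e. whether io_work is pinned to queue->io_cpu or handled by
	 * the unbound worker pools. */
	if (wq_unbound)
		wq_flags |= WQ_UNBOUND;

	nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq", wq_flags, 0);
	if (!nvme_tcp_wq)
		return -ENOMEM;

	ret = nvmf_register_transport(&nvme_tcp_transport);
	if (ret) {
		destroy_workqueue(nvme_tcp_wq);
		return ret;
	}
	return 0;
}
--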

>>
>> which I get consistently during testing with the default implementation.
> 
> Hannes, let's please separate this specific issue with the performance 
> enhancements.
> I do not think that we should search for performance enhancements to 
> address what appears to be a logical starvation issue.

I am perfectly fine with that approach. This patchset is indeed just to 
address the I/O timeout issues I've been seeing.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich



^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/4] nvme-tcp: per-controller I/O workqueues
  2024-07-03 19:17             ` Tejun Heo
  2024-07-03 19:41               ` Sagi Grimberg
@ 2024-07-04  7:36               ` Hannes Reinecke
  2024-07-05  7:10                 ` Christoph Hellwig
  1 sibling, 1 reply; 37+ messages in thread
From: Hannes Reinecke @ 2024-07-04  7:36 UTC (permalink / raw)
  To: Tejun Heo, Sagi Grimberg
  Cc: Hannes Reinecke, Christoph Hellwig, Keith Busch, linux-nvme

On 7/3/24 21:17, Tejun Heo wrote:
> Hello,
> 
> On Wed, Jul 03, 2024 at 10:14:14PM +0300, Sagi Grimberg wrote:
> ...
>> None of these reasons are the claimed reason to use separate workqueues in
>> this patch. The claim is that it is more efficient, i.e. has less overhead.
>>
>> The commit msg is the following:
>> "Implement per-controller I/O workqueues to reduce workqueue contention
>> during I/O."
> 
> Hmm... it's not impossible for the concurrency accounting in pool_workqueues
> to show up if the issue rate is *really* high but I'd be surprised if that
> actually matters given that the backend pool is shared. Maybe I'm missing
> something but I don't see a reason why multiple workqueues would be more
> efficient than a shared one.
> 
Well, I seem to run into the 'really high' issue rate case:

		unbound workqueue		bound workqueue
		single wq	multi wq	single wq	multi wq
4k seq read:	247MiB/s	249MiB/s	263MiB/s	365MiB/s
4k rand read:	294MiB/s	305MiB/s	279MiB/s	307MiB/s
4k seq write:	504MiB/s	499MiB/s	521MiB/s	550MiB/s
4k rand write:	531MiB/s	536MiB/s	476MiB/s	453MiB/s

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich



^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 2/4] nvme-tcp: align I/O cpu with blk-mq mapping
  2024-07-04  6:43             ` Hannes Reinecke
@ 2024-07-04  9:07               ` Sagi Grimberg
  2024-07-04 14:03                 ` Hannes Reinecke
  0 siblings, 1 reply; 37+ messages in thread
From: Sagi Grimberg @ 2024-07-04  9:07 UTC (permalink / raw)
  To: Hannes Reinecke, Hannes Reinecke
  Cc: Christoph Hellwig, Keith Busch, linux-nvme



On 7/4/24 09:43, Hannes Reinecke wrote:
> On 7/3/24 21:38, Sagi Grimberg wrote:
> [ .. ]
>>>>
>>>> We should make the io_cpu come from blk-mq hctx mapping by default, 
>>>> and for every controller it should use a different cpu from the 
>>>> hctx mapping. That is the default behavior. in the wq_unbound case, 
>>>> we skip all of that and make io_cpu = WORK_CPU_UNBOUND, as it was 
>>>> before.
>>>>
>>>> I'm not sure I follow your logic.
>>>>
>>> Hehe. That's quite simple: there is none :-)
>>> I have been tinkering with that approach in the last weeks, but got 
>>> consistently _worse_ results than with the original implementation.
>>> So I gave up on trying to make that the default.
>>
>> What is the "original implementation" ?
>
> nvme-6.10
>
>> What is you target? nvmet?
>
> nvmet with brd backend
>
>> What is the fio job file you are using?
>
> tiobench-example.fio from the fio samples
>
>> what is the queue count? controller count?
>
> 96 queues, 4 subsystems, 2 controller each.
>
>> What was the queue mapping?
>>
> queue 0-5 maps to cpu 6-11
> queue 6-11 maps to cpu 54-59
> queue 12-17 maps to cpu 18-23
> queue 18-23 maps to cpu 66-71
> queue 24-29 maps to cpu 24-29
> queue 30-35 maps to cpu 72-77
> queue 36-41 maps to cpu 30-35
> queue 42-47 maps to cpu 78-83
> queue 48-53 maps to cpu 36-41
> queue 54-59 maps to cpu 84-89
> queue 60-65 maps to cpu 42-47
> queue 66-71 maps to cpu 90-95
> queue 72-77 maps to cpu 12-17
> queue 78-83 maps to cpu 60-65
> queue 84-89 maps to cpu 0-5
> queue 90-95 maps to cpu 48-53

What are the io_cpu values for each queue?

>
>> Please lets NOT condition any of this on wq_unbound option at this 
>> point. This modparam was introduced to address
>> a specific issue. If we see IO timeouts, we should fix them, not tell 
>> people to filp a modparam as a solution.
>>
> Thing is, there is no 'best' solution. The current implementation is 
> actually quite good in the single subsystem case. Issues start to appear
> when doing performance testing with a really high load.
> Reason for this is a high contention on the per-cpu workqueues, which 
> are simply overwhelmed by doing I/O _and_ servicing 'normal' OS workload
> like writing to disk etc.

What other 'normal' workloads are you seeing in your test?

> Switching to wq_unbound reduces the contention and makes the system to 
> scale better, but that scaling leads to a performance regression for
> the single subsystem case.

The kthreads are the same kthreads, the only difference is concurrency 
management, which may take advantage of different cpu cores, but pays
a price in latency and bouncing cpus.

> (See my other mail for performance numbers)
> So what is 'better'?

Which email? To Tejun? It seems that bound is better than unbound in
all cases. You are suggesting that you regress with multiple
controllers, though. That is why I suggested that we _spread_ queue
io_cpu assignments for the bound case (as I suggested in your first
attempt); I'd want to see what happens in that case.

>
>>>
>>>>>
>>>>> And it makes the 'CPU hogged' messages go away, which is a bonus 
>>>>> in itself...
>>>>
>>>> Which messages? aren't these messages saying that the work spent 
>>>> too much time? why are you describing the case where the work does 
>>>> not get
>>>> cpu quota to run?
>>>
>>> I means these messages:
>>>
>>> workqueue: nvme_tcp_io_work [nvme_tcp] hogged CPU for >10000us 32771 
>>> times, consider switching to WQ_UNBOUND
>>
>> That means that we are spending too much time in io_work, This is a 
>> separate bug. If you look at nvme_tcp_io_work it has
>> a stop condition after 1 millisecond. However, when we call 
>> nvme_tcp_try_recv() it just keeps receiving from the socket until
>> the socket receive buffer has no more payload. So in theory nothing 
>> prevents from the io_work from looping there forever.
>>
> Oh, no. It's not the loop which is the problem. It's the actual sending
> which takes long; in my test runs I've seen about 250 requests timing 
> out, the majority of which was still pending on the send_list.
> So the io_work function wasn't even running to fetch the requests off 
> the list.

That is unexpected. This means that either the socket buffer is full
(is it?), or that the rx is taking a long time, taking away tx cpu
time. I am not sure I understand why an unbound wq would solve either,
other than maybe hiding it in some cases.

In what workloads do you see this issue? reads/writes?

Can you try the following patch:
--
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 0b04a2bde81d..3360de9ef034 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -350,7 +350,7 @@ static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
 	}
 }
 
-static inline void nvme_tcp_send_all(struct nvme_tcp_queue *queue)
+static inline int nvme_tcp_send_all(struct nvme_tcp_queue *queue)
 {
 	int ret;
 
@@ -358,6 +358,7 @@ static inline void nvme_tcp_send_all(struct nvme_tcp_queue *queue)
 	do {
 		ret = nvme_tcp_try_send(queue);
 	} while (ret > 0);
+	return ret;
 }
 
 static inline bool nvme_tcp_queue_has_pending(struct nvme_tcp_queue *queue)
@@ -1276,7 +1277,7 @@ static void nvme_tcp_io_work(struct work_struct *w)
 		int result;
 
 		if (mutex_trylock(&queue->send_mutex)) {
-			result = nvme_tcp_try_send(queue);
+			result = nvme_tcp_send_all(queue);
 			mutex_unlock(&queue->send_mutex);
 			if (result > 0)
 				pending = true;
--

Just to understand if there is a starvation problem where, in
io_work(), the tx is paced but the rx is not.
>
>> This is indeed a bug that we need to address. Probably by setting 
>> rd_desc.count to some limit, decrement it for every
>> skb that we consume, and if we reach that limit and there are more 
>> skbs pending, we break and self-requeue.
>>
>> If we indeed spend much time processing a single queue in io_work, it 
>> is possible that we have a starvation problem
>> that is escalating to the timeouts you are seeing.
>>
> See above; this is the problem. Most of the requests are still stuck 
> on the send_list (with some even still on the req_list) when timeouts 
> occur. This means the io_work function is not being scheduled fast 
> enough (or often enough) to fetch the requests from the list.

Or, maybe some other root cause is creating a large backlog of send
requests.

>
> My theory here is that this is due to us using bound workqueues;
> each workqueue function has to execute on a given cpu, and we can
> only schedule one io_work function per cpu. So if that cpu is busy
> (with receiving packets, say, or normal OS tasks) we cannot execute,
> and we're seeing a starvation.

The issue that you are seeing, io_work running for over 10ms, is the
first red light here, because it has an explicit stop condition after
1ms. This means that a single while loop is exceeding 10ms. Something
is causing that.

>
> With wq_unbound we are _not_ tied to a specific cpu, but rather
> scheduled in a round-robin fashion. This avoids the starvation
> and hence the I/O timeouts do not occur.
> But we need to set the 'cpu' affinity for wq_unbound to keep
> the cache locality, otherwise the performance _really_ suffers
> as we're bouncing threads all over the place.

The unbound workqueue exists to solve a specific user issue. I think
that if we come to the conclusion that unbound workqueues are better
(i.e. we have a bug that is a result of a bound workqueue), we change
it and stop using bound workqueues altogether.

To be clear, I do not hold that opinion yet.

>
>>>
>>> which I get consistently during testing with the default 
>>> implementation.
>>
>> Hannes, let's please separate this specific issue with the 
>> performance enhancements.
>> I do not think that we should search for performance enhancements to 
>> address what appears to be a logical starvation issue.
>
> I am perfectly fine with that approach. This patchset is indeed just 
> to address the I/O timeout issues I've been seeing.

And the solution cannot be "use unbound workqueues - change this modparam".


^ permalink raw reply related	[flat|nested] 37+ messages in thread

* Re: [PATCH 4/4] nvme-tcp: switch to 'cpu' affinity scope for unbound workqueues
  2024-07-03 15:50         ` Hannes Reinecke
@ 2024-07-04  9:11           ` Sagi Grimberg
  2024-07-04 15:54             ` Hannes Reinecke
  0 siblings, 1 reply; 37+ messages in thread
From: Sagi Grimberg @ 2024-07-04  9:11 UTC (permalink / raw)
  To: Hannes Reinecke, Hannes Reinecke
  Cc: Christoph Hellwig, Keith Busch, linux-nvme



On 7/3/24 18:50, Hannes Reinecke wrote:
> On 7/3/24 17:09, Sagi Grimberg wrote:
>>
>>
>> On 03/07/2024 18:01, Hannes Reinecke wrote:
>>> On 7/3/24 16:22, Sagi Grimberg wrote:
>>>>
>>>>
>>>> On 03/07/2024 16:50, Hannes Reinecke wrote:
>>>>> We should switch to the 'cpu' affinity scope when using the 
>>>>> 'wq_unbound'
>>>>> parameter as this allows us to keep I/O locality and improve 
>>>>> performance.
>>>>
>>>> Can you please describe more why this is better? locality between 
>>>> what?
>>>>
>>> Well; the default unbound scope is 'cache', which groups the cpu 
>>> according to the cache hierarchy. I want the cpu locality of the 
>>> workqueue items to be preserved as much as possible, so I switched
>>> to 'cpu' here.
>>>
>>> I'll get some performance numbers.
>>>
>>>> While you mention in your cover letter "comments and reviews are 
>>>> welcome"
>>>> The change logs in your patches are not designed to assist your 
>>>> reviewer.
>>>
>>> I spent the last few weeks trying to come up with a solution based 
>>> on my
>>> original submission, but in the end I gave up as I hadn't been able to
>>> fix the original issue.
>>
>> Well, the last submission was a discombobulated set of mostly 
>> unrelated patches...
>> What was it that did not work?
>>
>>> This here is a different approach by massaging the 'wq_unbound' 
>>> mechanism, which is not only easier but also has the big advantage that
>>> it actually works :-)
>>> So I did not include a changelog to the previous patchset as this is 
>>> a pretty different approach.
>>> Sorry if this is confusing.
>>
>> It's just difficult to try and understand what each patch 
>> contributes, and most of the time the patches
>> are under-documented. I want to see the improvements added, but I 
>> also want them to be properly reviewed.
>
> Sure. So here are some performance number:
> (One subsystem, two paths, 96 queues)
> default:
> 4k seq read: bw=365MiB/s (383MB/s), 11.4MiB/s-20.5MiB/s 
> (11.0MB/s-21.5MB/s), io=16.0GiB (17.2GB), run=24950-44907msec
> 4k rand read: bw=307MiB/s (322MB/s), 9830KiB/s-13.8MiB/s 
> (10.1MB/s-14.5MB/s), io=16.0GiB (17.2GB), run=37081-53333msec
> 4k seq write: bw=550MiB/s (577MB/s), 17.2MiB/s-28.7MiB/s 
> (18.0MB/s-30.1MB/s), io=16.0GiB (17.2GB), run=17859-29786msec
> 4k rand write: bw=453MiB/s (475MB/s), 14.2MiB/s-21.3MiB/s 
> (14.8MB/s-22.3MB/s), io=16.0GiB (17.2GB), run=24066-36161msec
>
> unbound:
> 4k seq read: bw=232MiB/s (243MB/s), 6145KiB/s-9249KiB/s 
> (6293kB/s-9471kB/s), io=13.6GiB (14.6GB), run=56685-60074msec
> 4k rand read: bw=249MiB/s (261MB/s), 6335KiB/s-9713KiB/s 
> (6487kB/s-9946kB/s), io=14.6GiB (15.7GB), run=53976-60019msec
> 4k seq write: bw=358MiB/s (375MB/s), 11.2MiB/s-13.5MiB/s 
> (11.7MB/s-14.2MB/s), io=16.0GiB (17.2GB), run=37918-45779msec
> 4k rand write: bw=335MiB/s (351MB/s), 10.5MiB/s-14.7MiB/s 
> (10.0MB/s-15.4MB/s), io=16.0GiB (17.2GB), run=34929-48971msec
>
> unbound + 'cpu' affinity:
> 4k seq read: bw=249MiB/s (261MB/s), 6003KiB/s-13.6MiB/s 
> (6147kB/s-14.3MB/s), io=14.6GiB (15.7GB), run=37636-60065msec
> 4k rand read: bw=305MiB/s (320MB/s), 9773KiB/s-13.9MiB/s 
> (10.0MB/s-14.6MB/s), io=16.0GiB (17.2GB), run=36791-53644msec
> 4k seq write: bw=499MiB/s (523MB/s), 15.6MiB/s-18.0MiB/s 
> (16.3MB/s-19.9MB/s), io=16.0GiB (17.2GB), run=27018-32860msec
> 4k rand write: bw=536MiB/s (562MB/s), 16.7MiB/s-21.1MiB/s 
> (17.6MB/s-22.1MB/s), io=16.0GiB (17.2GB), run=24305-30588msec
>
> As you can see, with unbound and 'cpu' affinity we are basically on par
> with the default implementations (all tests are run with per-controller
> workqueues, mind).

I'm puzzled that the seq vs. rand results vary this much when you work
against a brd device.
Are these results stable?

> Running the same workload with 4 subsystems and 8 paths will run into
> I/O timeouts for the default implementation, but perfectly succeed with
> unbound and 'cpu' affinity.
> So definitely an improvement there.

I tend to think that the I/O timeouts are caused by a bug, not by
"non-optimized" code. I/O timeouts are an eternity for this test, which
makes me think we have a different issue here.


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 2/4] nvme-tcp: align I/O cpu with blk-mq mapping
  2024-07-04  5:37     ` Christoph Hellwig
@ 2024-07-04  9:13       ` Sagi Grimberg
  0 siblings, 0 replies; 37+ messages in thread
From: Sagi Grimberg @ 2024-07-04  9:13 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Hannes Reinecke, Keith Busch, linux-nvme



On 7/4/24 08:37, Christoph Hellwig wrote:
> On Wed, Jul 03, 2024 at 05:19:39PM +0300, Sagi Grimberg wrote:
>>
>> On 03/07/2024 16:50, Hannes Reinecke wrote:
>>> When 'wq_unbound' is selected we should select the first CPU from a
>>> given blk-mq hctx mapping to queue the tcp workqueue item. With this
>>> we can instruct the workqueue code to keep the I/O affinity and avoid
>>> a performance penalty.
>> wq_unbound is designed to keep io_cpu UNBOUND; my recollection was that
>> the person introducing it was trying to make the io_cpu always be on a
>> specific NUMA node, or a subset of cpus within a NUMA node. So he uses
>> that and tinkers with the wq cpumask via sysfs.
> Honestly, I think we really should kill this unbound module parameter
> before it causes more harm.

It solves a point problem that I don't have a good idea how to solve
otherwise. It can stay there, but we should be careful before we choose
to evolve it.


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 2/4] nvme-tcp: align I/O cpu with blk-mq mapping
  2024-07-04  9:07               ` Sagi Grimberg
@ 2024-07-04 14:03                 ` Hannes Reinecke
  0 siblings, 0 replies; 37+ messages in thread
From: Hannes Reinecke @ 2024-07-04 14:03 UTC (permalink / raw)
  To: Sagi Grimberg, Hannes Reinecke; +Cc: Christoph Hellwig, Keith Busch, linux-nvme

On 7/4/24 11:07, Sagi Grimberg wrote:
> 
> 
> On 7/4/24 09:43, Hannes Reinecke wrote:
>> On 7/3/24 21:38, Sagi Grimberg wrote:
>> [ .. ]
>>>>>
>>>>> We should make the io_cpu come from blk-mq hctx mapping by default, 
>>>>> and for every controller it should use a different cpu from the 
>>>>> hctx mapping. That is the default behavior. in the wq_unbound case, 
>>>>> we skip all of that and make io_cpu = WORK_CPU_UNBOUND, as it was 
>>>>> before.
>>>>>
>>>>> I'm not sure I follow your logic.
>>>>>
>>>> Hehe. That's quite simple: there is none :-)
>>>> I have been tinkering with that approach in the last weeks, but got 
>>>> consistently _worse_ results than with the original implementation.
>>>> So I gave up on trying to make that the default.
>>>
>>> What is the "original implementation" ?
>>
>> nvme-6.10
>>
>>> What is you target? nvmet?
>>
>> nvmet with brd backend
>>
>>> What is the fio job file you are using?
>>
>> tiobench-example.fio from the fio samples
>>
>>> what is the queue count? controller count?
>>
>> 96 queues, 4 subsystems, 2 controller each.
>>
>>> What was the queue mapping?
>>>
>> queue 0-5 maps to cpu 6-11
>> queue 6-11 maps to cpu 54-59
>> queue 12-17 maps to cpu 18-23
>> queue 18-23 maps to cpu 66-71
>> queue 24-29 maps to cpu 24-29
>> queue 30-35 maps to cpu 72-77
>> queue 36-41 maps to cpu 30-35
>> queue 42-47 maps to cpu 78-83
>> queue 48-53 maps to cpu 36-41
>> queue 54-59 maps to cpu 84-89
>> queue 60-65 maps to cpu 42-47
>> queue 66-71 maps to cpu 90-95
>> queue 72-77 maps to cpu 12-17
>> queue 78-83 maps to cpu 60-65
>> queue 84-89 maps to cpu 0-5
>> queue 90-95 maps to cpu 48-53
> 
> What are the io_cpu for each queue?
> 
These are the io_cpu mappings; 'maps to' means the value of io_cpu,
i.e. queue 0 has an io_cpu value of 6.

>>
>>> Please lets NOT condition any of this on wq_unbound option at this 
>>> point. This modparam was introduced to address
>>> a specific issue. If we see IO timeouts, we should fix them, not tell 
>>> people to filp a modparam as a solution.
>>>
>> Thing is, there is no 'best' solution. The current implementation is 
>> actually quite good in the single subsystem case. Issues start to appear
>> when doing performance testing with a really high load.
>> Reason for this is a high contention on the per-cpu workqueues, which 
>> are simply overwhelmed by doing I/O _and_ servicing 'normal' OS workload
>> like writing to disk etc.
> 
> What other 'normal' workloads are you seeing in your test?
> 
Well, it is an OS, after all. And that OS has some housekeeping to do 
like writing out the fio output etc.

>> Switching to wq_unbound reduces the contention and makes the system to 
>> scale better, but that scaling leads to a performance regression for
>> the single subsystem case.
> 
> The kthreads are the same kthreads, the only difference is concurrency 
> management, which may take advantage of different cpu cores, but pays
> a price in latency and bouncing cpus.
> 
Precisely. That's why I'm seeing slightly better performance with the
existing implementation in the single subsystem case.

>> (See my other mail for performance numbers)
>> So what is 'better'?
> 
> Which email? To Tejun? it seems that bound is better than unbound for 
> all cases. You are suggesting that you regress with multiple controllers
> though. That is why I suggested that we _spread_ queue io_cpu 
> assignments for the bound case (as I suggested in your first attempt), 
> I'd want to see what happens in this case.
> 
Correct; this is for the single subsystem case, for which the current
implementation is more efficient than the unbound one.

>>
>>>>
>>>>>>
>>>>>> And it makes the 'CPU hogged' messages go away, which is a bonus 
>>>>>> in itself...
>>>>>
>>>>> Which messages? aren't these messages saying that the work spent 
>>>>> too much time? why are you describing the case where the work does 
>>>>> not get
>>>>> cpu quota to run?
>>>>
>>>> I means these messages:
>>>>
>>>> workqueue: nvme_tcp_io_work [nvme_tcp] hogged CPU for >10000us 32771 
>>>> times, consider switching to WQ_UNBOUND
>>>
>>> That means that we are spending too much time in io_work, This is a 
>>> separate bug. If you look at nvme_tcp_io_work it has
>>> a stop condition after 1 millisecond. However, when we call 
>>> nvme_tcp_try_recv() it just keeps receiving from the socket until
>>> the socket receive buffer has no more payload. So in theory nothing 
>>> prevents from the io_work from looping there forever.
>>>
>> Oh, no. It's not the loop which is the problem. It's the actual sending
>> which takes long; in my test runs I've seen about 250 requests timing 
>> out, the majority of which was still pending on the send_list.
>> So the io_work function wasn't even running to fetch the requests off 
>> the list.
> 
> That, is unexpected. This means that either the socket buffer is full 
> (is it?), or that the rx is taking a long time, taking
> away tx cpu time. I am not sure I understand why would unbound wq would 
> solve either other than maybe hide it in
> some cases.
> 
> In what workloads do you see this issue? reads/writes?
> 
Both, unfortunately.
Funny data point, though: for 2 subsystems I see I/O timeouts when 
writing the fio log to disk. There are no I/O timeouts when fio just
prints to stdout.
So it _is_ the OS which drives up starvation.

> Can you try the following patch:
> -- 
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index 0b04a2bde81d..3360de9ef034 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -350,7 +350,7 @@ static inline void nvme_tcp_advance_req(struct 
> nvme_tcp_request *req,
>          }
>   }
> 
> -static inline void nvme_tcp_send_all(struct nvme_tcp_queue *queue)
> +static inline int nvme_tcp_send_all(struct nvme_tcp_queue *queue)
>   {
>          int ret;
> 
> @@ -358,6 +358,7 @@ static inline void nvme_tcp_send_all(struct 
> nvme_tcp_queue *queue)
>          do {
>                  ret = nvme_tcp_try_send(queue);
>          } while (ret > 0);
> +       return ret;
>   }
> 
>   static inline bool nvme_tcp_queue_has_pending(struct nvme_tcp_queue 
> *queue)
> @@ -1276,7 +1277,7 @@ static void nvme_tcp_io_work(struct work_struct *w)
>                  int result;
> 
>                  if (mutex_trylock(&queue->send_mutex)) {
> -                       result = nvme_tcp_try_send(queue);
> +                       result = nvme_tcp_send_all(queue);
>                          mutex_unlock(&queue->send_mutex);
>                          if (result > 0)
>                                  pending = true;
> -- 
> 
> Just to understand if there is a starvation problem where in io_work() 
> the tx is paced, but the rx is not.

Surprise, surprise, that does work.
Together with aligning io_cpu with the blk-mq mapping I see a
substantial improvement for the 4 subsystem case: no I/O timeouts,
and contention (as measured by the number of 'nvme_tcp_io_work
[nvme_tcp] hogged CPU' messages) has been reduced, too.
The performance impact is mixed; you win some, you lose some, with
nothing conclusive either way.

>>
>>> This is indeed a bug that we need to address. Probably by setting 
>>> rd_desc.count to some limit, decrement it for every
>>> skb that we consume, and if we reach that limit and there are more 
>>> skbs pending, we break and self-requeue.
>>>
>>> If we indeed spend much time processing a single queue in io_work, it 
>>> is possible that we have a starvation problem
>>> that is escalating to the timeouts you are seeing.
>>>
>> See above; this is the problem. Most of the requests are still stuck 
>> on the send_list (with some even still on the req_list) when timeouts 
>> occur. This means the io_work function is not being scheduled fast 
>> enough (or often enough) to fetch the requests from the list.
> 
> Or, maybe some other root-cause is creating a large backlog of send 
> requests.
> 
What definitely contributes to the backlog is cpu time starvation.
With the original implementation I can easily trigger I/O timeouts by
writing the fio output to disk; writing to stdout pretty much avoids
the I/O timeouts.

I'll try to get some latency numbers for send and receive.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich



^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 4/4] nvme-tcp: switch to 'cpu' affinity scope for unbound workqueues
  2024-07-04  9:11           ` Sagi Grimberg
@ 2024-07-04 15:54             ` Hannes Reinecke
  2024-07-05 11:48               ` Sagi Grimberg
  0 siblings, 1 reply; 37+ messages in thread
From: Hannes Reinecke @ 2024-07-04 15:54 UTC (permalink / raw)
  To: Sagi Grimberg, Hannes Reinecke; +Cc: Christoph Hellwig, Keith Busch, linux-nvme

On 7/4/24 11:11, Sagi Grimberg wrote:
> 
> 
> On 7/3/24 18:50, Hannes Reinecke wrote:
[ .. ]
>>
>> As you can see, with unbound and 'cpu' affinity we are basically on par
>> with the default implementations (all tests are run with per-controller
>> workqueues, mind).
> 
> I'm puzzled that the seq vs. rand vary this much when you work against a 
> brd device.
> Are these results stable?
> 
There is quite a bit of flutter between runs, but the overall picture
doesn't change.

>> Running the same workload with 4 subsystems and 8 paths will run into
>> I/O timeouts for the default implementation, but perfectly succeed with
>> unbound and 'cpu' affinity.
>> So definitely an improvement there.
> 
> I tend to think that the io timeouts are caused by a bug, not by "non 
> optimized" code. io timeouts are eternity for this test, which makes me
> think we have a different issue here.

I did some latency measurements for the send and receive loop, and
found that we are in fact starved by the receive side. The sending side
is pretty well limited by the 'deadline' setting, but the receiving
side has no such precaution, and I have seen per-queue receive
latencies of over 5 milliseconds.
The worrying thing here was that only individual queues were affected;
most queues had the expected latency of around 50usecs, but some really
went over the top with thousands of usecs. And these were the queues
which were generating I/O timeouts.
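
For context, the io_work loop in nvme-6.10 looks roughly like this
(paraphrased from memory, so details may differ); the deadline is only
checked between iterations, so a single nvme_tcp_try_recv() call that
keeps finding data can overrun it:
--
static void nvme_tcp_io_work(struct work_struct *w)
{
	struct nvme_tcp_queue *queue =
		container_of(w, struct nvme_tcp_queue, io_work);
	unsigned long deadline = jiffies + msecs_to_jiffies(1);

	do {
		bool pending = false;
		int result;

		if (mutex_trylock(&queue->send_mutex)) {
			result = nvme_tcp_try_send(queue); /* paced by the loop */
			mutex_unlock(&queue->send_mutex);
			if (result > 0)
				pending = true;
			else if (unlikely(result < 0))
				break;
		}

		result = nvme_tcp_try_recv(queue); /* drains the socket in one go */
		if (result > 0)
			pending = true;
		else if (unlikely(result < 0))
			return;

		if (!pending || !queue->rd_enabled)
			return;

	} while (!time_after(jiffies, deadline)); /* quota exhausted */

	queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
}
--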

I have now modified the deadline method to cover both the receive and
the send side, and the results were pretty good; the timeouts are gone
and even the overall performance for the 4 subsystem case has gone up.

Will be posting an updated patchset shortly.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich



^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/4] nvme-tcp: per-controller I/O workqueues
  2024-07-04  7:36               ` Hannes Reinecke
@ 2024-07-05  7:10                 ` Christoph Hellwig
  2024-07-05  8:11                   ` Hannes Reinecke
  0 siblings, 1 reply; 37+ messages in thread
From: Christoph Hellwig @ 2024-07-05  7:10 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Tejun Heo, Sagi Grimberg, Hannes Reinecke, Christoph Hellwig,
	Keith Busch, linux-nvme

Btw, I don't think brd is what we should optimize for.  brd does
synchronous I/O from ->submit_bio which makes it very non-typical.
Trying to get this as good as possible for QD=1 might be fine,
but once we have deeper queue depth and/or bigger I/O size it will
use a lot more time in the submission context (aka the workqueues here)
than a real device.



^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/4] nvme-tcp: per-controller I/O workqueues
  2024-07-05  7:10                 ` Christoph Hellwig
@ 2024-07-05  8:11                   ` Hannes Reinecke
  2024-07-05  8:16                     ` Jens Axboe
  0 siblings, 1 reply; 37+ messages in thread
From: Hannes Reinecke @ 2024-07-05  8:11 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Tejun Heo, Sagi Grimberg, Hannes Reinecke, Keith Busch,
	linux-nvme

On 7/5/24 09:10, Christoph Hellwig wrote:
> Btw, I don't think brd is what we should optimize for.  brd does
> synchronous I/O from ->submit_bio which makes it very non-typical.
> Trying to get this as good as possible for QD=1 might be fine,
> but once we have deeper queue depth and/or bigger I/O size it will
> use a lot more time in the submission context (aka the workqueues here)
> than a real device.
> 
Hmm. brd was the simplest choice to get a high-bandwidth target.
I'll check if I get similar performance with null_blk.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich



^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/4] nvme-tcp: per-controller I/O workqueues
  2024-07-05  8:11                   ` Hannes Reinecke
@ 2024-07-05  8:16                     ` Jens Axboe
  0 siblings, 0 replies; 37+ messages in thread
From: Jens Axboe @ 2024-07-05  8:16 UTC (permalink / raw)
  To: Hannes Reinecke, Christoph Hellwig
  Cc: Tejun Heo, Sagi Grimberg, Hannes Reinecke, Keith Busch,
	linux-nvme

On 7/5/24 2:11 AM, Hannes Reinecke wrote:
> On 7/5/24 09:10, Christoph Hellwig wrote:
>> Btw, I don't think brd is what we should optimize for.  brd does
>> synchronous I/O from ->submit_bio which makes it very non-typical.
>> Trying to get this as good as possible for QD=1 might be fine,
>> but once we have deeper queue depth and/or bigger I/O size it will
>> use a lot more time in the submission context (aka the workqueues here)
>> than a real device.

Agree, using brd as the backend is useless if you want to optimize
for the real world, and may be actively misleading.

> Hmm. brd was the simplest choice to get a high-bandwidth target.
> I'll check if a get a similar performance with null-blk.

Just use a normal flash drive? Even basic drives these days do
millions of iops and 7-8GB/sec.

-- 
Jens Axboe




^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 4/4] nvme-tcp: switch to 'cpu' affinity scope for unbound workqueues
  2024-07-04 15:54             ` Hannes Reinecke
@ 2024-07-05 11:48               ` Sagi Grimberg
  0 siblings, 0 replies; 37+ messages in thread
From: Sagi Grimberg @ 2024-07-05 11:48 UTC (permalink / raw)
  To: Hannes Reinecke, Hannes Reinecke
  Cc: Christoph Hellwig, Keith Busch, linux-nvme


>> I tend to think that the io timeouts are caused by a bug, not by "non 
>> optimized" code. io timeouts are eternity for this test, which makes me
>> think we have a different issue here.
>
> I did some latency measurements for the send and receive loop, and 
> found that we are in fact starved by the receive side. The sending 
> side is pretty well limited by the 'deadline' setting, but the 
> receiving side has no such precaution, and I have seen per-queue 
> receive latencies of over 5 milliseconds.
> The worrying thing here was that only individual queues have been 
> affected; most queues had the expected latency of around 50usecs, but
> some really went over the top with 1000s of usecs. And these were the
> queues which were generating I/O timeouts.
>
> I have now modified the deadline method to cover both receive and 
> sending side, and the results were pretty good; timeouts are gone and
> even the overall performance for the 4 subsystem case has gone up.
>
> Will be posting an updated patchset shortly.

That is good, it gives us some confirmation. I'll wait to see your
patch (assuming that you added a count limit to the desc passed to
read_sock?).

btw, the count doesn't need to be byte-granular; it can also store
jiffies and check the time. However, I'd prefer to keep it as a byte
count so that we can leverage it to do one call directly in
nvme_tcp_data_ready().


^ permalink raw reply	[flat|nested] 37+ messages in thread

end of thread, other threads:[~2024-07-05 11:49 UTC | newest]

Thread overview: 37+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-07-03 13:50 [PATCH 0/4] nvme-tcp: improve scalability Hannes Reinecke
2024-07-03 13:50 ` [PATCH 1/4] nvme-tcp: per-controller I/O workqueues Hannes Reinecke
2024-07-03 14:11   ` Sagi Grimberg
2024-07-03 14:46     ` Hannes Reinecke
2024-07-03 15:16       ` Sagi Grimberg
2024-07-03 17:07         ` Tejun Heo
2024-07-03 19:14           ` Sagi Grimberg
2024-07-03 19:17             ` Tejun Heo
2024-07-03 19:41               ` Sagi Grimberg
2024-07-04  7:36               ` Hannes Reinecke
2024-07-05  7:10                 ` Christoph Hellwig
2024-07-05  8:11                   ` Hannes Reinecke
2024-07-05  8:16                     ` Jens Axboe
2024-07-04  5:36   ` Christoph Hellwig
2024-07-03 13:50 ` [PATCH 2/4] nvme-tcp: align I/O cpu with blk-mq mapping Hannes Reinecke
2024-07-03 14:19   ` Sagi Grimberg
2024-07-03 14:53     ` Hannes Reinecke
2024-07-03 15:03       ` Sagi Grimberg
2024-07-03 15:40         ` Hannes Reinecke
2024-07-03 19:38           ` Sagi Grimberg
2024-07-03 19:47             ` Sagi Grimberg
2024-07-04  6:43             ` Hannes Reinecke
2024-07-04  9:07               ` Sagi Grimberg
2024-07-04 14:03                 ` Hannes Reinecke
2024-07-04  5:37     ` Christoph Hellwig
2024-07-04  9:13       ` Sagi Grimberg
2024-07-03 13:50 ` [PATCH 3/4] workqueue: introduce helper workqueue_unbound_affinity_scope() Hannes Reinecke
2024-07-03 17:31   ` Tejun Heo
2024-07-04  6:04     ` Hannes Reinecke
2024-07-03 13:50 ` [PATCH 4/4] nvme-tcp: switch to 'cpu' affinity scope for unbound workqueues Hannes Reinecke
2024-07-03 14:22   ` Sagi Grimberg
2024-07-03 15:01     ` Hannes Reinecke
2024-07-03 15:09       ` Sagi Grimberg
2024-07-03 15:50         ` Hannes Reinecke
2024-07-04  9:11           ` Sagi Grimberg
2024-07-04 15:54             ` Hannes Reinecke
2024-07-05 11:48               ` Sagi Grimberg

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox