[PATCH 0/2] nvmet: support polling task for RDMA and TCP

Linux-NVME Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 0/2] nvmet: support polling task for RDMA and TCP
@ 2024-06-26  8:28 Ping Gan
  2024-06-26  8:28 ` [PATCH 1/2] nvmet-rdma: add polling cq task for nvmet-rdma Ping Gan
                   ` (2 more replies)
  0 siblings, 3 replies; 17+ messages in thread
From: Ping Gan @ 2024-06-26  8:28 UTC (permalink / raw)
  To: hch, sagi, kch, linux-nvme, linux-kernel; +Cc: ping.gan, Ping Gan

When running nvmf on an SMP platform, the current nvme target's RDMA and
TCP transports use kworkers to handle IO. But if there is other heavy
workload in the system (e.g. on kubernetes), the competition between the
kworkers and the other workload is very intense. And since the kworkers
are scheduled by the OS randomly, it is difficult to control OS resources
and to tune performance. If the target supports using dedicated polling
tasks to handle IO, it becomes easier to control OS resources and to
achieve good performance. So it makes sense to add polling tasks to the
nvmet-rdma and nvmet-tcp modules.

Ping Gan (2):
  nvmet-rdma: add polling cq task for nvmet-rdma
  nvmet-tcp: add polling task for nvmet-tcp

 drivers/nvme/target/rdma.c | 331 +++++++++++++++++++++++++++++++++-
 drivers/nvme/target/tcp.c  | 356 +++++++++++++++++++++++++++++++++++--
 2 files changed, 665 insertions(+), 22 deletions(-)

-- 
2.26.2

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH 1/2] nvmet-rdma: add polling cq task for nvmet-rdma
  2024-06-26  8:28 [PATCH 0/2] nvmet: support polling task for RDMA and TCP Ping Gan
@ 2024-06-26  8:28 ` Ping Gan
  2024-06-26  8:28 ` [PATCH 2/2] nvmet-tcp: add polling task for nvmet-tcp Ping Gan
  2024-06-30  8:58 ` [PATCH 0/2] nvmet: support polling task for RDMA and TCP Sagi Grimberg
  2 siblings, 0 replies; 17+ messages in thread
From: Ping Gan @ 2024-06-26  8:28 UTC (permalink / raw)
  To: hch, sagi, kch, linux-nvme, linux-kernel; +Cc: ping.gan, Ping Gan

Add dedicated CQ polling tasks, as an alternative to kworkers, for the
nvmet-rdma module. Three module parameters are introduced:
 pt_num defines the number of CQ polling tasks.
 pt_affinity_core defines the first cpu core used by the polling tasks.
 idle_poll_period_usecs defines how long a task polls before going idle.

Signed-off-by: Ping Gan <jacky_gam_2001@163.com>
---
 drivers/nvme/target/rdma.c | 331 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 326 insertions(+), 5 deletions(-)

diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 1eff8ca6a5f1..83c03e088bf9 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -26,6 +26,28 @@
 #include <linux/nvme-rdma.h>
 #include "nvmet.h"
 
+/* Define a time period (in usecs) that poll thread shall sample an activated
+ * queue before determining it to be idle.
+ */
+static int idle_poll_period_usecs;
+module_param(idle_poll_period_usecs, int, 0644);
+MODULE_PARM_DESC(idle_poll_period_usecs,
+		"nvmet rdma cq thread poll till idle time period in usecs");
+
+/* Define the target rdma cq polling thread's affinity cpu core.
+ */
+static int pt_affinity_core = -2;
+module_param(pt_affinity_core, int, 0644);
+MODULE_PARM_DESC(pt_affinity_core,
+	    "target rdma cq polling thread's affinity core, -1 for all online cpus");
+
+/* Define the polling thread number.
+ */
+static int pt_num;
+module_param(pt_num, int, 0644);
+MODULE_PARM_DESC(pt_num, "target rdma cq polling thread number");
+bool rdma_polling_cq_task;
+
 /*
  * We allow at least 1 page, up to 4 SGEs, and up to 16KB of inline data
  */
@@ -39,6 +61,23 @@
 
 #define NVMET_RDMA_BACKLOG 128
 
+struct nvmet_rdma_pt_data {
+	struct wait_queue_head	wait_head;
+	struct mutex		queue_lock;
+	struct list_head	pt_admin_queue_list;
+	struct list_head	pt_io_queue_list;
+	u32		thread_idle;
+	int		affinity_cpu;
+	pid_t			task_pid;
+	pid_t			task_tgid;
+	atomic64_t		admin_queue_cnt;
+	atomic64_t		io_queue_cnt;
+	struct task_struct *thread;
+	struct mutex	   thread_lock;
+};
+
+struct nvmet_rdma_pt_data **rdma_pt_data;
+
 struct nvmet_rdma_srq;
 
 struct nvmet_rdma_cmd {
@@ -114,6 +153,10 @@ struct nvmet_rdma_queue {
 	int			send_queue_size;
 
 	struct list_head	queue_list;
+	//for cq poll thread
+	struct nvmet_rdma_pt_data *pt_data;
+	struct list_head	pt_list_entry;
+	atomic64_t		req_cnt;
 };
 
 struct nvmet_rdma_port {
@@ -176,6 +219,59 @@ static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
 
 static const struct nvmet_fabrics_ops nvmet_rdma_ops;
 
+static void nvmet_rdma_wakeup_poll_thread(struct nvmet_rdma_queue *queue)
+{
+	smp_mb();
+	if (queue->pt_data && waitqueue_active(&queue->pt_data->wait_head))
+		wake_up(&queue->pt_data->wait_head);
+}
+
+static void nvmet_rdma_ib_cq_handler(struct ib_cq *cq, void *private)
+{
+	struct nvmet_rdma_queue *queue = (struct nvmet_rdma_queue *)cq->cq_context;
+	atomic64_set(&queue->req_cnt, 1);
+	nvmet_rdma_wakeup_poll_thread(queue);
+}
+
+static int nvmet_rdma_get_pcq_task(bool io_queue)
+{
+	int i = 1, ret = 0;
+	s64 min, tmp;
+	struct nvmet_rdma_pt_data *tptd;
+
+	tptd = rdma_pt_data[0];
+	if (io_queue)
+		min = atomic64_read(&tptd->io_queue_cnt);
+	else
+		min = atomic64_read(&tptd->admin_queue_cnt);
+	while (i < pt_num) {
+		tptd = rdma_pt_data[i];
+		if (io_queue)
+			tmp = atomic64_read(&tptd->io_queue_cnt);
+		else
+			tmp = atomic64_read(&tptd->admin_queue_cnt);
+		if (min > tmp) {
+			min = tmp;
+			ret = i;
+		}
+		i++;
+	}
+	tptd = rdma_pt_data[ret];
+	if (io_queue)
+		atomic64_inc(&tptd->io_queue_cnt);
+	else
+		atomic64_inc(&tptd->admin_queue_cnt);
+	return ret;
+}
+
+static inline void nvmet_rdma_pq_clear_req(struct nvmet_rdma_queue *queue)
+{
+	struct nvmet_rdma_pt_data *tptd = queue->pt_data;
+	mutex_lock(&tptd->queue_lock);
+	list_del(&queue->pt_list_entry);
+	mutex_unlock(&tptd->queue_lock);
+}
+
 static int srq_size_set(const char *val, const struct kernel_param *kp)
 {
 	int n = 0, ret;
@@ -507,6 +603,10 @@ static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev,
 
 	if (unlikely(ret))
 		pr_err("post_recv cmd failed\n");
+	else if (rdma_polling_cq_task) {
+		atomic64_set(&cmd->queue->req_cnt, 1);
+		nvmet_rdma_wakeup_poll_thread(cmd->queue);
+	}
 
 	return ret;
 }
@@ -740,6 +840,9 @@ static void nvmet_rdma_queue_response(struct nvmet_req *req)
 	if (unlikely(ib_post_send(cm_id->qp, first_wr, NULL))) {
 		pr_err("sending cmd response failed\n");
 		nvmet_rdma_release_rsp(rsp);
+	} else if (rdma_polling_cq_task) {
+		atomic64_set(&rsp->queue->req_cnt, 1);
+		nvmet_rdma_wakeup_poll_thread(rsp->queue);
 	}
 }
 
@@ -816,6 +919,9 @@ static void nvmet_rdma_write_data_done(struct ib_cq *cq, struct ib_wc *wc)
 	if (unlikely(ib_post_send(cm_id->qp, &rsp->send_wr, NULL))) {
 		pr_err("sending cmd response failed\n");
 		nvmet_rdma_release_rsp(rsp);
+	} else if (rdma_polling_cq_task) {
+		atomic64_set(&rsp->queue->req_cnt, 1);
+		nvmet_rdma_wakeup_poll_thread(rsp->queue);
 	}
 }
 
@@ -957,6 +1063,10 @@ static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp)
 		if (rdma_rw_ctx_post(&rsp->rw, queue->qp,
 				queue->cm_id->port_num, &rsp->read_cqe, NULL))
 			nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR);
+		if (rdma_polling_cq_task) {
+			atomic64_set(&queue->req_cnt, 1);
+			nvmet_rdma_wakeup_poll_thread(queue);
+		}
 	} else {
 		rsp->req.execute(&rsp->req);
 	}
@@ -1259,8 +1369,16 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
 	 */
 	nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size;
 
-	queue->cq = ib_cq_pool_get(ndev->device, nr_cqe + 1,
-				   queue->comp_vector, IB_POLL_WORKQUEUE);
+	if (rdma_polling_cq_task) {
+		queue->cq = ib_alloc_cq(ndev->device, queue, nr_cqe + 1,
+						queue->comp_vector, IB_POLL_DIRECT);
+		queue->cq->comp_handler = nvmet_rdma_ib_cq_handler;
+		ib_req_notify_cq(queue->cq, IB_CQ_NEXT_COMP);
+	} else {
+		queue->cq = ib_cq_pool_get(ndev->device, nr_cqe + 1,
+						queue->comp_vector, IB_POLL_WORKQUEUE);
+	}
+
 	if (IS_ERR(queue->cq)) {
 		ret = PTR_ERR(queue->cq);
 		pr_err("failed to create CQ cqe= %d ret= %d\n",
@@ -1331,8 +1449,11 @@ static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
 	if (queue->cm_id)
 		rdma_destroy_id(queue->cm_id);
 	ib_destroy_qp(queue->qp);
-	ib_cq_pool_put(queue->cq, queue->recv_queue_size + 2 *
-		       queue->send_queue_size + 1);
+	if (rdma_polling_cq_task)
+		ib_free_cq(queue->cq);
+	else
+		ib_cq_pool_put(queue->cq, queue->recv_queue_size + 2 *
+			       queue->send_queue_size + 1);
 }
 
 static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
@@ -1340,6 +1461,13 @@ static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
 	pr_debug("freeing queue %d\n", queue->idx);
 
 	nvmet_sq_destroy(&queue->nvme_sq);
+	if (rdma_polling_cq_task) {
+		nvmet_rdma_pq_clear_req(queue);
+		if (queue->host_qid > 0)
+			atomic64_dec(&queue->pt_data->io_queue_cnt);
+		else
+			atomic64_dec(&queue->pt_data->admin_queue_cnt);
+	}
 
 	nvmet_rdma_destroy_queue_ib(queue);
 	if (!queue->nsrq) {
@@ -1600,6 +1728,19 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
 		goto free_queue;
 	}
 
+	if (rdma_polling_cq_task) {
+		bool io_queue = queue->host_qid > 0?1:0;
+		ret = nvmet_rdma_get_pcq_task(io_queue);
+		queue->pt_data = rdma_pt_data[ret];
+		mutex_lock(&queue->pt_data->queue_lock);
+		if (io_queue)
+			list_add_tail(&queue->pt_list_entry, &queue->pt_data->pt_io_queue_list);
+		else
+			list_add_tail(&queue->pt_list_entry, &queue->pt_data->pt_admin_queue_list);
+		mutex_unlock(&queue->pt_data->queue_lock);
+		nvmet_rdma_wakeup_poll_thread(queue);
+	}
+
 	mutex_lock(&nvmet_rdma_queue_mutex);
 	list_add_tail(&queue->queue_list, &nvmet_rdma_queue_list);
 	mutex_unlock(&nvmet_rdma_queue_mutex);
@@ -2082,9 +2223,156 @@ static struct ib_client nvmet_rdma_ib_client = {
 	.remove = nvmet_rdma_remove_one
 };
 
+#define RDMA_POLL_BUDGET   8
+static int __nvmet_rdma_poll_thread(struct nvmet_rdma_pt_data *rptd)
+{
+	int rcv_ret = 0;
+	bool need_repoll = false;
+	struct nvmet_rdma_queue *qreq, *tmp;
+
+	mutex_lock(&rptd->queue_lock);
+	if (!list_empty(&rptd->pt_admin_queue_list)) {
+		list_for_each_entry_safe(qreq, tmp, &rptd->pt_admin_queue_list, pt_list_entry) {
+			if (atomic64_read(&qreq->req_cnt) > 0) {
+				rcv_ret = ib_process_cq_direct(qreq->cq, RDMA_POLL_BUDGET);
+				if (rcv_ret > 0)
+					need_repoll = true;
+				else {
+					atomic64_set(&qreq->req_cnt, 0);
+					ib_req_notify_cq(qreq->cq, IB_CQ_NEXT_COMP);
+				}
+			}
+		}
+	}
+	if (!list_empty(&rptd->pt_io_queue_list)) {
+		list_for_each_entry_safe(qreq, tmp, &rptd->pt_io_queue_list, pt_list_entry) {
+			if (atomic64_read(&qreq->req_cnt) > 0) {
+				rcv_ret = ib_process_cq_direct(qreq->cq, RDMA_POLL_BUDGET);
+				if (rcv_ret > 0)
+					need_repoll = true;
+				else {
+					atomic64_set(&qreq->req_cnt, 0);
+					ib_req_notify_cq(qreq->cq, IB_CQ_NEXT_COMP);
+				}
+			}
+		}
+	}
+	mutex_unlock(&rptd->queue_lock);
+	if (need_repoll)
+		return 1;
+	else
+		return 0;
+}
+
+static int nvmet_rdma_poll_thread(void *data)
+{
+	struct nvmet_rdma_pt_data *rptd = data;
+	unsigned long timeout = 0;
+	DEFINE_WAIT(wait);
+
+	if (rptd->affinity_cpu != -1)
+		set_cpus_allowed_ptr(current, cpumask_of(rptd->affinity_cpu));
+	else
+		set_cpus_allowed_ptr(current, cpu_online_mask);
+	current->flags |= PF_NO_SETAFFINITY;
+	mutex_lock(&rptd->thread_lock);
+	rptd->task_pid = current->pid;
+	rptd->task_tgid = current->tgid;
+
+	while (!kthread_should_stop()) {
+		int ret = __nvmet_rdma_poll_thread(rptd);
+		if (ret > 0 || !time_after(jiffies, timeout)) {
+			cond_resched();
+			if (ret > 0)
+				timeout = jiffies + rptd->thread_idle;
+			continue;
+		}
+		prepare_to_wait(&rptd->wait_head, &wait, TASK_INTERRUPTIBLE);
+		mutex_unlock(&rptd->thread_lock);
+		schedule();
+		mutex_lock(&rptd->thread_lock);
+		finish_wait(&rptd->wait_head, &wait);
+		timeout = jiffies + rptd->thread_idle;
+	}
+	rptd->thread = NULL;
+	rptd->task_pid = -1;
+	rptd->task_tgid = -1;
+	mutex_unlock(&rptd->thread_lock);
+	kthread_complete_and_exit(NULL, 0);
+	//do_exit(0);
+}
+
 static int __init nvmet_rdma_init(void)
 {
-	int ret;
+	int ret, i;
+	char task_name[TASK_COMM_LEN];
+	struct task_struct *task;
+
+	rdma_polling_cq_task = false;
+	if ((pt_affinity_core >= -1 && pt_affinity_core < (int)nr_cpu_ids)
+		|| pt_num > 0 || idle_poll_period_usecs > 0) {
+		if (pt_num == 0)
+			pt_num = 1;
+		else if (pt_num < 0) {
+			printk(KERN_ERR "bad parameter for task num\n");
+			ret =  -EINVAL;
+			return ret;
+		}
+		if (pt_affinity_core == -2)
+			pt_affinity_core = -1;
+		if (pt_affinity_core < -1 ||
+			pt_affinity_core >= (int)nr_cpu_ids) {
+			printk(KERN_ERR "bad parameter for affinity core \n");
+			ret =  -EINVAL;
+			return ret;
+		}
+		if (idle_poll_period_usecs == 0)
+			idle_poll_period_usecs = 1000; // default 1ms
+		else if (idle_poll_period_usecs < 0) {
+			printk(KERN_ERR "bad parameter for idle poll period\n");
+			ret =  -EINVAL;
+			return ret;
+		}
+		rdma_pt_data = kmalloc(pt_num * sizeof(void *), GFP_KERNEL);
+		if (!rdma_pt_data)
+			return -ENOMEM;
+
+		for (i = 0; i < pt_num; i++) {
+			rdma_pt_data[i] = kmalloc(sizeof(struct nvmet_rdma_pt_data), GFP_KERNEL);
+			if (!rdma_pt_data[i]) {
+				ret = -ENOMEM;
+				goto err_free_pqtd;
+			}
+		}
+		for (i = 0; i < pt_num; i++) {
+			mutex_init(&rdma_pt_data[i]->thread_lock);
+			rdma_pt_data[i]->thread_idle = usecs_to_jiffies(idle_poll_period_usecs);
+			mutex_init(&rdma_pt_data[i]->queue_lock);
+			INIT_LIST_HEAD(&rdma_pt_data[i]->pt_admin_queue_list);
+			INIT_LIST_HEAD(&rdma_pt_data[i]->pt_io_queue_list);
+			init_waitqueue_head(&rdma_pt_data[i]->wait_head);
+			atomic64_set(&rdma_pt_data[i]->admin_queue_cnt, 0);
+			atomic64_set(&rdma_pt_data[i]->io_queue_cnt, 0);
+			if (pt_affinity_core != -1)
+				rdma_pt_data[i]->affinity_cpu = (pt_affinity_core + (int)i) %
+								((int) nr_cpu_ids);
+			else
+				rdma_pt_data[i]->affinity_cpu = -1;
+			snprintf(task_name, TASK_COMM_LEN, "nvmet-rdma-pt%u", i);
+			task = kthread_create(nvmet_rdma_poll_thread, (void *)rdma_pt_data[i], task_name);
+			if (IS_ERR(task)) {
+				ret = PTR_ERR(task);
+				goto err_free_pt_data;
+			}
+			set_user_nice(task, -20);
+			mutex_lock(&rdma_pt_data[i]->thread_lock);
+			rdma_pt_data[i]->thread = task;
+			mutex_unlock(&rdma_pt_data[i]->thread_lock);
+		}
+		rdma_polling_cq_task = true;
+		for (i = 0; i <  pt_num; i++)
+			wake_up_process(rdma_pt_data[i]->thread);
+	}
 
 	ret = ib_register_client(&nvmet_rdma_ib_client);
 	if (ret)
@@ -2098,15 +2386,48 @@ static int __init nvmet_rdma_init(void)
 
 err_ib_client:
 	ib_unregister_client(&nvmet_rdma_ib_client);
+err_free_pt_data:
+	if ((pt_affinity_core >= -1 && pt_affinity_core < (int)nr_cpu_ids)
+		|| pt_num > 0 || idle_poll_period_usecs > 0) {
+		while (i > 0) {
+			kthread_stop(rdma_pt_data[i-1]->thread);
+			i--;
+		}
+		i = pt_num;
+err_free_pqtd:
+		while (i > 0) {
+			kfree(rdma_pt_data[i-1]);
+			i--;
+		}
+		kfree(rdma_pt_data);
+	}
 	return ret;
 }
 
 static void __exit nvmet_rdma_exit(void)
 {
+	int i = 0;
+
+	if (rdma_polling_cq_task) {
+		for (i = 0; i < pt_num; i++) {
+			mutex_lock(&rdma_pt_data[i]->thread_lock);
+			if (rdma_pt_data[i]->thread) {
+				mutex_unlock(&rdma_pt_data[i]->thread_lock);
+				kthread_stop(rdma_pt_data[i]->thread);
+			} else  {
+				mutex_unlock(&rdma_pt_data[i]->thread_lock);
+			}
+		}
+	}
 	nvmet_unregister_transport(&nvmet_rdma_ops);
 	ib_unregister_client(&nvmet_rdma_ib_client);
 	WARN_ON_ONCE(!list_empty(&nvmet_rdma_queue_list));
 	ida_destroy(&nvmet_rdma_queue_ida);
+	if (rdma_polling_cq_task) {
+		for (i = 0; i < pt_num; i++)
+			kfree(rdma_pt_data[i]);
+		kfree(rdma_pt_data);
+	}
 }
 
 module_init(nvmet_rdma_init);
-- 
2.26.2



^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 2/2] nvmet-tcp: add polling task for nvmet-tcp
  2024-06-26  8:28 [PATCH 0/2] nvmet: support polling task for RDMA and TCP Ping Gan
  2024-06-26  8:28 ` [PATCH 1/2] nvmet-rdma: add polling cq task for nvmet-rdma Ping Gan
@ 2024-06-26  8:28 ` Ping Gan
  2024-06-30  8:58 ` [PATCH 0/2] nvmet: support polling task for RDMA and TCP Sagi Grimberg
  2 siblings, 0 replies; 17+ messages in thread
From: Ping Gan @ 2024-06-26  8:28 UTC (permalink / raw)
  To: hch, sagi, kch, linux-nvme, linux-kernel; +Cc: ping.gan, Ping Gan

Add dedicated polling tasks, as an alternative to kworkers, to handle
TCP IO for the nvmet-tcp module. Three module parameters control them:
  pt_num defines the number of polling tasks.
  pt_affinity_core defines the first cpu core used by the polling tasks.
  idle_poll_period_usecs defines how long a task polls before going idle.

Signed-off-by: Ping Gan <jacky_gam_2001@163.com>
---
 drivers/nvme/target/tcp.c | 356 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 339 insertions(+), 17 deletions(-)

diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 5bff0d5464d1..aa6d90f8d11c 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -73,6 +73,20 @@ device_param_cb(idle_poll_period_usecs, &set_param_ops,
 MODULE_PARM_DESC(idle_poll_period_usecs,
 		"nvmet tcp io_work poll till idle time period in usecs: Default 0");
 
+/* Define the target tcp polling thread's affinity cpu core.
+ */
+static int pt_affinity_core = -2;
+module_param(pt_affinity_core, int, 0644);
+MODULE_PARM_DESC(pt_affinity_core,
+	    "target tcp polling thread's affinity core, -1 for all online cpus");
+
+/* Define the polling thread number.
+ */
+static int pt_num;
+module_param(pt_num, int, 0644);
+MODULE_PARM_DESC(pt_num, "target tcp polling thread number");
+static bool tcp_polling_task;
+
 #ifdef CONFIG_NVME_TARGET_TCP_TLS
 /*
  * TLS handshake timeout
@@ -106,6 +120,25 @@ enum {
 	NVMET_TCP_F_INIT_FAILED = (1 << 0),
 };
 
+struct nvmet_tcp_pt_data {
+	struct wait_queue_head	wait_head;
+	struct mutex		queue_lock;
+	struct list_head	pt_queue_list;
+	struct list_head	pt_io_queue_list;
+	struct list_head	addon_queue_list;
+	struct mutex		addon_queue_lock;
+	u32		thread_idle;
+	int		affinity_cpu;
+	pid_t			task_pid;
+	pid_t			task_tgid;
+	atomic64_t		queue_cnt;
+	atomic64_t		io_queue_cnt;
+	struct task_struct *thread;
+	struct mutex	   thread_lock;
+};
+
+struct nvmet_tcp_pt_data **tcp_pt_data;
+
 struct nvmet_tcp_cmd {
 	struct nvmet_tcp_queue		*queue;
 	struct nvmet_req		req;
@@ -150,6 +183,9 @@ struct nvmet_tcp_queue {
 	struct socket		*sock;
 	struct nvmet_tcp_port	*port;
 	struct work_struct	io_work;
+	struct nvmet_tcp_pt_data *pt_data;
+	struct list_head	pt_list_entry;
+	atomic64_t		req_cnt;
 	struct nvmet_cq		nvme_cq;
 	struct nvmet_sq		nvme_sq;
 	struct kref		kref;
@@ -218,6 +254,46 @@ static const struct nvmet_fabrics_ops nvmet_tcp_ops;
 static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c);
 static void nvmet_tcp_free_cmd_buffers(struct nvmet_tcp_cmd *cmd);
 
+static void nvmet_tcp_wakeup_poll_thread(struct nvmet_tcp_queue *queue)
+{
+	smp_mb();
+	if (queue->pt_data && waitqueue_active(&queue->pt_data->wait_head))
+		wake_up(&queue->pt_data->wait_head);
+}
+
+static int nvmet_tcp_get_polling_task(void)
+{
+	int i = 1, ret = 0;
+	s64 min, tmp, totalq_min, totalq_tmp;
+	struct nvmet_tcp_pt_data *tptd;
+
+	tptd = tcp_pt_data[0];
+	min = atomic64_read(&tptd->io_queue_cnt);
+	totalq_min = atomic64_read(&tptd->queue_cnt);
+	while (i < pt_num) {
+		tptd = tcp_pt_data[i];
+		tmp = atomic64_read(&tptd->io_queue_cnt);
+		totalq_tmp = atomic64_read(&tptd->queue_cnt);
+		if (min > tmp || (min == tmp && totalq_min > totalq_tmp)) {
+			min = tmp;
+			totalq_min = totalq_tmp;
+			ret = i;
+		}
+		i++;
+	}
+	tptd = tcp_pt_data[ret];
+	atomic64_inc(&tptd->queue_cnt);
+	return ret;
+}
+
+static inline void nvmet_tcp_pq_clear_req(struct nvmet_tcp_queue *queue)
+{
+	struct nvmet_tcp_pt_data *tptd = queue->pt_data;
+	mutex_lock(&tptd->queue_lock);
+	list_del(&queue->pt_list_entry);
+	mutex_unlock(&tptd->queue_lock);
+}
+
 static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue,
 		struct nvmet_tcp_cmd *cmd)
 {
@@ -590,7 +666,12 @@ static void nvmet_tcp_queue_response(struct nvmet_req *req)
 	}
 
 	llist_add(&cmd->lentry, &queue->resp_list);
-	queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &cmd->queue->io_work);
+	if (tcp_polling_task) {
+		atomic64_set(&queue->req_cnt, 1);
+		nvmet_tcp_wakeup_poll_thread(queue);
+	} else {
+		queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &cmd->queue->io_work);
+	}
 }
 
 static void nvmet_tcp_execute_request(struct nvmet_tcp_cmd *cmd)
@@ -1598,13 +1679,21 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w)
 
 	nvmet_tcp_restore_socket_callbacks(queue);
 	cancel_delayed_work_sync(&queue->tls_handshake_tmo_work);
-	cancel_work_sync(&queue->io_work);
+	if (!tcp_polling_task)
+		cancel_work_sync(&queue->io_work);
+	else {
+		nvmet_tcp_pq_clear_req(queue);
+		if (queue->nvme_sq.qid != 0)
+			atomic64_dec(&queue->pt_data->io_queue_cnt);
+		atomic64_dec(&queue->pt_data->queue_cnt);
+	}
 	/* stop accepting incoming data */
 	queue->rcv_state = NVMET_TCP_RECV_ERR;
 
 	nvmet_tcp_uninit_data_in_cmds(queue);
 	nvmet_sq_destroy(&queue->nvme_sq);
-	cancel_work_sync(&queue->io_work);
+	if (!tcp_polling_task)
+		cancel_work_sync(&queue->io_work);
 	nvmet_tcp_free_cmd_data_in_buffers(queue);
 	/* ->sock will be released by fput() */
 	fput(queue->sock->file);
@@ -1627,9 +1716,15 @@ static void nvmet_tcp_data_ready(struct sock *sk)
 	if (likely(queue)) {
 		if (queue->data_ready)
 			queue->data_ready(sk);
-		if (queue->state != NVMET_TCP_Q_TLS_HANDSHAKE)
-			queue_work_on(queue_cpu(queue), nvmet_tcp_wq,
-				      &queue->io_work);
+		if (queue->state != NVMET_TCP_Q_TLS_HANDSHAKE) {
+			if (tcp_polling_task) {
+				atomic64_set(&queue->req_cnt, 1);
+				nvmet_tcp_wakeup_poll_thread(queue);
+			} else {
+				queue_work_on(queue_cpu(queue), nvmet_tcp_wq,
+					      &queue->io_work);
+			}
+		}
 	}
 	read_unlock_bh(&sk->sk_callback_lock);
 }
@@ -1650,7 +1745,12 @@ static void nvmet_tcp_write_space(struct sock *sk)
 
 	if (sk_stream_is_writeable(sk)) {
 		clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-		queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
+		if (tcp_polling_task) {
+			atomic64_set(&queue->req_cnt, 1);
+			nvmet_tcp_wakeup_poll_thread(queue);
+		} else {
+			queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
+		}
 	}
 out:
 	read_unlock_bh(&sk->sk_callback_lock);
@@ -1731,7 +1831,19 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
 		sock->sk->sk_write_space = nvmet_tcp_write_space;
 		if (idle_poll_period_usecs)
 			nvmet_tcp_arm_queue_deadline(queue);
-		queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
+		if (tcp_polling_task) {
+			int task_index = nvmet_tcp_get_polling_task();
+			queue->pt_data = tcp_pt_data[task_index];
+			write_unlock_bh(&sock->sk->sk_callback_lock);
+			mutex_lock(&queue->pt_data->addon_queue_lock);
+			list_add_tail(&queue->pt_list_entry, &queue->pt_data->addon_queue_list);
+			mutex_unlock(&queue->pt_data->addon_queue_lock);
+			write_lock_bh(&sock->sk->sk_callback_lock);
+			atomic64_set(&queue->req_cnt, 1);
+			nvmet_tcp_wakeup_poll_thread(queue);
+		} else {
+			queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
+		}
 	}
 	write_unlock_bh(&sock->sk->sk_callback_lock);
 
@@ -1883,7 +1995,8 @@ static void nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
 	}
 
 	INIT_WORK(&queue->release_work, nvmet_tcp_release_queue_work);
-	INIT_WORK(&queue->io_work, nvmet_tcp_io_work);
+	if (!tcp_polling_task)
+		INIT_WORK(&queue->io_work, nvmet_tcp_io_work);
 	kref_init(&queue->kref);
 	queue->sock = newsock;
 	queue->port = port;
@@ -2146,6 +2259,15 @@ static u16 nvmet_tcp_install_queue(struct nvmet_sq *sq)
 	}
 
 	queue->nr_cmds = sq->size * 2;
+	if (tcp_polling_task) {
+		if (queue->state == NVMET_TCP_Q_DISCONNECTING)
+			return 0; // if release worker schedule, directly return
+		if (sq->qid != 0) {
+			atomic64_inc(&queue->pt_data->io_queue_cnt);
+			list_del(&queue->pt_list_entry);
+			list_add_tail(&queue->pt_list_entry, &queue->pt_data->pt_io_queue_list);
+		}
+	}
 	if (nvmet_tcp_alloc_cmds(queue))
 		return NVME_SC_INTERNAL;
 	return 0;
@@ -2193,14 +2315,181 @@ static const struct nvmet_fabrics_ops nvmet_tcp_ops = {
 	.host_traddr		= nvmet_tcp_host_port_addr,
 };
 
-static int __init nvmet_tcp_init(void)
+static int __nvmet_tcp_poll_thread(struct nvmet_tcp_pt_data *tptd)
+{
+	int rcv_ret = 0, snd_ret = 0, ops = 0;
+	bool need_repoll = false;
+	struct nvmet_tcp_queue *qreq, *tmp;
+
+	mutex_lock(&tptd->addon_queue_lock);
+	mutex_lock(&tptd->queue_lock);
+	list_splice_tail_init(&tptd->addon_queue_list, &tptd->pt_queue_list);
+	mutex_unlock(&tptd->queue_lock);
+	mutex_unlock(&tptd->addon_queue_lock);
+
+	mutex_lock(&tptd->queue_lock);
+	if (!list_empty(&tptd->pt_queue_list)) {
+		list_for_each_entry_safe(qreq, tmp, &tptd->pt_queue_list, pt_list_entry) {
+			if (atomic64_read(&qreq->req_cnt) > 0) {
+				rcv_ret = nvmet_tcp_try_recv(qreq, NVMET_TCP_RECV_BUDGET, &ops);
+				if (rcv_ret < 0) {
+					atomic64_set(&qreq->req_cnt, 0);
+					continue;
+				}
+				if (rcv_ret > 0)
+					need_repoll = true;
+				snd_ret = nvmet_tcp_try_send(qreq, NVMET_TCP_SEND_BUDGET, &ops);
+				if (snd_ret < 0) {
+					atomic64_set(&qreq->req_cnt, 0);
+					continue;
+				}
+				if (snd_ret > 0)
+					need_repoll = true;
+				else if (rcv_ret == 0)
+					atomic64_set(&qreq->req_cnt, 0);
+			}
+		}
+	}
+	if (!list_empty(&tptd->pt_io_queue_list)) {
+		list_for_each_entry_safe(qreq, tmp, &tptd->pt_io_queue_list, pt_list_entry) {
+			if (atomic64_read(&qreq->req_cnt) > 0) {
+				rcv_ret = nvmet_tcp_try_recv(qreq, NVMET_TCP_RECV_BUDGET, &ops);
+				if (rcv_ret < 0) {
+					atomic64_set(&qreq->req_cnt, 0);
+					continue;
+				}
+				if (rcv_ret > 0)
+					need_repoll = true;
+				snd_ret = nvmet_tcp_try_send(qreq, NVMET_TCP_SEND_BUDGET, &ops);
+				if (snd_ret < 0) {
+					atomic64_set(&qreq->req_cnt, 0);
+					continue;
+				}
+				if (snd_ret > 0)
+					need_repoll = true;
+				else if (rcv_ret == 0)
+					atomic64_set(&qreq->req_cnt, 0);
+			}
+		}
+	}
+	mutex_unlock(&tptd->queue_lock);
+	if (need_repoll)
+		return 1;
+	else
+		return 0;
+}
+
+static int nvmet_tcp_poll_thread(void *data)
 {
-	int ret;
+	struct nvmet_tcp_pt_data *tptd = data;
+	unsigned long timeout = 0;
+	DEFINE_WAIT(wait);
 
-	nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq",
-				WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
-	if (!nvmet_tcp_wq)
-		return -ENOMEM;
+	if (tptd->affinity_cpu != -1)
+		set_cpus_allowed_ptr(current, cpumask_of(tptd->affinity_cpu));
+	else
+		set_cpus_allowed_ptr(current, cpu_online_mask);
+	current->flags |= PF_NO_SETAFFINITY;
+	mutex_lock(&tptd->thread_lock);
+	tptd->task_pid = current->pid;
+	tptd->task_tgid = current->tgid;
+
+	while (!kthread_should_stop()) {
+		int ret = __nvmet_tcp_poll_thread(tptd);
+		if (ret > 0 || !time_after(jiffies, timeout)) {
+			cond_resched();
+			if (ret > 0)
+				timeout = jiffies + tptd->thread_idle;
+			continue;
+		}
+		prepare_to_wait(&tptd->wait_head, &wait, TASK_INTERRUPTIBLE);
+		mutex_unlock(&tptd->thread_lock);
+		schedule();
+		mutex_lock(&tptd->thread_lock);
+		finish_wait(&tptd->wait_head, &wait);
+		timeout = jiffies + tptd->thread_idle;
+	}
+	tptd->thread = NULL;
+	tptd->task_pid = -1;
+	tptd->task_tgid = -1;
+	mutex_unlock(&tptd->thread_lock);
+	kthread_complete_and_exit(NULL, 0);
+	//do_exit(0);
+}
+
+static int __init nvmet_tcp_init(void)
+{
+	int ret, i = 0;
+	char task_name[TASK_COMM_LEN];
+	struct task_struct *task;
+
+	tcp_polling_task = false;
+	if ((pt_affinity_core >= -1 &&
+		pt_affinity_core < (int)nr_cpu_ids) || pt_num > 0) {
+		if (pt_num == 0)
+			pt_num = 1;
+		else if (pt_num < 0) {
+			printk(KERN_ERR "bad parameter for task num\n");
+			ret =  -EINVAL;
+			return ret;
+		}
+		if (pt_affinity_core == -2)
+			pt_affinity_core = -1;
+		if (pt_affinity_core < -1 ||
+			pt_affinity_core >= (int)nr_cpu_ids) {
+			printk(KERN_ERR "bad parameter for affinity core \n");
+			ret =  -EINVAL;
+			return ret;
+		}
+		if (!(idle_poll_period_usecs > 0))
+			idle_poll_period_usecs = 1000; // default 1ms
+		tcp_pt_data = kmalloc(pt_num * sizeof(void *), GFP_KERNEL);
+		if (!tcp_pt_data)
+			return -ENOMEM;
+
+		for (i = 0; i < pt_num; i++) {
+			tcp_pt_data[i] = kmalloc(sizeof(struct nvmet_tcp_pt_data), GFP_KERNEL);
+			if (!tcp_pt_data[i]) {
+				ret = -ENOMEM;
+				goto err_free_pqtd;
+			}
+		}
+		for (i = 0; i < pt_num; i++) {
+			mutex_init(&tcp_pt_data[i]->thread_lock);
+			tcp_pt_data[i]->thread_idle = usecs_to_jiffies(idle_poll_period_usecs);
+			mutex_init(&tcp_pt_data[i]->queue_lock);
+			mutex_init(&tcp_pt_data[i]->addon_queue_lock);
+			INIT_LIST_HEAD(&tcp_pt_data[i]->pt_queue_list);
+			INIT_LIST_HEAD(&tcp_pt_data[i]->pt_io_queue_list);
+			INIT_LIST_HEAD(&tcp_pt_data[i]->addon_queue_list);
+			init_waitqueue_head(&tcp_pt_data[i]->wait_head);
+			atomic64_set(&tcp_pt_data[i]->queue_cnt, 0);
+			atomic64_set(&tcp_pt_data[i]->io_queue_cnt, 0);
+			if (pt_affinity_core != -1)
+				tcp_pt_data[i]->affinity_cpu = (pt_affinity_core + (int)i) %
+								((int) nr_cpu_ids);
+			else
+				tcp_pt_data[i]->affinity_cpu = -1;
+			snprintf(task_name, TASK_COMM_LEN, "nvmet-tcp-pt%u", i);
+			task = kthread_create(nvmet_tcp_poll_thread, (void *)tcp_pt_data[i], task_name);
+			if (IS_ERR(task)) {
+				ret = PTR_ERR(task);
+				goto err;
+			}
+			set_user_nice(task, -20);
+			mutex_lock(&tcp_pt_data[i]->thread_lock);
+			tcp_pt_data[i]->thread = task;
+			mutex_unlock(&tcp_pt_data[i]->thread_lock);
+		}
+		tcp_polling_task = true;
+		for (i = 0; i <  pt_num; i++)
+			wake_up_process(tcp_pt_data[i]->thread);
+	} else {
+		nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq",
+					WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
+		if (!nvmet_tcp_wq)
+			return -ENOMEM;
+	}
 
 	ret = nvmet_register_transport(&nvmet_tcp_ops);
 	if (ret)
@@ -2208,15 +2497,42 @@ static int __init nvmet_tcp_init(void)
 
 	return 0;
 err:
-	destroy_workqueue(nvmet_tcp_wq);
+	if ((pt_affinity_core >= -1 &&
+		pt_affinity_core < (int)nr_cpu_ids) || pt_num > 0) {
+		while (i > 0) {
+			kthread_stop(tcp_pt_data[i-1]->thread);
+			i--;
+		}
+		i = pt_num;
+err_free_pqtd:
+		while (i > 0) {
+			kfree(tcp_pt_data[i-1]);
+			i--;
+		}
+		kfree(tcp_pt_data);
+	} else {
+		destroy_workqueue(nvmet_tcp_wq);
+	}
 	return ret;
 }
 
 static void __exit nvmet_tcp_exit(void)
 {
 	struct nvmet_tcp_queue *queue;
+	int i = 0;
 
 	nvmet_unregister_transport(&nvmet_tcp_ops);
+	if (tcp_polling_task) {
+		for (i = 0; i < pt_num; i++) {
+			mutex_lock(&tcp_pt_data[i]->thread_lock);
+			if (tcp_pt_data[i]->thread) {
+				mutex_unlock(&tcp_pt_data[i]->thread_lock);
+				kthread_stop(tcp_pt_data[i]->thread);
+			} else  {
+				mutex_unlock(&tcp_pt_data[i]->thread_lock);
+			}
+		}
+	}
 
 	flush_workqueue(nvmet_wq);
 	mutex_lock(&nvmet_tcp_queue_mutex);
@@ -2225,7 +2541,13 @@ static void __exit nvmet_tcp_exit(void)
 	mutex_unlock(&nvmet_tcp_queue_mutex);
 	flush_workqueue(nvmet_wq);
 
-	destroy_workqueue(nvmet_tcp_wq);
+	if (tcp_polling_task) {
+		for (i = 0; i < pt_num; i++)
+			kfree(tcp_pt_data[i]);
+		kfree(tcp_pt_data);
+	} else {
+		destroy_workqueue(nvmet_tcp_wq);
+	}
 	ida_destroy(&nvmet_tcp_queue_ida);
 }
 
-- 
2.26.2



^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/2] nvmet: support polling task for RDMA and TCP
  2024-06-26  8:28 [PATCH 0/2] nvmet: support polling task for RDMA and TCP Ping Gan
  2024-06-26  8:28 ` [PATCH 1/2] nvmet-rdma: add polling cq task for nvmet-rdma Ping Gan
  2024-06-26  8:28 ` [PATCH 2/2] nvmet-tcp: add polling task for nvmet-tcp Ping Gan
@ 2024-06-30  8:58 ` Sagi Grimberg
  2024-07-01  7:42   ` Ping Gan
  2 siblings, 1 reply; 17+ messages in thread
From: Sagi Grimberg @ 2024-06-30  8:58 UTC (permalink / raw)
  To: Ping Gan, hch, kch, linux-nvme, linux-kernel; +Cc: ping.gan

Hey Ping Gan,

On 26/06/2024 11:28, Ping Gan wrote:
> When running nvmf on SMP platform, current nvme target's RDMA and
> TCP use kworker to handle IO. But if there is other high workload
> in the system(eg: on kubernetes), the competition between the
> kworker and other workload is very radical. And since the kworker
> is scheduled by OS randomly, it's difficult to control OS resource
> and also tune the performance. If target support to use delicated
> polling task to handle IO, it's useful to control OS resource and
> gain good performance. So it makes sense to add polling task in
> rdma-rdma and rdma-tcp modules.

This is NOT the way to go here.

Both rdma and tcp are driven from workqueue context, which are bound 
workqueues.

So there are two ways to go here:
1. Add a generic port cpuset and use that to direct traffic to the
appropriate set of cores (i.e. select an appropriate comp_vector for
rdma and add an appropriate steering rule for tcp).
2. Add options to rdma/tcp to use UNBOUND workqueues, and allow users to
control the cpumask of these UNBOUND workqueues via sysfs.

(2) will not control interrupts to steer to other workloads cpus, but 
the handlers may
run on a set of dedicated cpus.

(1) is a better solution, but harder to implement.

You also should look into nvmet-fc as well (and nvmet-loop for that matter).

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/2] nvmet: support polling task for RDMA and TCP
  2024-06-30  8:58 ` [PATCH 0/2] nvmet: support polling task for RDMA and TCP Sagi Grimberg
@ 2024-07-01  7:42   ` Ping Gan
  2024-07-01  7:42     ` Ping Gan
  2024-07-01  8:22     ` Sagi Grimberg
  0 siblings, 2 replies; 17+ messages in thread
From: Ping Gan @ 2024-07-01  7:42 UTC (permalink / raw)
  To: sagi, hch, kch, linux-nvme, linux-kernel; +Cc: ping.gan

>Hey Ping Gan,
>
>
>On 26/06/2024 11:28, Ping Gan wrote:
>> When running nvmf on SMP platform, current nvme target's RDMA and
>> TCP use kworker to handle IO. But if there is other high workload
>> in the system(eg: on kubernetes), the competition between the
>> kworker and other workload is very radical. And since the kworker
>> is scheduled by OS randomly, it's difficult to control OS resource
>> and also tune the performance. If target support to use delicated
>> polling task to handle IO, it's useful to control OS resource and
>> gain good performance. So it makes sense to add polling task in
>> rdma-rdma and rdma-tcp modules.
>
>This is NOT the way to go here.
>
>Both rdma and tcp are driven from workqueue context, which are bound 
>workqueues.
>
>So there are two ways to go here:
>1. Add generic port cpuset and use that to direct traffic to the 
>appropriate set of cores
>(i.e. select an appropriate comp_vector for rdma and add an appropriate 
>steering rule
>for tcp).
>2. Add options to rdma/tcp to use UNBOUND workqueues, and allow users
>to 
>control
>these UNBOUND workqueues cpumask via sysfs.
>
>(2) will not control interrupts to steer to other workloads cpus, but 
>the handlers may
>run on a set of dedicated cpus.
>
>(1) is a better solution, but harder to implement.
>
>You also should look into nvmet-fc as well (and nvmet-loop for that
>matter).

Hi Sagi Grimberg,
Thanks for your reply. We actually tried the first approach you
suggested, but we found the performance was poor when using spdk
as the initiator. This patch is meant to resolve not only the OS
resource competition issue but also the perf issue. Our analysis shows
that if the target still uses a workqueue (kworker) while the initiator
is a polling driver (e.g. spdk), the workqueue/kworker target becomes
the bottleneck, since every nvmf request may incur a wait latency
between being queued on the workqueue and the start of processing. This
latency can be traced with the wqlat tool from bcc
(https://github.com/iovisor/bcc/blob/master/tools/wqlat.py).
We think this latency is a disaster for a polling-driver data plane,
right? So we think adding a polling task mode on the nvmet side to
handle IO really does make sense; what's your opinion about this? You
also mentioned that we should look into nvmet-fc, and I agree.
However, we currently have no nvmf-fc testbed; if we get one, we
will do that.

Thanks,
Ping

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH 0/2] nvmet: support polling task for RDMA and TCP
  2024-07-01  7:42   ` Ping Gan
@ 2024-07-01  7:42     ` Ping Gan
  2024-07-01  8:22     ` Sagi Grimberg
  1 sibling, 0 replies; 17+ messages in thread
From: Ping Gan @ 2024-07-01  7:42 UTC (permalink / raw)
  To: sagi, hch, kch, linux-nvme, linux-kernel; +Cc: ping.gan, Ping Gan

When running nvmf on an SMP platform, the current nvme target's RDMA
and TCP transports use kworkers to handle IO. But if there is another
heavy workload in the system (e.g. on kubernetes), the competition
between the kworkers and that workload is very intense. And since the
kworkers are scheduled by the OS randomly, it is difficult to control
OS resources and to tune the performance. If the target supports using
dedicated polling tasks to handle IO, it becomes easier to control OS
resources and to gain good performance. So it makes sense to add
polling tasks to the nvmet-rdma and nvmet-tcp modules.

Ping Gan (2):
  nvmet-rdma: add polling cq task for nvmet-rdma
  nvmet-tcp: add polling task for nvmet-tcp

 drivers/nvme/target/rdma.c | 331 +++++++++++++++++++++++++++++++++-
 drivers/nvme/target/tcp.c  | 356 +++++++++++++++++++++++++++++++++++--
 2 files changed, 665 insertions(+), 22 deletions(-)

-- 
2.26.2

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/2] nvmet: support polling task for RDMA and TCP
  2024-07-01  7:42   ` Ping Gan
  2024-07-01  7:42     ` Ping Gan
@ 2024-07-01  8:22     ` Sagi Grimberg
  2024-07-02 10:02       ` Ping Gan
  1 sibling, 1 reply; 17+ messages in thread
From: Sagi Grimberg @ 2024-07-01  8:22 UTC (permalink / raw)
  To: hch, kch, linux-nvme, linux-kernel; +Cc: ping.gan



On 01/07/2024 10:42, Ping Gan wrote:
>> Hey Ping Gan,
>>
>>
>> On 26/06/2024 11:28, Ping Gan wrote:
>>> When running nvmf on SMP platform, current nvme target's RDMA and
>>> TCP use kworker to handle IO. But if there is other high workload
>>> in the system(eg: on kubernetes), the competition between the
>>> kworker and other workload is very radical. And since the kworker
>>> is scheduled by OS randomly, it's difficult to control OS resource
>>> and also tune the performance. If target support to use delicated
>>> polling task to handle IO, it's useful to control OS resource and
>>> gain good performance. So it makes sense to add polling task in
>>> rdma-rdma and rdma-tcp modules.
>> This is NOT the way to go here.
>>
>> Both rdma and tcp are driven from workqueue context, which are bound
>> workqueues.
>>
>> So there are two ways to go here:
>> 1. Add generic port cpuset and use that to direct traffic to the
>> appropriate set of cores
>> (i.e. select an appropriate comp_vector for rdma and add an appropriate
>> steering rule
>> for tcp).
>> 2. Add options to rdma/tcp to use UNBOUND workqueues, and allow users
>> to
>> control
>> these UNBOUND workqueues cpumask via sysfs.
>>
>> (2) will not control interrupts to steer to other workloads cpus, but
>> the handlers may
>> run on a set of dedicated cpus.
>>
>> (1) is a better solution, but harder to implement.
>>
>> You also should look into nvmet-fc as well (and nvmet-loop for that
>> matter).
> hi Sagi Grimberg,
> Thanks for your reply, actually we had tried the first advice you
> suggested, but we found the performance was poor when using spdk
> as initiator.

I suggest that you focus on that instead of what you proposed.
What is the source of your poor performance?

>   You know this patch is not only resolving OS resource
> competition issue, but also the perf issue. We have analyzed if we
> still use workqueue(kworker) as target when initiator is polling
> driver(eg: spdk), then workqueue/kworker target is the bottleneck
> since every nvmf request may have a wait latency from queuing on
> workqueue to begin processing,

That is incorrect, the work context polls the cq until it either drains it
completely, or exhausts a quota of IB_POLL_BUDGET_WORKQUEUE (or
NVMET_TCP_IO_WORK_BUDGET). Not every command gets its own workqueue
queuing delay.

And, what does the spdk initiator have to do with it? Didn't understand...

>   and the latency can be traced by wqlat
> of bcc (https://github.com/iovisor/bcc/blob/master/tools/wqlat.py).
> We think the latency is a disaster for the polling driver data plane,
> right?

If you need a target that polls all the time, you should probably resort
to spdk.
If there is room for optimization in nvmet we'll gladly take it, but
this is not the way to go IMO.

> So we think adding a polling task mode on nvmet side to handle
> IO does really make sense; what's your opinion about this?

I personally think that adding a polling kthread is questionable.
However there is a precedent, io_uring sqthreads. So please look
into what is done there. I don't mind having something like
IB_POLL_IOTASK (or io_task threads in nvmet-tcp) if it's done correctly
(leverages common code).

>   And you
> mentioned we should also look into nvmet-fc, I agree with you.
> However currently we have no nvmf-fc's testbed; if we get the testbed,
> will do that.

There is fcloop, you should use that to test, same for loop. We don't want
the transports to diverge in functionality.


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/2] nvmet: support polling task for RDMA and TCP
  2024-07-01  8:22     ` Sagi Grimberg
@ 2024-07-02 10:02       ` Ping Gan
  2024-07-02 10:02         ` Ping Gan
  2024-07-03 19:58         ` Sagi Grimberg
  0 siblings, 2 replies; 17+ messages in thread
From: Ping Gan @ 2024-07-02 10:02 UTC (permalink / raw)
  To: sagi, hch, kch, linux-nvme, linux-kernel; +Cc: ping.gan

>On 01/07/2024 10:42, Ping Gan wrote:
>>> Hey Ping Gan,
>>>
>>>
>>> On 26/06/2024 11:28, Ping Gan wrote:
>>>> When running nvmf on SMP platform, current nvme target's RDMA and
>>>> TCP use kworker to handle IO. But if there is other high workload
>>>> in the system(eg: on kubernetes), the competition between the
>>>> kworker and other workload is very radical. And since the kworker
>>>> is scheduled by OS randomly, it's difficult to control OS resource
>>>> and also tune the performance. If target support to use delicated
>>>> polling task to handle IO, it's useful to control OS resource and
>>>> gain good performance. So it makes sense to add polling task in
>>>> rdma-rdma and rdma-tcp modules.
>>> This is NOT the way to go here.
>>>
>>> Both rdma and tcp are driven from workqueue context, which are bound
>>> workqueues.
>>>
>>> So there are two ways to go here:
>>> 1. Add generic port cpuset and use that to direct traffic to the
>>> appropriate set of cores
>>> (i.e. select an appropriate comp_vector for rdma and add an
>>> appropriate
>>> steering rule
>>> for tcp).
>>> 2. Add options to rdma/tcp to use UNBOUND workqueues, and allow
>>> users
>>> to
>>> control
>>> these UNBOUND workqueues cpumask via sysfs.
>>>
>>> (2) will not control interrupts to steer to other workloads cpus,
>>> but
>>> the handlers may
>>> run on a set of dedicated cpus.
>>>
>>> (1) is a better solution, but harder to implement.
>>>
>>> You also should look into nvmet-fc as well (and nvmet-loop for that
>>> matter).
>> hi Sagi Grimberg,
>> Thanks for your reply, actually we had tried the first advice you
>> suggested, but we found the performance was poor when using spdk
>> as initiator.
>
>I suggest that you focus on that instead of what you proposed.
>What is the source of your poor performance?
Before these patches, we used Linux's RPS to steer the packets to a
fixed cpu set for nvmet-tcp. But even then we could not eliminate the
competition between softirq and the workqueue, since the nvme target's
kworker is bound to the socket's cpu, which is taken from the skb.
Besides that, we found the workqueue's wait latency was very high even
when we enabled polling on nvmet-tcp via the module parameter
idle_poll_period_usecs. So when the initiator is in polling mode, the
workqueue-based target is the bottleneck. Below is the work wait latency
trace log from a test on our cluster (each node has 4 NUMA nodes, 96
cores, 192G memory, one dual-port Mellanox CX4LX (25Gbps x 2) ethernet
adapter; randrw with 1M IO size), with RPS steering to 6 cpu cores.
The system's CPU and memory utilization was about 80%.
ogden-brown:~ #/usr/share/bcc/tools/wqlat -T -w nvmet_tcp_wq 1 2
01:06:59
     usecs               : count     distribution
         0 -> 1          : 0        |                              |
         2 -> 3          : 0        |                              |
         4 -> 7          : 0        |                              |
         8 -> 15         : 3        |                              |
        16 -> 31         : 10       |                              |
        32 -> 63         : 3        |                              |
        64 -> 127        : 2        |                              |
       128 -> 255        : 0        |                              |
       256 -> 511        : 5        |                              |
       512 -> 1023       : 12       |                              |
      1024 -> 2047       : 26       |*                             |
      2048 -> 4095       : 34       |*                             |
      4096 -> 8191       : 350      |************                  |
      8192 -> 16383      : 625      |******************************|
     16384 -> 32767      : 244      |*********                     |
     32768 -> 65535      : 39       |*                             |

01:07:00
     usecs               : count     distribution
         0 -> 1          : 1        |                              |
         2 -> 3          : 0        |                              |
         4 -> 7          : 4        |                              |
         8 -> 15         : 3        |                              |
        16 -> 31         : 8        |                              |
        32 -> 63         : 10       |                              |
        64 -> 127        : 3        |                              |
       128 -> 255        : 6        |                              |
       256 -> 511        : 8        |                              |
       512 -> 1023       : 20       |*                             |
      1024 -> 2047       : 19       |*                             |
      2048 -> 4095       : 57       |**                            |
      4096 -> 8191       : 325      |****************              |
      8192 -> 16383      : 647      |******************************|
     16384 -> 32767      : 228      |***********                   |
     32768 -> 65535      : 43       |**                            |
     65536 -> 131071     : 1        |                              |

And the bandwidth of a node is only 3100MB. When we used the patch
and enabled 6 polling tasks, the bandwidth reached 4000MB. It's a good
improvement.

>>   You know this patch is not only resolving OS resource
>> competition issue, but also the perf issue. We have analyzed if we
>> still use workqueue(kworker) as target when initiator is polling
>> driver(eg: spdk), then workqueue/kworker target is the bottleneck
>> since every nvmf request may have a wait latency from queuing on
>> workqueue to begin processing,
>
>That is incorrect, the work context polls the cq until it either drains
>it
>completely, or exhaust a quota of IB_POLL_BUDGET_WORKQUEUE (or
>NVMET_TCP_IO_WORK_BUDGET). Not every command gets its own workqueue
>queuing delay.
>
>And, what does the spdk initiator has to do with it? Didn't
>understand...
Yes, the target workqueue implementation polls up to a quota; but when
the workload was high we found that many work items waited too long
(some of them from several ms up to a hundred ms, as shown in the
histogram above). We use the spdk initiator (in polling mode) to send
read/write IO to the nvme disks of a remote node in a kubernetes
cluster.

>>   and the latency can be traced by wqlat
>> of bcc (https://github.com/iovisor/bcc/blob/master/tools/wqlat.py).
>> We think the latency is a disaster for the polling driver data plane,
>> right?
>
>If you need a target that polls all the time, you should probably
>resort 
>to spdk.
>If there is room for optimization in nvmet we'll gladly take it, but 
>this is not the
>way to go IMO.
Yes, in the beginning we did use spdk as the polling target driver,
but we found that the spdk target could not support disk hot plug/unplug
well; sometimes it caused data loss during disk hot plug/unplug.
So we switched to the kernel target driver, because in production the
customer's data security is the first priority. And since the kernel
target has no polling mode driver, we implemented these patches.

>> So we think adding a polling task mode on nvmet side to handle
>> IO does really make sense; what's your opinion about this?
>
>I personally think that adding a polling kthread is questionable.
>However there is a precedent, io_uring sqthreads. So please look
>into what is done there. I don't mind having something like 
>IB_POLL_IOTASK (or
>io_task threads in nvmet-tcp) if its done correctly (leverages common
>code).
Yes, we have studied io_uring's code before implementing the patches.
Actually we followed io_uring's design idea in these patches.

>>   And you
>> mentioned we should also look into nvmet-fc, I agree with you.
>> However currently we have no nvmf-fc's testbed; if we get the
>> testbed,
>> will do that.
>
>There is fcloop, you should use that to test, same for loop. We don't
>want
>the transports to diverge in functionality.
Ok, I will try, would you please give me some configuration guide for
fcloop since I never used fcloop before.

Thanks,
Ping




^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH 0/2] nvmet: support polling task for RDMA and TCP
  2024-07-02 10:02       ` Ping Gan
@ 2024-07-02 10:02         ` Ping Gan
  2024-07-03 19:58         ` Sagi Grimberg
  1 sibling, 0 replies; 17+ messages in thread
From: Ping Gan @ 2024-07-02 10:02 UTC (permalink / raw)
  To: sagi, hch, kch, linux-nvme, linux-kernel; +Cc: ping.gan, Ping Gan

When running nvmf on an SMP platform, the current nvme target's RDMA
and TCP transports use kworkers to handle IO. But if there is another
heavy workload in the system (e.g. on kubernetes), the competition
between the kworkers and that workload is very intense. And since the
kworkers are scheduled by the OS randomly, it is difficult to control
OS resources and to tune the performance. If the target supports using
dedicated polling tasks to handle IO, it becomes easier to control OS
resources and to gain good performance. So it makes sense to add
polling tasks to the nvmet-rdma and nvmet-tcp modules.

Ping Gan (2):
  nvmet-rdma: add polling cq task for nvmet-rdma
  nvmet-tcp: add polling task for nvmet-tcp

 drivers/nvme/target/rdma.c | 331 +++++++++++++++++++++++++++++++++-
 drivers/nvme/target/tcp.c  | 356 +++++++++++++++++++++++++++++++++++--
 2 files changed, 665 insertions(+), 22 deletions(-)

-- 
2.26.2

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/2] nvmet: support polling task for RDMA and TCP
  2024-07-02 10:02       ` Ping Gan
  2024-07-02 10:02         ` Ping Gan
@ 2024-07-03 19:58         ` Sagi Grimberg
  2024-07-04  8:10           ` Ping Gan
  1 sibling, 1 reply; 17+ messages in thread
From: Sagi Grimberg @ 2024-07-03 19:58 UTC (permalink / raw)
  To: hch, kch, linux-nvme, linux-kernel; +Cc: ping.gan



On 02/07/2024 13:02, Ping Gan wrote:
>> On 01/07/2024 10:42, Ping Gan wrote:
>>>> Hey Ping Gan,
>>>>
>>>>
>>>> On 26/06/2024 11:28, Ping Gan wrote:
>>>>> When running nvmf on SMP platform, current nvme target's RDMA and
>>>>> TCP use kworker to handle IO. But if there is other high workload
>>>>> in the system(eg: on kubernetes), the competition between the
>>>>> kworker and other workload is very radical. And since the kworker
>>>>> is scheduled by OS randomly, it's difficult to control OS resource
>>>>> and also tune the performance. If target support to use delicated
>>>>> polling task to handle IO, it's useful to control OS resource and
>>>>> gain good performance. So it makes sense to add polling task in
>>>>> rdma-rdma and rdma-tcp modules.
>>>> This is NOT the way to go here.
>>>>
>>>> Both rdma and tcp are driven from workqueue context, which are bound
>>>> workqueues.
>>>>
>>>> So there are two ways to go here:
>>>> 1. Add generic port cpuset and use that to direct traffic to the
>>>> appropriate set of cores
>>>> (i.e. select an appropriate comp_vector for rdma and add an
>>>> appropriate
>>>> steering rule
>>>> for tcp).
>>>> 2. Add options to rdma/tcp to use UNBOUND workqueues, and allow
>>>> users
>>>> to
>>>> control
>>>> these UNBOUND workqueues cpumask via sysfs.
>>>>
>>>> (2) will not control interrupts to steer to other workloads cpus,
>>>> but
>>>> the handlers may
>>>> run on a set of dedicated cpus.
>>>>
>>>> (1) is a better solution, but harder to implement.
>>>>
>>>> You also should look into nvmet-fc as well (and nvmet-loop for that
>>>> matter).
>>> hi Sagi Grimberg,
>>> Thanks for your reply, actually we had tried the first advice you
>>> suggested, but we found the performance was poor when using spdk
>>> as initiator.
>> I suggest that you focus on that instead of what you proposed.
>> What is the source of your poor performance?
> Before these patches, we had used linux's RPS to forward the packets
> to a fixed cpu set for nvmet-tcp. But when did that we can still not
> cancel the competition between softirq and workqueue since nvme target's
> kworker cpu core bind on socket's cpu which is from skb. Besides that
> we found workqueue's wait latency was very high even we enabled polling
> on nvmet-tcp by module parameter idle_poll_period_usecs. So when
> initiator
> is polling mode, the target of workqueue is the bottleneck. Below is
> work's wait latency trace log of our test on our cluster(per node uses
> 4 numas 96 cores, 192G memory, one dual ports mellanox CX4LX(25Gbps X 2)
> ethernet adapter and randrw 1M IO size) by RPS to 6 cpu cores. And
> system's CPU and memory were used about 80%.

I'd try a simple unbound CPU case: steer packets to, say, cores [0-5] and
assign the cpumask of the unbound workqueue to cores [6-11].

> ogden-brown:~ #/usr/share/bcc/tools/wqlat -T -w nvmet_tcp_wq 1 2
> 01:06:59
>       usecs               : count     distribution
>           0 -> 1          : 0        |                              |
>           2 -> 3          : 0        |                              |
>           4 -> 7          : 0        |                              |
>           8 -> 15         : 3        |                              |
>          16 -> 31         : 10       |                              |
>          32 -> 63         : 3        |                              |
>          64 -> 127        : 2        |                              |
>         128 -> 255        : 0        |                              |
>         256 -> 511        : 5        |                              |
>         512 -> 1023       : 12       |                              |
>        1024 -> 2047       : 26       |*                             |
>        2048 -> 4095       : 34       |*                             |
>        4096 -> 8191       : 350      |************                  |
>        8192 -> 16383      : 625      |******************************|
>       16384 -> 32767      : 244      |*********                     |
>       32768 -> 65535      : 39       |*                             |
>
> 01:07:00
>       usecs               : count     distribution
>           0 -> 1          : 1        |                              |
>           2 -> 3          : 0        |                              |
>           4 -> 7          : 4        |                              |
>           8 -> 15         : 3        |                              |
>          16 -> 31         : 8        |                              |
>          32 -> 63         : 10       |                              |
>          64 -> 127        : 3        |                              |
>         128 -> 255        : 6        |                              |
>         256 -> 511        : 8        |                              |
>         512 -> 1023       : 20       |*                             |
>        1024 -> 2047       : 19       |*                             |
>        2048 -> 4095       : 57       |**                            |
>        4096 -> 8191       : 325      |****************              |
>        8192 -> 16383      : 647      |******************************|
>       16384 -> 32767      : 228      |***********                   |
>       32768 -> 65535      : 43       |**                            |
>       65536 -> 131071     : 1        |                              |
>
> And the bandwidth of a node is only 3100MB. While we used the patch
> and enable 6 polling task, the bandwidth can be 4000MB. It's a good
> improvement.

I think you will see similar performance with unbound workqueue and rps.

>
>>>    You know this patch is not only resolving OS resource
>>> competition issue, but also the perf issue. We have analyzed if we
>>> still use workqueue(kworker) as target when initiator is polling
>>> driver(eg: spdk), then workqueue/kworker target is the bottleneck
>>> since every nvmf request may have a wait latency from queuing on
>>> workqueue to begin processing,
>> That is incorrect, the work context polls the cq until it either drains
>> it
>> completely, or exhaust a quota of IB_POLL_BUDGET_WORKQUEUE (or
>> NVMET_TCP_IO_WORK_BUDGET). Not every command gets its own workqueue
>> queuing delay.
>>
>> And, what does the spdk initiator has to do with it? Didn't
>> understand...
> Yes, target workqueue implementation will poll a quota; but when the
> work
> load was high we found many work will wait too long(some of them at
> several
> ms to hundred ms shown above histogram). We use the spdk initiator(by
> polling mode) to send IO's read/write to nvme disks of a kubernetes
> cluster's remote node.

The initiator is an orthogonal detail here. The same issue exists
regardless of spdk afaiu. Let's ignore it, it's confusing.

>
>>>    and the latency can be traced by wqlat
>>> of bcc (https://github.com/iovisor/bcc/blob/master/tools/wqlat.py).
>>> We think the latency is a disaster for the polling driver data plane,
>>> right?
>> If you need a target that polls all the time, you should probably
>> resort
>> to spdk.
>> If there is room for optimization in nvmet we'll gladly take it, but
>> this is not the
>> way to go IMO.
> Yes, in the begining we did use the spdk as polling target driver,
> but we suffered from spdk target could not support disk hot plug/unplug
> well, sometimes it will cause data loss when did disk hot plug/unplug.
> So we switch to kernel target driver because in production customer's
> data security is first priority. And for kernel's target it has no
> polling mode target driver, so we implemented these patches.

Well, it's a hard sell for upstream nvmet.

>
>>> So we think adding a polling task mode on nvmet side to handle
>>> IO does really make sense; what's your opinion about this?
>> I personally think that adding a polling kthread is questionable.
>> However there is a precedent, io_uring sqthreads. So please look
>> into what is done there. I don't mind having something like
>> IB_POLL_IOTASK (or
>> io_task threads in nvmet-tcp) if its done correctly (leverages common
>> code).
> Yes, we have studied io_uring's code before implementing the patches.
> Actually we followed io_uring's design idea in these patches.

I'm talking about reusing the io_uring sqpoll tasks. Move them to common
code, generalize them to address what you need, and reuse that.
Implementing a half-baked inspired version in nvmet is not going to fly
here. Sorry.


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/2] nvmet: support polling task for RDMA and TCP
  2024-07-03 19:58         ` Sagi Grimberg
@ 2024-07-04  8:10           ` Ping Gan
  2024-07-04  8:40             ` Sagi Grimberg
  2024-07-16 10:36             ` Hannes Reinecke
  0 siblings, 2 replies; 17+ messages in thread
From: Ping Gan @ 2024-07-04  8:10 UTC (permalink / raw)
  To: sagi, hch, kch, linux-nvme, linux-kernel; +Cc: ping.gan

> On 02/07/2024 13:02, Ping Gan wrote:
>>> On 01/07/2024 10:42, Ping Gan wrote:
>>>>> Hey Ping Gan,
>>>>>
>>>>>
>>>>> On 26/06/2024 11:28, Ping Gan wrote:
>>>>>> When running nvmf on SMP platform, current nvme target's RDMA and
>>>>>> TCP use kworker to handle IO. But if there is other high workload
>>>>>> in the system(eg: on kubernetes), the competition between the
>>>>>> kworker and other workload is very radical. And since the kworker
>>>>>> is scheduled by OS randomly, it's difficult to control OS
>>>>>> resource
>>>>>> and also tune the performance. If target support to use delicated
>>>>>> polling task to handle IO, it's useful to control OS resource and
>>>>>> gain good performance. So it makes sense to add polling task in
>>>>>> rdma-rdma and rdma-tcp modules.
>>>>> This is NOT the way to go here.
>>>>>
>>>>> Both rdma and tcp are driven from workqueue context, which are
>>>>> bound
>>>>> workqueues.
>>>>>
>>>>> So there are two ways to go here:
>>>>> 1. Add generic port cpuset and use that to direct traffic to the
>>>>> appropriate set of cores
>>>>> (i.e. select an appropriate comp_vector for rdma and add an
>>>>> appropriate
>>>>> steering rule
>>>>> for tcp).
>>>>> 2. Add options to rdma/tcp to use UNBOUND workqueues, and allow
>>>>> users
>>>>> to
>>>>> control
>>>>> these UNBOUND workqueues cpumask via sysfs.
>>>>>
>>>>> (2) will not control interrupts to steer to other workloads cpus,
>>>>> but
>>>>> the handlers may
>>>>> run on a set of dedicated cpus.
>>>>>
>>>>> (1) is a better solution, but harder to implement.
>>>>>
>>>>> You also should look into nvmet-fc as well (and nvmet-loop for
>>>>> that
>>>>> matter).
>>>> hi Sagi Grimberg,
>>>> Thanks for your reply, actually we had tried the first advice you
>>>> suggested, but we found the performance was poor when using spdk
>>>> as initiator.
>>> I suggest that you focus on that instead of what you proposed.
>>> What is the source of your poor performance?
>> Before these patches, we had used linux's RPS to forward the packets
>> to a fixed cpu set for nvmet-tcp. But when did that we can still not
>> cancel the competition between softirq and workqueue since nvme
>> target's
>> kworker cpu core bind on socket's cpu which is from skb. Besides that
>> we found workqueue's wait latency was very high even we enabled
>> polling
>> on nvmet-tcp by module parameter idle_poll_period_usecs. So when
>> initiator
>> is polling mode, the target of workqueue is the bottleneck. Below is
>> work's wait latency trace log of our test on our cluster(per node
>> uses
>> 4 numas 96 cores, 192G memory, one dual ports mellanox CX4LX(25Gbps X
>> 2)
>> ethernet adapter and randrw 1M IO size) by RPS to 6 cpu cores. And
>> system's CPU and memory were used about 80%.

> I'd try a simple unbound CPU case, steer packets to say cores [0-5]
> and 
> assign
> the cpumask of the unbound workqueue to cores [6-11].

Okay, thanks for your guide.

>> ogden-brown:~ #/usr/share/bcc/tools/wqlat -T -w nvmet_tcp_wq 1 2
>> 01:06:59
>>       usecs               : count     distribution
>>           0 -> 1          : 0        |                              |
>>           2 -> 3          : 0        |                              |
>>           4 -> 7          : 0        |                              |
>>           8 -> 15         : 3        |                              |
>>          16 -> 31         : 10       |                              |
>>          32 -> 63         : 3        |                              |
>>          64 -> 127        : 2        |                              |
>>         128 -> 255        : 0        |                              |
>>         256 -> 511        : 5        |                              |
>>         512 -> 1023       : 12       |                              |
>>        1024 -> 2047       : 26       |*                             |
>>        2048 -> 4095       : 34       |*                             |
>>        4096 -> 8191       : 350      |************                  |
>>        8192 -> 16383      : 625      |******************************|
>>       16384 -> 32767      : 244      |*********                     |
>>       32768 -> 65535      : 39       |*                             |
>>
>> 01:07:00
>>       usecs               : count     distribution
>>           0 -> 1          : 1        |                              |
>>           2 -> 3          : 0        |                              |
>>           4 -> 7          : 4        |                              |
>>           8 -> 15         : 3        |                              |
>>          16 -> 31         : 8        |                              |
>>          32 -> 63         : 10       |                              |
>>          64 -> 127        : 3        |                              |
>>         128 -> 255        : 6        |                              |
>>         256 -> 511        : 8        |                              |
>>         512 -> 1023       : 20       |*                             |
>>        1024 -> 2047       : 19       |*                             |
>>        2048 -> 4095       : 57       |**                            |
>>        4096 -> 8191       : 325      |****************              |
>>        8192 -> 16383      : 647      |******************************|
>>       16384 -> 32767      : 228      |***********                   |
>>       32768 -> 65535      : 43       |**                            |
>>       65536 -> 131071     : 1        |                              |
>>
>> And the bandwidth of a node is only 3100MB. While we used the patch
>> and enable 6 polling task, the bandwidth can be 4000MB. It's a good
>> improvement.
>
> I think you will see similar performance with unbound workqueue and
> rps.

Yes, I modified the nvmet-tcp/nvmet-rdma code to support unbound
workqueues, ran the test under the same conditions as above, and compared
the results of the unbound workqueue and the polling task. The unbound
workqueue performs well: for TCP we got 3850M/node, which is almost equal
to the polling task. For nvmet-rdma we got 5100M/node with the unbound
workqueue versus 5600M with the polling task, so the difference seems
very small. Anyway, your advice is good. Do you think we should submit
the unbound workqueue patches for nvmet-tcp and nvmet-rdma to upstream
nvmet?
BTW I have another question: does upstream nvmet plan to support polling
queues when doing submit_bio in the future?
 
>>
>>>>    You know this patch is not only resolving OS resource
>>>> competition issue, but also the perf issue. We have analyzed if we
>>>> still use workqueue(kworker) as target when initiator is polling
>>>> driver(eg: spdk), then workqueue/kworker target is the bottleneck
>>>> since every nvmf request may have a wait latency from queuing on
>>>> workqueue to begin processing,
>>> That is incorrect, the work context polls the cq until it either
>>> drains
>>> it
>>> completely, or exhaust a quota of IB_POLL_BUDGET_WORKQUEUE (or
>>> NVMET_TCP_IO_WORK_BUDGET). Not every command gets its own workqueue
>>> queuing delay.
>>>
>>> And, what does the spdk initiator has to do with it? Didn't
>>> understand...
>> Yes, target workqueue implementation will poll a quota; but when the
>> work
>> load was high we found many work will wait too long(some of them at
>> several
>> ms to hundred ms shown above histogram). We use the spdk initiator(by
>> polling mode) to send IO's read/write to nvme disks of a kubernetes
>> cluster's remote node.
>
> The initiator is an orthogonal detail here. the same issue exists 
> regardless of
> spdk afaiu. Let's ignore it, its confusing.
>
>>
>>>>    and the latency can be traced by wqlat
>>>> of bcc (https://github.com/iovisor/bcc/blob/master/tools/wqlat.py).
>>>> We think the latency is a disaster for the polling driver data
>>>> plane,
>>>> right?
>>> If you need a target that polls all the time, you should probably
>>> resort
>>> to spdk.
>>> If there is room for optimization in nvmet we'll gladly take it, but
>>> this is not the
>>> way to go IMO.
>> Yes, in the begining we did use the spdk as polling target driver,
>> but we suffered from spdk target could not support disk hot
>> plug/unplug
>> well, sometimes it will cause data loss when did disk hot
>> plug/unplug.
>> So we switch to kernel target driver because in production customer's
>> data security is first priority. And for kernel's target it has no
>> polling mode target driver, so we implemented these patches.
>
> Well, its a hard sell for upstream nvmet.
>>
>>
>>>> So we think adding a polling task mode on nvmet side to handle
>>>> IO does really make sense; what's your opinion about this?
>>> I personally think that adding a polling kthread is questionable.
>>> However there is a precedent, io_uring sqthreads. So please look
>>> into what is done there. I don't mind having something like
>>> IB_POLL_IOTASK (or
>>> io_task threads in nvmet-tcp) if its done correctly (leverages
>>> common
>>> code).
>> Yes, we have studied io_uring's code before implementing the patches.
>> Actually we followed io_uring's design idea in these patches.
>
> I'm talking about reusing what io_uring sqpoll tasks. Move them to
> common
> code, generalizing it to address what you need, and reuse that. 
> Implementing a
> half-baked inspired version in nvmet is not going to fly here. Sorry.
Okay, got it.


Thanks,
Ping




^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/2] nvmet: support polling task for RDMA and TCP
  2024-07-04  8:10           ` Ping Gan
@ 2024-07-04  8:40             ` Sagi Grimberg
  2024-07-04 10:35               ` Ping Gan
  2024-07-16 10:36             ` Hannes Reinecke
  1 sibling, 1 reply; 17+ messages in thread
From: Sagi Grimberg @ 2024-07-04  8:40 UTC (permalink / raw)
  To: Ping Gan, hch, kch, linux-nvme, linux-kernel; +Cc: ping.gan



On 7/4/24 11:10, Ping Gan wrote:
>> On 02/07/2024 13:02, Ping Gan wrote:
>>>> On 01/07/2024 10:42, Ping Gan wrote:
>>>>>> Hey Ping Gan,
>>>>>>
>>>>>>
>>>>>> On 26/06/2024 11:28, Ping Gan wrote:
>>>>>>> When running nvmf on SMP platform, current nvme target's RDMA and
>>>>>>> TCP use kworker to handle IO. But if there is other high workload
>>>>>>> in the system(eg: on kubernetes), the competition between the
>>>>>>> kworker and other workload is very radical. And since the kworker
>>>>>>> is scheduled by OS randomly, it's difficult to control OS
>>>>>>> resource
>>>>>>> and also tune the performance. If target support to use delicated
>>>>>>> polling task to handle IO, it's useful to control OS resource and
>>>>>>> gain good performance. So it makes sense to add polling task in
>>>>>>> rdma-rdma and rdma-tcp modules.
>>>>>> This is NOT the way to go here.
>>>>>>
>>>>>> Both rdma and tcp are driven from workqueue context, which are
>>>>>> bound
>>>>>> workqueues.
>>>>>>
>>>>>> So there are two ways to go here:
>>>>>> 1. Add generic port cpuset and use that to direct traffic to the
>>>>>> appropriate set of cores
>>>>>> (i.e. select an appropriate comp_vector for rdma and add an
>>>>>> appropriate
>>>>>> steering rule
>>>>>> for tcp).
>>>>>> 2. Add options to rdma/tcp to use UNBOUND workqueues, and allow
>>>>>> users
>>>>>> to
>>>>>> control
>>>>>> these UNBOUND workqueues cpumask via sysfs.
>>>>>>
>>>>>> (2) will not control interrupts to steer to other workloads cpus,
>>>>>> but
>>>>>> the handlers may
>>>>>> run on a set of dedicated cpus.
>>>>>>
>>>>>> (1) is a better solution, but harder to implement.
>>>>>>
>>>>>> You also should look into nvmet-fc as well (and nvmet-loop for
>>>>>> that
>>>>>> matter).
>>>>> hi Sagi Grimberg,
>>>>> Thanks for your reply, actually we had tried the first advice you
>>>>> suggested, but we found the performance was poor when using spdk
>>>>> as initiator.
>>>> I suggest that you focus on that instead of what you proposed.
>>>> What is the source of your poor performance?
>>> Before these patches, we had used linux's RPS to forward the packets
>>> to a fixed cpu set for nvmet-tcp. But when did that we can still not
>>> cancel the competition between softirq and workqueue since nvme
>>> target's
>>> kworker cpu core bind on socket's cpu which is from skb. Besides that
>>> we found workqueue's wait latency was very high even we enabled
>>> polling
>>> on nvmet-tcp by module parameter idle_poll_period_usecs. So when
>>> initiator
>>> is polling mode, the target of workqueue is the bottleneck. Below is
>>> work's wait latency trace log of our test on our cluster(per node
>>> uses
>>> 4 numas 96 cores, 192G memory, one dual ports mellanox CX4LX(25Gbps X
>>> 2)
>>> ethernet adapter and randrw 1M IO size) by RPS to 6 cpu cores. And
>>> system's CPU and memory were used about 80%.
>> I'd try a simple unbound CPU case, steer packets to say cores [0-5]
>> and
>> assign
>> the cpumask of the unbound workqueue to cores [6-11].
> Okay, thanks for your guide.
>
>>> ogden-brown:~ #/usr/share/bcc/tools/wqlat -T -w nvmet_tcp_wq 1 2
>>> 01:06:59
>>>        usecs               : count     distribution
>>>            0 -> 1          : 0        |                              |
>>>            2 -> 3          : 0        |                              |
>>>            4 -> 7          : 0        |                              |
>>>            8 -> 15         : 3        |                              |
>>>           16 -> 31         : 10       |                              |
>>>           32 -> 63         : 3        |                              |
>>>           64 -> 127        : 2        |                              |
>>>          128 -> 255        : 0        |                              |
>>>          256 -> 511        : 5        |                              |
>>>          512 -> 1023       : 12       |                              |
>>>         1024 -> 2047       : 26       |*                             |
>>>         2048 -> 4095       : 34       |*                             |
>>>         4096 -> 8191       : 350      |************                  |
>>>         8192 -> 16383      : 625      |******************************|
>>>        16384 -> 32767      : 244      |*********                     |
>>>        32768 -> 65535      : 39       |*                             |
>>>
>>> 01:07:00
>>>        usecs               : count     distribution
>>>            0 -> 1          : 1        |                              |
>>>            2 -> 3          : 0        |                              |
>>>            4 -> 7          : 4        |                              |
>>>            8 -> 15         : 3        |                              |
>>>           16 -> 31         : 8        |                              |
>>>           32 -> 63         : 10       |                              |
>>>           64 -> 127        : 3        |                              |
>>>          128 -> 255        : 6        |                              |
>>>          256 -> 511        : 8        |                              |
>>>          512 -> 1023       : 20       |*                             |
>>>         1024 -> 2047       : 19       |*                             |
>>>         2048 -> 4095       : 57       |**                            |
>>>         4096 -> 8191       : 325      |****************              |
>>>         8192 -> 16383      : 647      |******************************|
>>>        16384 -> 32767      : 228      |***********                   |
>>>        32768 -> 65535      : 43       |**                            |
>>>        65536 -> 131071     : 1        |                              |
>>>
>>> And the bandwidth of a node is only 3100MB. While we used the patch
>>> and enable 6 polling task, the bandwidth can be 4000MB. It's a good
>>> improvement.
>> I think you will see similar performance with unbound workqueue and
>> rps.
> Yes, I remodified the nvmet-tcp/nvmet-rdma code for supporting unbound
> workqueue, and in same prerequisites of above to run test, and compared
> the result of unbound workqueue and polling mode task. And I got a good
> performance for unbound workqueue. For unbound workqueue TCP we got
> 3850M/node, it's almost equal to polling task. And also tested
> nvmet-rdma
> we get 5100M/node for unbound workqueue RDMA versus 5600M for polling
> task,
> seems the diff is very small. Anyway, your advice is good.

I'm a bit surprised that you see ~10% delta here. I would look into what 
is the root-cause of
this difference. If indeed the load is high, the overhead of the 
workqueue mgmt should be
negligible. I'm assuming you used IB_POLL_UNBOUND_WORKQUEUE ?



>   Do you think
> we
> should submit the unbound workqueue patches for nvmet-tcp and nvmet-rdma
> to upstream nvmet?

For nvmet-tcp, I think there is merit to split socket processing from 
napi context. For nvmet-rdma
I think the only difference is if you have multiple CQs assigned with 
the same comp_vector.

How many queues do you have in your test?

> BTW I have another question: Will nvmet of upstream have the plan to
> support
> polling queue when doing submit_bio in future?

No plans that I know of. Don't have a coherent idea of how that would work.


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/2] nvmet: support polling task for RDMA and TCP
  2024-07-04  8:40             ` Sagi Grimberg
@ 2024-07-04 10:35               ` Ping Gan
  2024-07-05  5:59                 ` Sagi Grimberg
  0 siblings, 1 reply; 17+ messages in thread
From: Ping Gan @ 2024-07-04 10:35 UTC (permalink / raw)
  To: sagi, hch, kch, linux-nvme, linux-kernel; +Cc: ping.gan

> On 7/4/24 11:10, Ping Gan wrote:
>>> On 02/07/2024 13:02, Ping Gan wrote:
>>>>> On 01/07/2024 10:42, Ping Gan wrote:
>>>>>>> Hey Ping Gan,
>>>>>>>
>>>>>>>
>>>>>>> On 26/06/2024 11:28, Ping Gan wrote:
>>>>>>>> When running nvmf on SMP platform, current nvme target's RDMA
>>>>>>>> and
>>>>>>>> TCP use kworker to handle IO. But if there is other high
>>>>>>>> workload
>>>>>>>> in the system(eg: on kubernetes), the competition between the
>>>>>>>> kworker and other workload is very radical. And since the
>>>>>>>> kworker
>>>>>>>> is scheduled by OS randomly, it's difficult to control OS
>>>>>>>> resource
>>>>>>>> and also tune the performance. If target support to use
>>>>>>>> delicated
>>>>>>>> polling task to handle IO, it's useful to control OS resource
>>>>>>>> and
>>>>>>>> gain good performance. So it makes sense to add polling task in
>>>>>>>> rdma-rdma and rdma-tcp modules.
>>>>>>> This is NOT the way to go here.
>>>>>>>
>>>>>>> Both rdma and tcp are driven from workqueue context, which are
>>>>>>> bound
>>>>>>> workqueues.
>>>>>>>
>>>>>>> So there are two ways to go here:
>>>>>>> 1. Add generic port cpuset and use that to direct traffic to the
>>>>>>> appropriate set of cores
>>>>>>> (i.e. select an appropriate comp_vector for rdma and add an
>>>>>>> appropriate
>>>>>>> steering rule
>>>>>>> for tcp).
>>>>>>> 2. Add options to rdma/tcp to use UNBOUND workqueues, and allow
>>>>>>> users
>>>>>>> to
>>>>>>> control
>>>>>>> these UNBOUND workqueues cpumask via sysfs.
>>>>>>>
>>>>>>> (2) will not control interrupts to steer to other workloads
>>>>>>> cpus,
>>>>>>> but
>>>>>>> the handlers may
>>>>>>> run on a set of dedicated cpus.
>>>>>>>
>>>>>>> (1) is a better solution, but harder to implement.
>>>>>>>
>>>>>>> You also should look into nvmet-fc as well (and nvmet-loop for
>>>>>>> that
>>>>>>> matter).
>>>>>> hi Sagi Grimberg,
>>>>>> Thanks for your reply, actually we had tried the first advice you
>>>>>> suggested, but we found the performance was poor when using spdk
>>>>>> as initiator.
>>>>> I suggest that you focus on that instead of what you proposed.
>>>>> What is the source of your poor performance?
>>>> Before these patches, we had used linux's RPS to forward the
>>>> packets
>>>> to a fixed cpu set for nvmet-tcp. But when did that we can still
>>>> not
>>>> cancel the competition between softirq and workqueue since nvme
>>>> target's
>>>> kworker cpu core bind on socket's cpu which is from skb. Besides
>>>> that
>>>> we found workqueue's wait latency was very high even we enabled
>>>> polling
>>>> on nvmet-tcp by module parameter idle_poll_period_usecs. So when
>>>> initiator
>>>> is polling mode, the target of workqueue is the bottleneck. Below
>>>> is
>>>> work's wait latency trace log of our test on our cluster(per node
>>>> uses
>>>> 4 numas 96 cores, 192G memory, one dual ports mellanox CX4LX(25Gbps
>>>> X
>>>> 2)
>>>> ethernet adapter and randrw 1M IO size) by RPS to 6 cpu cores. And
>>>> system's CPU and memory were used about 80%.
>>> I'd try a simple unbound CPU case, steer packets to say cores [0-5]
>>> and
>>> assign
>>> the cpumask of the unbound workqueue to cores [6-11].
>> Okay, thanks for your guide.
>>
>>>> ogden-brown:~ #/usr/share/bcc/tools/wqlat -T -w nvmet_tcp_wq 1 2
>>>> 01:06:59
>>>>    usecs               : count     distribution
>>>>        0 -> 1          : 0        |                              |
>>>>        2 -> 3          : 0        |                              |
>>>>        4 -> 7          : 0        |                              |
>>>>        8 -> 15         : 3        |                              |
>>>>       16 -> 31         : 10       |                              |
>>>>       32 -> 63         : 3        |                              |
>>>>       64 -> 127        : 2        |                              |
>>>>      128 -> 255        : 0        |                              |
>>>>      256 -> 511        : 5        |                              |
>>>>      512 -> 1023       : 12       |                              |
>>>>     1024 -> 2047       : 26       |*                             |
>>>>     2048 -> 4095       : 34       |*                             |
>>>>     4096 -> 8191       : 350      |************                  |
>>>>     8192 -> 16383      : 625      |******************************|
>>>>    16384 -> 32767      : 244      |*********                     |
>>>>    32768 -> 65535      : 39       |*                             |
>>>>
>>>> 01:07:00
>>>>    usecs               : count     distribution
>>>>        0 -> 1          : 1        |                              |
>>>>        2 -> 3          : 0        |                              |
>>>>        4 -> 7          : 4        |                              |
>>>>        8 -> 15         : 3        |                              |
>>>>       16 -> 31         : 8        |                              |
>>>>       32 -> 63         : 10       |                              |
>>>>       64 -> 127        : 3        |                              |
>>>>      128 -> 255        : 6        |                              |
>>>>      256 -> 511        : 8        |                              |
>>>>      512 -> 1023       : 20       |*                             |
>>>>     1024 -> 2047       : 19       |*                             |
>>>>     2048 -> 4095       : 57       |**                            |
>>>>     4096 -> 8191       : 325      |****************              |
>>>>     8192 -> 16383      : 647      |******************************|
>>>>    16384 -> 32767      : 228      |***********                   |
>>>>    32768 -> 65535      : 43       |**                            |
>>>>    65536 -> 131071     : 1        |                              |
>>>>
>>>> And the bandwidth of a node is only 3100MB. While we used the patch
>>>> and enable 6 polling task, the bandwidth can be 4000MB. It's a good
>>>> improvement.
>>> I think you will see similar performance with unbound workqueue and
>>> rps.
>> Yes, I remodified the nvmet-tcp/nvmet-rdma code for supporting
>> unbound
>> workqueue, and in same prerequisites of above to run test, and
>> compared
>> the result of unbound workqueue and polling mode task. And I got a
>> good
>> performance for unbound workqueue. For unbound workqueue TCP we got
>> 3850M/node, it's almost equal to polling task. And also tested
>> nvmet-rdma
>> we get 5100M/node for unbound workqueue RDMA versus 5600M for polling
>> task,
>> seems the diff is very small. Anyway, your advice is good.
>
> I'm a bit surprised that you see ~10% delta here. I would look into
> what 
> is the root-cause of
> this difference. If indeed the load is high, the overhead of the 
> workqueue mgmt should be
> negligible. I'm assuming you used IB_POLL_UNBOUND_WORKQUEUE ?

Yes, we used IB_POLL_UNBOUND_WORKQUEUE to create the ib CQ. And I observed
3% CPU usage for the unbound workqueue versus 6% for the polling task.

>>   Do you think
>> we
>> should submit the unbound workqueue patches for nvmet-tcp and
>> nvmet-rdma
>> to upstream nvmet?
>
> For nvmet-tcp, I think there is merit to split socket processing from 
> napi context. For nvmet-rdma
> I think the only difference is if you have multiple CQs assigned with 
> the same comp_vector.
>
> How many queues do you have in your test?

We used 24 IO queues to the nvmet-rdma target. I think this may also be
related to the workqueue's wait latency. We still see wait latencies of
several ms for the RDMA unbound workqueue. You can see the trace log below.

ogden-brown:~ # /usr/share/bcc/tools/wqlat -T -w ib-comp-unb-wq 1 3
Tracing work queue request latency time... Hit Ctrl-C to end.

10:09:10
     usecs               : count     distribution
         0 -> 1          : 6        |                              |
         2 -> 3          : 105      |**                            |
         4 -> 7          : 1732     |******************************|
         8 -> 15         : 1597     |******************************|
        16 -> 31         : 526      |************                  |
        32 -> 63         : 543      |************                  |
        64 -> 127        : 950      |*********************         |
       128 -> 255        : 1335     |***************************** |
       256 -> 511        : 1534     |******************************|
       512 -> 1023       : 1039     |***********************       |
      1024 -> 2047       : 592      |*************                 |
      2048 -> 4095       : 112      |**                            |
      4096 -> 8191       : 6        |                              |

10:09:11
     usecs               : count     distribution
         0 -> 1          : 3        |                              |
         2 -> 3          : 62       |*                             |
         4 -> 7          : 1459     |***************************** |
         8 -> 15         : 1869     |******************************|
        16 -> 31         : 612      |*************                 |
        32 -> 63         : 478      |**********                    |
        64 -> 127        : 844      |******************            |
       128 -> 255        : 1123     |************************      |
       256 -> 511        : 1278     |***************************   |
       512 -> 1023       : 1113     |***********************       |
      1024 -> 2047       : 632      |*************                 |
      2048 -> 4095       : 158      |***                           |
      4096 -> 8191       : 18       |                              |
      8192 -> 16383      : 1        |                              |

10:09:12
     usecs               : count     distribution
         0 -> 1          : 1        |                              |
         2 -> 3          : 68       |*                             |
         4 -> 7          : 1399     |***************************   |
         8 -> 15         : 1822     |******************************|
        16 -> 31         : 559      |************                  |
        32 -> 63         : 513      |***********                   |
        64 -> 127        : 906      |*******************           |
       128 -> 255        : 1217     |***********************       |
       256 -> 511        : 1391     |***************************   |
       512 -> 1023       : 1135     |************************      |
      1024 -> 2047       : 569      |************                  |
      2048 -> 4095       : 110      |**                            |
      4096 -> 8191       : 26       |                              |
      8192 -> 16383      : 11       |                              |

Thanks,
Ping




^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/2] nvmet: support polling task for RDMA and TCP
  2024-07-04 10:35               ` Ping Gan
@ 2024-07-05  5:59                 ` Sagi Grimberg
  2024-07-05  6:28                   ` Ping Gan
  0 siblings, 1 reply; 17+ messages in thread
From: Sagi Grimberg @ 2024-07-05  5:59 UTC (permalink / raw)
  To: Ping Gan, hch, kch, linux-nvme, linux-kernel; +Cc: ping.gan



On 7/4/24 13:35, Ping Gan wrote:
>> On 7/4/24 11:10, Ping Gan wrote:
>>>> On 02/07/2024 13:02, Ping Gan wrote:
>>>>>> On 01/07/2024 10:42, Ping Gan wrote:
>>>>>>>> Hey Ping Gan,
>>>>>>>>
>>>>>>>>
>>>>>>>> On 26/06/2024 11:28, Ping Gan wrote:
>>>>>>>>> When running nvmf on SMP platform, current nvme target's RDMA
>>>>>>>>> and
>>>>>>>>> TCP use kworker to handle IO. But if there is other high
>>>>>>>>> workload
>>>>>>>>> in the system(eg: on kubernetes), the competition between the
>>>>>>>>> kworker and other workload is very radical. And since the
>>>>>>>>> kworker
>>>>>>>>> is scheduled by OS randomly, it's difficult to control OS
>>>>>>>>> resource
>>>>>>>>> and also tune the performance. If target support to use
>>>>>>>>> dedicated
>>>>>>>>> polling task to handle IO, it's useful to control OS resource
>>>>>>>>> and
>>>>>>>>> gain good performance. So it makes sense to add polling task in
>>>>>>>>> nvmet-rdma and nvmet-tcp modules.
>>>>>>>> This is NOT the way to go here.
>>>>>>>>
>>>>>>>> Both rdma and tcp are driven from workqueue context, which are
>>>>>>>> bound
>>>>>>>> workqueues.
>>>>>>>>
>>>>>>>> So there are two ways to go here:
>>>>>>>> 1. Add generic port cpuset and use that to direct traffic to the
>>>>>>>> appropriate set of cores
>>>>>>>> (i.e. select an appropriate comp_vector for rdma and add an
>>>>>>>> appropriate
>>>>>>>> steering rule
>>>>>>>> for tcp).
>>>>>>>> 2. Add options to rdma/tcp to use UNBOUND workqueues, and allow
>>>>>>>> users
>>>>>>>> to
>>>>>>>> control
>>>>>>>> these UNBOUND workqueues cpumask via sysfs.
>>>>>>>>
>>>>>>>> (2) will not control interrupts to steer to other workloads
>>>>>>>> cpus,
>>>>>>>> but
>>>>>>>> the handlers may
>>>>>>>> run on a set of dedicated cpus.
>>>>>>>>
>>>>>>>> (1) is a better solution, but harder to implement.
>>>>>>>>
>>>>>>>> You also should look into nvmet-fc as well (and nvmet-loop for
>>>>>>>> that
>>>>>>>> matter).
>>>>>>> hi Sagi Grimberg,
>>>>>>> Thanks for your reply, actually we had tried the first advice you
>>>>>>> suggested, but we found the performance was poor when using spdk
>>>>>>> as initiator.
>>>>>> I suggest that you focus on that instead of what you proposed.
>>>>>> What is the source of your poor performance?
>>>>> Before these patches, we had used linux's RPS to forward the
>>>>> packets
>>>>> to a fixed cpu set for nvmet-tcp. But when did that we can still
>>>>> not
>>>>> cancel the competition between softirq and workqueue since nvme
>>>>> target's
>>>>> kworker cpu core bind on socket's cpu which is from skb. Besides
>>>>> that
>>>>> we found workqueue's wait latency was very high even we enabled
>>>>> polling
>>>>> on nvmet-tcp by module parameter idle_poll_period_usecs. So when
>>>>> initiator
>>>>> is polling mode, the target of workqueue is the bottleneck. Below
>>>>> is
>>>>> work's wait latency trace log of our test on our cluster(per node
>>>>> uses
>>>>> 4 numas 96 cores, 192G memory, one dual ports mellanox CX4LX(25Gbps
>>>>> X
>>>>> 2)
>>>>> ethernet adapter and randrw 1M IO size) by RPS to 6 cpu cores. And
>>>>> system's CPU and memory were used about 80%.
>>>> I'd try a simple unbound CPU case, steer packets to say cores [0-5]
>>>> and
>>>> assign
>>>> the cpumask of the unbound workqueue to cores [6-11].
>>> Okay, thanks for your guide.
>>>
>>>>> ogden-brown:~ #/usr/share/bcc/tools/wqlat -T -w nvmet_tcp_wq 1 2
>>>>> 01:06:59
>>>>>     usecs               : count     distribution
>>>>>         0 -> 1          : 0        |                              |
>>>>>         2 -> 3          : 0        |                              |
>>>>>         4 -> 7          : 0        |                              |
>>>>>         8 -> 15         : 3        |                              |
>>>>>        16 -> 31         : 10       |                              |
>>>>>        32 -> 63         : 3        |                              |
>>>>>        64 -> 127        : 2        |                              |
>>>>>       128 -> 255        : 0        |                              |
>>>>>       256 -> 511        : 5        |                              |
>>>>>       512 -> 1023       : 12       |                              |
>>>>>      1024 -> 2047       : 26       |*                             |
>>>>>      2048 -> 4095       : 34       |*                             |
>>>>>      4096 -> 8191       : 350      |************                  |
>>>>>      8192 -> 16383      : 625      |******************************|
>>>>>     16384 -> 32767      : 244      |*********                     |
>>>>>     32768 -> 65535      : 39       |*                             |
>>>>>
>>>>> 01:07:00
>>>>>     usecs               : count     distribution
>>>>>         0 -> 1          : 1        |                              |
>>>>>         2 -> 3          : 0        |                              |
>>>>>         4 -> 7          : 4        |                              |
>>>>>         8 -> 15         : 3        |                              |
>>>>>        16 -> 31         : 8        |                              |
>>>>>        32 -> 63         : 10       |                              |
>>>>>        64 -> 127        : 3        |                              |
>>>>>       128 -> 255        : 6        |                              |
>>>>>       256 -> 511        : 8        |                              |
>>>>>       512 -> 1023       : 20       |*                             |
>>>>>      1024 -> 2047       : 19       |*                             |
>>>>>      2048 -> 4095       : 57       |**                            |
>>>>>      4096 -> 8191       : 325      |****************              |
>>>>>      8192 -> 16383      : 647      |******************************|
>>>>>     16384 -> 32767      : 228      |***********                   |
>>>>>     32768 -> 65535      : 43       |**                            |
>>>>>     65536 -> 131071     : 1        |                              |
>>>>>
>>>>> And the bandwidth of a node is only 3100MB. While we used the patch
>>>>> and enable 6 polling task, the bandwidth can be 4000MB. It's a good
>>>>> improvement.
>>>> I think you will see similar performance with unbound workqueue and
>>>> rps.
>>> Yes, I remodified the nvmet-tcp/nvmet-rdma code for supporting
>>> unbound
>>> workqueue, and in same prerequisites of above to run test, and
>>> compared
>>> the result of unbound workqueue and polling mode task. And I got a
>>> good
>>> performance for unbound workqueue. For unbound workqueue TCP we got
>>> 3850M/node, it's almost equal to polling task. And also tested
>>> nvmet-rdma
>>> we get 5100M/node for unbound workqueue RDMA versus 5600M for polling
>>> task,
>>> seems the diff is very small. Anyway, your advice is good.
>> I'm a bit surprised that you see ~10% delta here. I would look into
>> what
>> is the root-cause of
>> this difference. If indeed the load is high, the overhead of the
>> workqueue mgmt should be
>> negligible. I'm assuming you used IB_POLL_UNBOUND_WORKQUEUE ?
> Yes, we used IB_POLL_UNBOUND_WORKQUEUE to create ib CQ. And I observed
> 3% CPU
> usage of unbound workqueue versus 6% of polling task.
>
>>>    Do you think
>>> we
>>> should submit the unbound workqueue patches for nvmet-tcp and
>>> nvmet-rdma
>>> to upstream nvmet?
>> For nvmet-tcp, I think there is merit to split socket processing from
>> napi context. For nvmet-rdma
>> I think the only difference is if you have multiple CQs assigned with
>> the same comp_vector.
>>
>> How many queues do you have in your test?
> We used 24 IO queues to nvmet-rdma target. I think this may also be
> related to workqueue's wait latency. We still see some several ms wait
> latency for unbound workqueue of RDMA. You can see below trace log.

What is the queue size of each? What RDMA device are you using?


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/2] nvmet: support polling task for RDMA and TCP
  2024-07-05  5:59                 ` Sagi Grimberg
@ 2024-07-05  6:28                   ` Ping Gan
  0 siblings, 0 replies; 17+ messages in thread
From: Ping Gan @ 2024-07-05  6:28 UTC (permalink / raw)
  To: sagi, hch, kch, linux-nvme, linux-kernel; +Cc: ping.gan

> On 7/4/24 13:35, Ping Gan wrote:
>>> On 7/4/24 11:10, Ping Gan wrote:
>>>>> On 02/07/2024 13:02, Ping Gan wrote:
>>>>>>> On 01/07/2024 10:42, Ping Gan wrote:
>>>>>>>>> Hey Ping Gan,
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> On 26/06/2024 11:28, Ping Gan wrote:
>>>>>>>>>> When running nvmf on SMP platform, current nvme target's RDMA
>>>>>>>>>> and
>>>>>>>>>> TCP use kworker to handle IO. But if there is other high
>>>>>>>>>> workload
>>>>>>>>>> in the system(eg: on kubernetes), the competition between the
>>>>>>>>>> kworker and other workload is very radical. And since the
>>>>>>>>>> kworker
>>>>>>>>>> is scheduled by OS randomly, it's difficult to control OS
>>>>>>>>>> resource
>>>>>>>>>> and also tune the performance. If target support to use
>>>>>>>>>> dedicated
>>>>>>>>>> polling task to handle IO, it's useful to control OS resource
>>>>>>>>>> and
>>>>>>>>>> gain good performance. So it makes sense to add polling task
>>>>>>>>>> in
>>>>>>>>>> nvmet-rdma and nvmet-tcp modules.
>>>>>>>>> This is NOT the way to go here.
>>>>>>>>>
>>>>>>>>> Both rdma and tcp are driven from workqueue context, which are
>>>>>>>>> bound
>>>>>>>>> workqueues.
>>>>>>>>>
>>>>>>>>> So there are two ways to go here:
>>>>>>>>> 1. Add generic port cpuset and use that to direct traffic to
>>>>>>>>> the
>>>>>>>>> appropriate set of cores
>>>>>>>>> (i.e. select an appropriate comp_vector for rdma and add an
>>>>>>>>> appropriate
>>>>>>>>> steering rule
>>>>>>>>> for tcp).
>>>>>>>>> 2. Add options to rdma/tcp to use UNBOUND workqueues, and
>>>>>>>>> allow
>>>>>>>>> users
>>>>>>>>> to
>>>>>>>>> control
>>>>>>>>> these UNBOUND workqueues cpumask via sysfs.
>>>>>>>>>
>>>>>>>>> (2) will not control interrupts to steer to other workloads
>>>>>>>>> cpus,
>>>>>>>>> but
>>>>>>>>> the handlers may
>>>>>>>>> run on a set of dedicated cpus.
>>>>>>>>>
>>>>>>>>> (1) is a better solution, but harder to implement.
>>>>>>>>>
>>>>>>>>> You also should look into nvmet-fc as well (and nvmet-loop for
>>>>>>>>> that
>>>>>>>>> matter).
>>>>>>>> hi Sagi Grimberg,
>>>>>>>> Thanks for your reply, actually we had tried the first advice
>>>>>>>> you
>>>>>>>> suggested, but we found the performance was poor when using
>>>>>>>> spdk
>>>>>>>> as initiator.
>>>>>>> I suggest that you focus on that instead of what you proposed.
>>>>>>> What is the source of your poor performance?
>>>>>> Before these patches, we had used linux's RPS to forward the
>>>>>> packets
>>>>>> to a fixed cpu set for nvmet-tcp. But when did that we can still
>>>>>> not
>>>>>> cancel the competition between softirq and workqueue since nvme
>>>>>> target's
>>>>>> kworker cpu core bind on socket's cpu which is from skb. Besides
>>>>>> that
>>>>>> we found workqueue's wait latency was very high even we enabled
>>>>>> polling
>>>>>> on nvmet-tcp by module parameter idle_poll_period_usecs. So when
>>>>>> initiator
>>>>>> is polling mode, the target of workqueue is the bottleneck. Below
>>>>>> is
>>>>>> work's wait latency trace log of our test on our cluster(per node
>>>>>> uses
>>>>>> 4 numas 96 cores, 192G memory, one dual ports mellanox
>>>>>> CX4LX(25Gbps
>>>>>> X
>>>>>> 2)
>>>>>> ethernet adapter and randrw 1M IO size) by RPS to 6 cpu cores.
>>>>>> And
>>>>>> system's CPU and memory were used about 80%.
>>>>> I'd try a simple unbound CPU case, steer packets to say cores
>>>>> [0-5]
>>>>> and
>>>>> assign
>>>>> the cpumask of the unbound workqueue to cores [6-11].
>>>> Okay, thanks for your guide.
>>>>
>>>>>> ogden-brown:~ #/usr/share/bcc/tools/wqlat -T -w nvmet_tcp_wq 1 2
>>>>>> 01:06:59
>>>>>>     usecs               : count     distribution
>>>>>>      0 -> 1          : 0        |                              |
>>>>>>      2 -> 3          : 0        |                              |
>>>>>>      4 -> 7          : 0        |                              |
>>>>>>      8 -> 15         : 3        |                              |
>>>>>>     16 -> 31         : 10       |                              |
>>>>>>     32 -> 63         : 3        |                              |
>>>>>>     64 -> 127        : 2        |                              |
>>>>>>    128 -> 255        : 0        |                              |
>>>>>>    256 -> 511        : 5        |                              |
>>>>>>    512 -> 1023       : 12       |                              |
>>>>>>   1024 -> 2047       : 26       |*                             |
>>>>>>   2048 -> 4095       : 34       |*                             |
>>>>>>   4096 -> 8191       : 350      |************                  |
>>>>>>   8192 -> 16383      : 625      |******************************|
>>>>>>  16384 -> 32767      : 244      |*********                     |
>>>>>>  32768 -> 65535      : 39       |*                             |
>>>>>>
>>>>>> 01:07:00
>>>>>>     usecs               : count     distribution
>>>>>>      0 -> 1          : 1        |                              |
>>>>>>      2 -> 3          : 0        |                              |
>>>>>>      4 -> 7          : 4        |                              |
>>>>>>      8 -> 15         : 3        |                              |
>>>>>>     16 -> 31         : 8        |                              |
>>>>>>     32 -> 63         : 10       |                              |
>>>>>>     64 -> 127        : 3        |                              |
>>>>>>    128 -> 255        : 6        |                              |
>>>>>>    256 -> 511        : 8        |                              |
>>>>>>    512 -> 1023       : 20       |*                             |
>>>>>>   1024 -> 2047       : 19       |*                             |
>>>>>>   2048 -> 4095       : 57       |**                            |
>>>>>>   4096 -> 8191       : 325      |****************              |
>>>>>>   8192 -> 16383      : 647      |******************************|
>>>>>>  16384 -> 32767      : 228      |***********                   |
>>>>>>  32768 -> 65535      : 43       |**                            |
>>>>>>  65536 -> 131071     : 1        |                              |
>>>>>>
>>>>>> And the bandwidth of a node is only 3100MB. While we used the
>>>>>> patch
>>>>>> and enable 6 polling task, the bandwidth can be 4000MB. It's a
>>>>>> good
>>>>>> improvement.
>>>>> I think you will see similar performance with unbound workqueue
>>>>> and
>>>>> rps.
>>>> Yes, I remodified the nvmet-tcp/nvmet-rdma code for supporting
>>>> unbound
>>>> workqueue, and in same prerequisites of above to run test, and
>>>> compared
>>>> the result of unbound workqueue and polling mode task. And I got a
>>>> good
>>>> performance for unbound workqueue. For unbound workqueue TCP we got
>>>> 3850M/node, it's almost equal to polling task. And also tested
>>>> nvmet-rdma
>>>> we get 5100M/node for unbound workqueue RDMA versus 5600M for
>>>> polling
>>>> task,
>>>> seems the diff is very small. Anyway, your advice is good.
>>> I'm a bit surprised that you see ~10% delta here. I would look into
>>> what
>>> is the root-cause of
>>> this difference. If indeed the load is high, the overhead of the
>>> workqueue mgmt should be
>>> negligible. I'm assuming you used IB_POLL_UNBOUND_WORKQUEUE ?
>> Yes, we used IB_POLL_UNBOUND_WORKQUEUE to create ib CQ. And I
>> observed
>> 3% CPU
>> usage of unbound workqueue versus 6% of polling task.
>>
>>>>    Do you think
>>>> we
>>>> should submit the unbound workqueue patches for nvmet-tcp and
>>>> nvmet-rdma
>>>> to upstream nvmet?
>>> For nvmet-tcp, I think there is merit to split socket processing
>>> from
>>> napi context. For nvmet-rdma
>>> I think the only difference is if you have multiple CQs assigned
>>> with
>>> the same comp_vector.
>>>
>>> How many queues do you have in your test?
>> We used 24 IO queues to nvmet-rdma target. I think this may also be
>> related to workqueue's wait latency. We still see some several ms
>> wait
>> latency for unbound workqueue of RDMA. You can see below trace log.
>
> What is the queue size of each? what rdma device are you using?

The IO size on every queue is 1M and the queue depth is 32. The RDMA device is
a Mellanox CX4LX with its two ports bonded. For the polling task we used
IB_POLL_DIRECT
to create the CQ, versus IB_POLL_UNBOUND_WORKQUEUE for the workqueue.

Thanks,
Ping




^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/2] nvmet: support polling task for RDMA and TCP
  2024-07-04  8:10           ` Ping Gan
  2024-07-04  8:40             ` Sagi Grimberg
@ 2024-07-16 10:36             ` Hannes Reinecke
  2024-07-17  0:53               ` Ping Gan
  1 sibling, 1 reply; 17+ messages in thread
From: Hannes Reinecke @ 2024-07-16 10:36 UTC (permalink / raw)
  To: Ping Gan, sagi, hch, kch, linux-nvme, linux-kernel; +Cc: ping.gan

On 7/4/24 10:10, Ping Gan wrote:
>> On 02/07/2024 13:02, Ping Gan wrote:
[ .. ]
>>> And the bandwidth of a node is only 3100MB. While we used the patch
>>> and enable 6 polling task, the bandwidth can be 4000MB. It's a good
>>> improvement.
>>
>> I think you will see similar performance with unbound workqueue and
>> rps.
> 
> Yes, I remodified the nvmet-tcp/nvmet-rdma code for supporting unbound
> workqueue, and in same prerequisites of above to run test, and compared
> the result of unbound workqueue and polling mode task. And I got a good
> performance for unbound workqueue. For unbound workqueue TCP we got
> 3850M/node, it's almost equal to polling task. And also tested
> nvmet-rdma we get 5100M/node for unbound workqueue RDMA versus 5600M for
> polling task, seems the diff is very small. Anyway, your advice is good.
> Do you think we should submit the unbound workqueue patches for nvmet-tcp
> and nvmet-rdma to upstream nvmet?

Please do. I have been using pretty much the same patch during
development of my nvme-tcp scalability patchset, and using WQ_UNBOUND
definitely improves the situation here.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/2] nvmet: support polling task for RDMA and TCP
  2024-07-16 10:36             ` Hannes Reinecke
@ 2024-07-17  0:53               ` Ping Gan
  0 siblings, 0 replies; 17+ messages in thread
From: Ping Gan @ 2024-07-17  0:53 UTC (permalink / raw)
  To: hare, sagi, hch, kch, linux-nvme, linux-kernel; +Cc: ping.gan

> On 7/4/24 10:10, Ping Gan wrote:
>>> On 02/07/2024 13:02, Ping Gan wrote:
>
>>>> And the bandwidth of a node is only 3100MB. While we used the patch
>>>> and enable 6 polling task, the bandwidth can be 4000MB. It's a good
>>>> improvement.
>>>
>>> I think you will see similar performance with unbound workqueue and
>>> rps.
>> 
>> Yes, I remodified the nvmet-tcp/nvmet-rdma code for supporting
>> unbound
>> workqueue, and in same prerequisites of above to run test, and
>> compared
>> the result of unbound workqueue and polling mode task. And I got a
>> good
>> performance for unbound workqueue. For unbound workqueue TCP we got
>> 3850M/node, it's almost equal to polling task. And also tested
>> nvmet-rdma we get 5100M/node for unbound workqueue RDMA versus 5600M
>> for
>> polling task, seems the diff is very small. Anyway, your advice is
>> good.
>> Do you think we should submit the unbound workqueue patches for
>> nvmet-tcp
>> and nvmet-rdma to upstream nvmet?
>
> Please do. I have been using pretty much the same patch during
> development of my nvme-tcp scalability patchset, and using WQ_UNBOUND
> definitely improves the situation here.

Thanks for the confirmation! Okay, will do that.

Regards,
Ping




^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2024-07-17  0:54 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-06-26  8:28 [PATCH 0/2] nvmet: support polling task for RDMA and TCP Ping Gan
2024-06-26  8:28 ` [PATCH 1/2] nvmet-rdma: add polling cq task for nvmet-rdma Ping Gan
2024-06-26  8:28 ` [PATCH 2/2] nvmet-tcp: add polling task for nvmet-tcp Ping Gan
2024-06-30  8:58 ` [PATCH 0/2] nvmet: support polling task for RDMA and TCP Sagi Grimberg
2024-07-01  7:42   ` Ping Gan
2024-07-01  7:42     ` Ping Gan
2024-07-01  8:22     ` Sagi Grimberg
2024-07-02 10:02       ` Ping Gan
2024-07-02 10:02         ` Ping Gan
2024-07-03 19:58         ` Sagi Grimberg
2024-07-04  8:10           ` Ping Gan
2024-07-04  8:40             ` Sagi Grimberg
2024-07-04 10:35               ` Ping Gan
2024-07-05  5:59                 ` Sagi Grimberg
2024-07-05  6:28                   ` Ping Gan
2024-07-16 10:36             ` Hannes Reinecke
2024-07-17  0:53               ` Ping Gan

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox