virtualization.lists.linux-foundation.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 01/14] vhost: add vhost_worker pointer to vhost_virtqueue
@ 2023-04-28 16:31 Mike Christie
  2023-04-28 16:31 ` [PATCH 02/14] vhost, vhost_net: add helper to check if vq has work Mike Christie
                   ` (13 more replies)
  0 siblings, 14 replies; 18+ messages in thread
From: Mike Christie @ 2023-04-28 16:31 UTC (permalink / raw)
  To: virtualization, mst, sgarzare, jasowang, stefanha

This patchset allows userspace to map vqs to different workers. This
patch adds a worker pointer to the vq so we can store that info.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/vhost.c | 24 +++++++++++++-----------
 drivers/vhost/vhost.h |  1 +
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 10bf35a3db6e..7a8eef246e1f 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -486,6 +486,7 @@ void vhost_dev_init(struct vhost_dev *dev,
 		vq->log = NULL;
 		vq->indirect = NULL;
 		vq->heads = NULL;
+		vq->worker = NULL;
 		vq->dev = dev;
 		mutex_init(&vq->mutex);
 		vhost_vq_reset(dev, vq);
@@ -554,16 +555,15 @@ static void vhost_worker_free(struct vhost_dev *dev)
 	kfree(worker);
 }
 
-static int vhost_worker_create(struct vhost_dev *dev)
+static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
 {
 	struct vhost_worker *worker;
 	struct vhost_task *vtsk;
 	char name[TASK_COMM_LEN];
-	int ret;
 
 	worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT);
 	if (!worker)
-		return -ENOMEM;
+		return NULL;
 
 	dev->worker = worker;
 	worker->kcov_handle = kcov_common_handle();
@@ -571,25 +571,24 @@ static int vhost_worker_create(struct vhost_dev *dev)
 	snprintf(name, sizeof(name), "vhost-%d", current->pid);
 
 	vtsk = vhost_task_create(vhost_worker, worker, name);
-	if (!vtsk) {
-		ret = -ENOMEM;
+	if (!vtsk)
 		goto free_worker;
-	}
 
 	worker->vtsk = vtsk;
 	vhost_task_start(vtsk);
-	return 0;
+	return worker;
 
 free_worker:
 	kfree(worker);
 	dev->worker = NULL;
-	return ret;
+	return NULL;
 }
 
 /* Caller should have device mutex */
 long vhost_dev_set_owner(struct vhost_dev *dev)
 {
-	int err;
+	struct vhost_worker *worker;
+	int err, i;
 
 	/* Is there an owner already? */
 	if (vhost_dev_has_owner(dev)) {
@@ -600,9 +599,12 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
 	vhost_attach_mm(dev);
 
 	if (dev->use_worker) {
-		err = vhost_worker_create(dev);
-		if (err)
+		worker = vhost_worker_create(dev);
+		if (!worker)
 			goto err_worker;
+
+		for (i = 0; i < dev->nvqs; i++)
+			dev->vqs[i]->worker = worker;
 	}
 
 	err = vhost_dev_alloc_iovecs(dev);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 0308638cdeee..e72b665ba3a5 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -74,6 +74,7 @@ struct vhost_vring_call {
 /* The virtqueue structure describes a queue attached to a device. */
 struct vhost_virtqueue {
 	struct vhost_dev *dev;
+	struct vhost_worker *worker;
 
 	/* The actual ring of buffers. */
 	struct mutex mutex;
-- 
2.25.1

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 02/14] vhost, vhost_net: add helper to check if vq has work
  2023-04-28 16:31 [PATCH 01/14] vhost: add vhost_worker pointer to vhost_virtqueue Mike Christie
@ 2023-04-28 16:31 ` Mike Christie
  2023-04-28 16:31 ` [PATCH 03/14] vhost: take worker or vq instead of dev for queueing Mike Christie
                   ` (12 subsequent siblings)
  13 siblings, 0 replies; 18+ messages in thread
From: Mike Christie @ 2023-04-28 16:31 UTC (permalink / raw)
  To: virtualization, mst, sgarzare, jasowang, stefanha

In the next patches each vq might have different workers so one could
have work but others do not. For net, we only want to check specific vqs,
so this adds a helper to check if a vq has work pending and converts
vhost-net to use it.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c   | 2 +-
 drivers/vhost/vhost.c | 6 +++---
 drivers/vhost/vhost.h | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 07181cd8d52e..8ed63651b9eb 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -546,7 +546,7 @@ static void vhost_net_busy_poll(struct vhost_net *net,
 	endtime = busy_clock() + busyloop_timeout;
 
 	while (vhost_can_busy_poll(endtime)) {
-		if (vhost_has_work(&net->dev)) {
+		if (vhost_vq_has_work(vq)) {
 			*busyloop_intr = true;
 			break;
 		}
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 7a8eef246e1f..aefb52952e1d 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -262,11 +262,11 @@ void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
 EXPORT_SYMBOL_GPL(vhost_work_queue);
 
 /* A lockless hint for busy polling code to exit the loop */
-bool vhost_has_work(struct vhost_dev *dev)
+bool vhost_vq_has_work(struct vhost_virtqueue *vq)
 {
-	return dev->worker && !llist_empty(&dev->worker->work_list);
+	return vq->worker && !llist_empty(&vq->worker->work_list);
 }
-EXPORT_SYMBOL_GPL(vhost_has_work);
+EXPORT_SYMBOL_GPL(vhost_vq_has_work);
 
 void vhost_poll_queue(struct vhost_poll *poll)
 {
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index e72b665ba3a5..0dde119fb0ee 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -45,7 +45,6 @@ struct vhost_poll {
 
 void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn);
 void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work);
-bool vhost_has_work(struct vhost_dev *dev);
 
 void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
 		     __poll_t mask, struct vhost_dev *dev);
@@ -195,6 +194,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *,
 		      struct vhost_log *log, unsigned int *log_num);
 void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
 
+bool vhost_vq_has_work(struct vhost_virtqueue *vq);
 bool vhost_vq_is_setup(struct vhost_virtqueue *vq);
 int vhost_vq_init_access(struct vhost_virtqueue *);
 int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
-- 
2.25.1

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 03/14] vhost: take worker or vq instead of dev for queueing
  2023-04-28 16:31 [PATCH 01/14] vhost: add vhost_worker pointer to vhost_virtqueue Mike Christie
  2023-04-28 16:31 ` [PATCH 02/14] vhost, vhost_net: add helper to check if vq has work Mike Christie
@ 2023-04-28 16:31 ` Mike Christie
  2023-04-28 16:31 ` [PATCH 04/14] vhost: take worker or vq instead of dev for flushing Mike Christie
                   ` (11 subsequent siblings)
  13 siblings, 0 replies; 18+ messages in thread
From: Mike Christie @ 2023-04-28 16:31 UTC (permalink / raw)
  To: virtualization, mst, sgarzare, jasowang, stefanha

This patch has the core work queueing function take a worker for when we
support multiple workers. It also adds a helper that takes a vq during
queueing so modules can control which vq/worker to queue work on.

This temporarily leaves vhost_work_queue. It will be removed when the drivers
are converted in the next patches.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/vhost.c | 44 +++++++++++++++++++++++++++----------------
 drivers/vhost/vhost.h |  1 +
 2 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index aefb52952e1d..faf1dcf0af30 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -231,6 +231,34 @@ void vhost_poll_stop(struct vhost_poll *poll)
 }
 EXPORT_SYMBOL_GPL(vhost_poll_stop);
 
+static void vhost_work_queue_on(struct vhost_worker *worker,
+				struct vhost_work *work)
+{
+	if (!worker)
+		return;
+
+	if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) {
+		/* We can only add the work to the list after we're
+		 * sure it was not in the list.
+		 * test_and_set_bit() implies a memory barrier.
+		 */
+		llist_add(&work->node, &worker->work_list);
+		wake_up_process(worker->vtsk->task);
+	}
+}
+
+void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
+{
+	vhost_work_queue_on(dev->worker, work);
+}
+EXPORT_SYMBOL_GPL(vhost_work_queue);
+
+void vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work)
+{
+	vhost_work_queue_on(vq->worker, work);
+}
+EXPORT_SYMBOL_GPL(vhost_vq_work_queue);
+
 void vhost_dev_flush(struct vhost_dev *dev)
 {
 	struct vhost_flush_struct flush;
@@ -245,22 +273,6 @@ void vhost_dev_flush(struct vhost_dev *dev)
 }
 EXPORT_SYMBOL_GPL(vhost_dev_flush);
 
-void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
-{
-	if (!dev->worker)
-		return;
-
-	if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) {
-		/* We can only add the work to the list after we're
-		 * sure it was not in the list.
-		 * test_and_set_bit() implies a memory barrier.
-		 */
-		llist_add(&work->node, &dev->worker->work_list);
-		wake_up_process(dev->worker->vtsk->task);
-	}
-}
-EXPORT_SYMBOL_GPL(vhost_work_queue);
-
 /* A lockless hint for busy polling code to exit the loop */
 bool vhost_vq_has_work(struct vhost_virtqueue *vq)
 {
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 0dde119fb0ee..b64ee4ef387d 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -194,6 +194,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *,
 		      struct vhost_log *log, unsigned int *log_num);
 void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
 
+void vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work);
 bool vhost_vq_has_work(struct vhost_virtqueue *vq);
 bool vhost_vq_is_setup(struct vhost_virtqueue *vq);
 int vhost_vq_init_access(struct vhost_virtqueue *);
-- 
2.25.1

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 04/14] vhost: take worker or vq instead of dev for flushing
  2023-04-28 16:31 [PATCH 01/14] vhost: add vhost_worker pointer to vhost_virtqueue Mike Christie
  2023-04-28 16:31 ` [PATCH 02/14] vhost, vhost_net: add helper to check if vq has work Mike Christie
  2023-04-28 16:31 ` [PATCH 03/14] vhost: take worker or vq instead of dev for queueing Mike Christie
@ 2023-04-28 16:31 ` Mike Christie
  2023-04-28 16:31 ` [PATCH 05/14] vhost: convert poll work to be vq based Mike Christie
                   ` (10 subsequent siblings)
  13 siblings, 0 replies; 18+ messages in thread
From: Mike Christie @ 2023-04-28 16:31 UTC (permalink / raw)
  To: virtualization, mst, sgarzare, jasowang, stefanha

This patch has the core work flush function take a worker. When we
support multiple workers we can then flush each worker during device
removal, stoppage, etc.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/vhost.c | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index faf1dcf0af30..957f33f9ad25 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -247,6 +247,20 @@ static void vhost_work_queue_on(struct vhost_worker *worker,
 	}
 }
 
+static void vhost_work_flush_on(struct vhost_worker *worker)
+{
+	struct vhost_flush_struct flush;
+
+	if (!worker)
+		return;
+
+	init_completion(&flush.wait_event);
+	vhost_work_init(&flush.work, vhost_flush_work);
+
+	vhost_work_queue_on(worker, &flush.work);
+	wait_for_completion(&flush.wait_event);
+}
+
 void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
 {
 	vhost_work_queue_on(dev->worker, work);
@@ -261,15 +275,7 @@ EXPORT_SYMBOL_GPL(vhost_vq_work_queue);
 
 void vhost_dev_flush(struct vhost_dev *dev)
 {
-	struct vhost_flush_struct flush;
-
-	if (dev->worker) {
-		init_completion(&flush.wait_event);
-		vhost_work_init(&flush.work, vhost_flush_work);
-
-		vhost_work_queue(dev, &flush.work);
-		wait_for_completion(&flush.wait_event);
-	}
+	vhost_work_flush_on(dev->worker);
 }
 EXPORT_SYMBOL_GPL(vhost_dev_flush);
 
-- 
2.25.1

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 05/14] vhost: convert poll work to be vq based
  2023-04-28 16:31 [PATCH 01/14] vhost: add vhost_worker pointer to vhost_virtqueue Mike Christie
                   ` (2 preceding siblings ...)
  2023-04-28 16:31 ` [PATCH 04/14] vhost: take worker or vq instead of dev for flushing Mike Christie
@ 2023-04-28 16:31 ` Mike Christie
  2023-04-28 16:31 ` [PATCH 06/14] vhost_sock: convert to vhost_vq_work_queue Mike Christie
                   ` (9 subsequent siblings)
  13 siblings, 0 replies; 18+ messages in thread
From: Mike Christie @ 2023-04-28 16:31 UTC (permalink / raw)
  To: virtualization, mst, sgarzare, jasowang, stefanha

This has the drivers pass in their poll to vq mapping and then converts
the core poll code to use the vq based helpers. In the next patches we
will allow vqs to be handled by different workers, so to allow drivers
to execute operations like queue, stop, flush, etc on specific polls/vqs
we need to know the mappings.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
---
 drivers/vhost/net.c   | 6 ++++--
 drivers/vhost/vhost.c | 8 +++++---
 drivers/vhost/vhost.h | 4 +++-
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 8ed63651b9eb..4a9b757071a2 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1342,8 +1342,10 @@ static int vhost_net_open(struct inode *inode, struct file *f)
 		       VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT, true,
 		       NULL);
 
-	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, dev);
-	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev);
+	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, dev,
+			vqs[VHOST_NET_VQ_TX]);
+	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev,
+			vqs[VHOST_NET_VQ_RX]);
 
 	f->private_data = n;
 	n->page_frag.page = NULL;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 957f33f9ad25..331728f58121 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -187,13 +187,15 @@ EXPORT_SYMBOL_GPL(vhost_work_init);
 
 /* Init poll structure */
 void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
-		     __poll_t mask, struct vhost_dev *dev)
+		     __poll_t mask, struct vhost_dev *dev,
+		     struct vhost_virtqueue *vq)
 {
 	init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
 	init_poll_funcptr(&poll->table, vhost_poll_func);
 	poll->mask = mask;
 	poll->dev = dev;
 	poll->wqh = NULL;
+	poll->vq = vq;
 
 	vhost_work_init(&poll->work, fn);
 }
@@ -288,7 +290,7 @@ EXPORT_SYMBOL_GPL(vhost_vq_has_work);
 
 void vhost_poll_queue(struct vhost_poll *poll)
 {
-	vhost_work_queue(poll->dev, &poll->work);
+	vhost_vq_work_queue(poll->vq, &poll->work);
 }
 EXPORT_SYMBOL_GPL(vhost_poll_queue);
 
@@ -510,7 +512,7 @@ void vhost_dev_init(struct vhost_dev *dev,
 		vhost_vq_reset(dev, vq);
 		if (vq->handle_kick)
 			vhost_poll_init(&vq->poll, vq->handle_kick,
-					EPOLLIN, dev);
+					EPOLLIN, dev, vq);
 	}
 }
 EXPORT_SYMBOL_GPL(vhost_dev_init);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index b64ee4ef387d..d9b8abbe3a26 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -41,13 +41,15 @@ struct vhost_poll {
 	struct vhost_work	work;
 	__poll_t		mask;
 	struct vhost_dev	*dev;
+	struct vhost_virtqueue	*vq;
 };
 
 void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn);
 void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work);
 
 void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
-		     __poll_t mask, struct vhost_dev *dev);
+		     __poll_t mask, struct vhost_dev *dev,
+		     struct vhost_virtqueue *vq);
 int vhost_poll_start(struct vhost_poll *poll, struct file *file);
 void vhost_poll_stop(struct vhost_poll *poll);
 void vhost_poll_queue(struct vhost_poll *poll);
-- 
2.25.1

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 06/14] vhost_sock: convert to vhost_vq_work_queue
  2023-04-28 16:31 [PATCH 01/14] vhost: add vhost_worker pointer to vhost_virtqueue Mike Christie
                   ` (3 preceding siblings ...)
  2023-04-28 16:31 ` [PATCH 05/14] vhost: convert poll work to be vq based Mike Christie
@ 2023-04-28 16:31 ` Mike Christie
  2023-04-28 16:31 ` [PATCH 07/14] vhost_scsi: make SCSI cmd completion per vq Mike Christie
                   ` (8 subsequent siblings)
  13 siblings, 0 replies; 18+ messages in thread
From: Mike Christie @ 2023-04-28 16:31 UTC (permalink / raw)
  To: virtualization, mst, sgarzare, jasowang, stefanha

Convert from vhost_work_queue to vhost_vq_work_queue.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
---
 drivers/vhost/vsock.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 6578db78f0ae..817d377a3f36 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -285,7 +285,7 @@ vhost_transport_send_pkt(struct sk_buff *skb)
 		atomic_inc(&vsock->queued_replies);
 
 	virtio_vsock_skb_queue_tail(&vsock->send_pkt_queue, skb);
-	vhost_work_queue(&vsock->dev, &vsock->send_pkt_work);
+	vhost_vq_work_queue(&vsock->vqs[VSOCK_VQ_RX], &vsock->send_pkt_work);
 
 	rcu_read_unlock();
 	return len;
@@ -583,7 +583,7 @@ static int vhost_vsock_start(struct vhost_vsock *vsock)
 	/* Some packets may have been queued before the device was started,
 	 * let's kick the send worker to send them.
 	 */
-	vhost_work_queue(&vsock->dev, &vsock->send_pkt_work);
+	vhost_vq_work_queue(&vsock->vqs[VSOCK_VQ_RX], &vsock->send_pkt_work);
 
 	mutex_unlock(&vsock->dev.mutex);
 	return 0;
-- 
2.25.1

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 07/14] vhost_scsi: make SCSI cmd completion per vq
  2023-04-28 16:31 [PATCH 01/14] vhost: add vhost_worker pointer to vhost_virtqueue Mike Christie
                   ` (4 preceding siblings ...)
  2023-04-28 16:31 ` [PATCH 06/14] vhost_sock: convert to vhost_vq_work_queue Mike Christie
@ 2023-04-28 16:31 ` Mike Christie
  2023-04-28 16:31 ` [PATCH 08/14] vhost_scsi: convert to vhost_vq_work_queue Mike Christie
                   ` (7 subsequent siblings)
  13 siblings, 0 replies; 18+ messages in thread
From: Mike Christie @ 2023-04-28 16:31 UTC (permalink / raw)
  To: virtualization, mst, sgarzare, jasowang, stefanha

This patch separates the scsi cmd completion code paths so we can complete
cmds based on their vq instead of having all cmds complete on the same
worker/CPU. This will be useful with the next patches that allow us to
create multiple worker threads and bind them to different vqs, so we can
have completions running on different threads/CPUs.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 drivers/vhost/scsi.c | 56 ++++++++++++++++++++------------------------
 1 file changed, 26 insertions(+), 30 deletions(-)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index bb10fa4bb4f6..a77c53bb035a 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -167,6 +167,7 @@ MODULE_PARM_DESC(max_io_vqs, "Set the max number of IO virtqueues a vhost scsi d
 
 struct vhost_scsi_virtqueue {
 	struct vhost_virtqueue vq;
+	struct vhost_scsi *vs;
 	/*
 	 * Reference counting for inflight reqs, used for flush operation. At
 	 * each time, one reference tracks new commands submitted, while we
@@ -181,6 +182,9 @@ struct vhost_scsi_virtqueue {
 	struct vhost_scsi_cmd *scsi_cmds;
 	struct sbitmap scsi_tags;
 	int max_cmds;
+
+	struct vhost_work completion_work;
+	struct llist_head completion_list;
 };
 
 struct vhost_scsi {
@@ -190,12 +194,8 @@ struct vhost_scsi {
 
 	struct vhost_dev dev;
 	struct vhost_scsi_virtqueue *vqs;
-	unsigned long *compl_bitmap;
 	struct vhost_scsi_inflight **old_inflight;
 
-	struct vhost_work vs_completion_work; /* cmd completion work item */
-	struct llist_head vs_completion_list; /* cmd completion queue */
-
 	struct vhost_work vs_event_work; /* evt injection work item */
 	struct llist_head vs_event_list; /* evt injection queue */
 
@@ -358,10 +358,11 @@ static void vhost_scsi_release_cmd(struct se_cmd *se_cmd)
 	} else {
 		struct vhost_scsi_cmd *cmd = container_of(se_cmd,
 					struct vhost_scsi_cmd, tvc_se_cmd);
-		struct vhost_scsi *vs = cmd->tvc_vhost;
+		struct vhost_scsi_virtqueue *svq =  container_of(cmd->tvc_vq,
+					struct vhost_scsi_virtqueue, vq);
 
-		llist_add(&cmd->tvc_completion_list, &vs->vs_completion_list);
-		vhost_work_queue(&vs->dev, &vs->vs_completion_work);
+		llist_add(&cmd->tvc_completion_list, &svq->completion_list);
+		vhost_vq_work_queue(&svq->vq, &svq->completion_work);
 	}
 }
 
@@ -509,17 +510,17 @@ static void vhost_scsi_evt_work(struct vhost_work *work)
  */
 static void vhost_scsi_complete_cmd_work(struct vhost_work *work)
 {
-	struct vhost_scsi *vs = container_of(work, struct vhost_scsi,
-					vs_completion_work);
+	struct vhost_scsi_virtqueue *svq = container_of(work,
+				struct vhost_scsi_virtqueue, completion_work);
 	struct virtio_scsi_cmd_resp v_rsp;
 	struct vhost_scsi_cmd *cmd, *t;
 	struct llist_node *llnode;
 	struct se_cmd *se_cmd;
 	struct iov_iter iov_iter;
-	int ret, vq;
+	bool signal = false;
+	int ret;
 
-	bitmap_zero(vs->compl_bitmap, vs->dev.nvqs);
-	llnode = llist_del_all(&vs->vs_completion_list);
+	llnode = llist_del_all(&svq->completion_list);
 	llist_for_each_entry_safe(cmd, t, llnode, tvc_completion_list) {
 		se_cmd = &cmd->tvc_se_cmd;
 
@@ -539,21 +540,17 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work)
 			      cmd->tvc_in_iovs, sizeof(v_rsp));
 		ret = copy_to_iter(&v_rsp, sizeof(v_rsp), &iov_iter);
 		if (likely(ret == sizeof(v_rsp))) {
-			struct vhost_scsi_virtqueue *q;
+			signal = true;
+
 			vhost_add_used(cmd->tvc_vq, cmd->tvc_vq_desc, 0);
-			q = container_of(cmd->tvc_vq, struct vhost_scsi_virtqueue, vq);
-			vq = q - vs->vqs;
-			__set_bit(vq, vs->compl_bitmap);
 		} else
 			pr_err("Faulted on virtio_scsi_cmd_resp\n");
 
 		vhost_scsi_release_cmd_res(se_cmd);
 	}
 
-	vq = -1;
-	while ((vq = find_next_bit(vs->compl_bitmap, vs->dev.nvqs, vq + 1))
-		< vs->dev.nvqs)
-		vhost_signal(&vs->dev, &vs->vqs[vq].vq);
+	if (signal)
+		vhost_signal(&svq->vs->dev, &svq->vq);
 }
 
 static struct vhost_scsi_cmd *
@@ -1770,6 +1767,7 @@ static int vhost_scsi_set_features(struct vhost_scsi *vs, u64 features)
 
 static int vhost_scsi_open(struct inode *inode, struct file *f)
 {
+	struct vhost_scsi_virtqueue *svq;
 	struct vhost_scsi *vs;
 	struct vhost_virtqueue **vqs;
 	int r = -ENOMEM, i, nvqs = vhost_scsi_max_io_vqs;
@@ -1788,10 +1786,6 @@ static int vhost_scsi_open(struct inode *inode, struct file *f)
 	}
 	nvqs += VHOST_SCSI_VQ_IO;
 
-	vs->compl_bitmap = bitmap_alloc(nvqs, GFP_KERNEL);
-	if (!vs->compl_bitmap)
-		goto err_compl_bitmap;
-
 	vs->old_inflight = kmalloc_array(nvqs, sizeof(*vs->old_inflight),
 					 GFP_KERNEL | __GFP_ZERO);
 	if (!vs->old_inflight)
@@ -1806,7 +1800,6 @@ static int vhost_scsi_open(struct inode *inode, struct file *f)
 	if (!vqs)
 		goto err_local_vqs;
 
-	vhost_work_init(&vs->vs_completion_work, vhost_scsi_complete_cmd_work);
 	vhost_work_init(&vs->vs_event_work, vhost_scsi_evt_work);
 
 	vs->vs_events_nr = 0;
@@ -1817,8 +1810,14 @@ static int vhost_scsi_open(struct inode *inode, struct file *f)
 	vs->vqs[VHOST_SCSI_VQ_CTL].vq.handle_kick = vhost_scsi_ctl_handle_kick;
 	vs->vqs[VHOST_SCSI_VQ_EVT].vq.handle_kick = vhost_scsi_evt_handle_kick;
 	for (i = VHOST_SCSI_VQ_IO; i < nvqs; i++) {
-		vqs[i] = &vs->vqs[i].vq;
-		vs->vqs[i].vq.handle_kick = vhost_scsi_handle_kick;
+		svq = &vs->vqs[i];
+
+		vqs[i] = &svq->vq;
+		svq->vs = vs;
+		init_llist_head(&svq->completion_list);
+		vhost_work_init(&svq->completion_work,
+				vhost_scsi_complete_cmd_work);
+		svq->vq.handle_kick = vhost_scsi_handle_kick;
 	}
 	vhost_dev_init(&vs->dev, vqs, nvqs, UIO_MAXIOV,
 		       VHOST_SCSI_WEIGHT, 0, true, NULL);
@@ -1833,8 +1832,6 @@ static int vhost_scsi_open(struct inode *inode, struct file *f)
 err_vqs:
 	kfree(vs->old_inflight);
 err_inflight:
-	bitmap_free(vs->compl_bitmap);
-err_compl_bitmap:
 	kvfree(vs);
 err_vs:
 	return r;
@@ -1854,7 +1851,6 @@ static int vhost_scsi_release(struct inode *inode, struct file *f)
 	kfree(vs->dev.vqs);
 	kfree(vs->vqs);
 	kfree(vs->old_inflight);
-	bitmap_free(vs->compl_bitmap);
 	kvfree(vs);
 	return 0;
 }
-- 
2.25.1

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 08/14] vhost_scsi: convert to vhost_vq_work_queue
  2023-04-28 16:31 [PATCH 01/14] vhost: add vhost_worker pointer to vhost_virtqueue Mike Christie
                   ` (5 preceding siblings ...)
  2023-04-28 16:31 ` [PATCH 07/14] vhost_scsi: make SCSI cmd completion per vq Mike Christie
@ 2023-04-28 16:31 ` Mike Christie
  2023-04-28 16:31 ` [PATCH 09/14] vhost: remove vhost_work_queue Mike Christie
                   ` (6 subsequent siblings)
  13 siblings, 0 replies; 18+ messages in thread
From: Mike Christie @ 2023-04-28 16:31 UTC (permalink / raw)
  To: virtualization, mst, sgarzare, jasowang, stefanha

Convert from vhost_work_queue to vhost_vq_work_queue.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
---
 drivers/vhost/scsi.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index a77c53bb035a..1668009bd489 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -353,8 +353,9 @@ static void vhost_scsi_release_cmd(struct se_cmd *se_cmd)
 	if (se_cmd->se_cmd_flags & SCF_SCSI_TMR_CDB) {
 		struct vhost_scsi_tmf *tmf = container_of(se_cmd,
 					struct vhost_scsi_tmf, se_cmd);
+		struct vhost_virtqueue *vq = &tmf->svq->vq;
 
-		vhost_work_queue(&tmf->vhost->dev, &tmf->vwork);
+		vhost_vq_work_queue(vq, &tmf->vwork);
 	} else {
 		struct vhost_scsi_cmd *cmd = container_of(se_cmd,
 					struct vhost_scsi_cmd, tvc_se_cmd);
@@ -1332,11 +1333,9 @@ static void vhost_scsi_ctl_handle_kick(struct vhost_work *work)
 }
 
 static void
-vhost_scsi_send_evt(struct vhost_scsi *vs,
-		   struct vhost_scsi_tpg *tpg,
-		   struct se_lun *lun,
-		   u32 event,
-		   u32 reason)
+vhost_scsi_send_evt(struct vhost_scsi *vs, struct vhost_virtqueue *vq,
+		    struct vhost_scsi_tpg *tpg, struct se_lun *lun,
+		    u32 event, u32 reason)
 {
 	struct vhost_scsi_evt *evt;
 
@@ -1358,7 +1357,7 @@ vhost_scsi_send_evt(struct vhost_scsi *vs,
 	}
 
 	llist_add(&evt->list, &vs->vs_event_list);
-	vhost_work_queue(&vs->dev, &vs->vs_event_work);
+	vhost_vq_work_queue(vq, &vs->vs_event_work);
 }
 
 static void vhost_scsi_evt_handle_kick(struct vhost_work *work)
@@ -1372,7 +1371,8 @@ static void vhost_scsi_evt_handle_kick(struct vhost_work *work)
 		goto out;
 
 	if (vs->vs_events_missed)
-		vhost_scsi_send_evt(vs, NULL, NULL, VIRTIO_SCSI_T_NO_EVENT, 0);
+		vhost_scsi_send_evt(vs, vq, NULL, NULL, VIRTIO_SCSI_T_NO_EVENT,
+				    0);
 out:
 	mutex_unlock(&vq->mutex);
 }
@@ -1991,7 +1991,7 @@ vhost_scsi_do_plug(struct vhost_scsi_tpg *tpg,
 		goto unlock;
 
 	if (vhost_has_feature(vq, VIRTIO_SCSI_F_HOTPLUG))
-		vhost_scsi_send_evt(vs, tpg, lun,
+		vhost_scsi_send_evt(vs, vq, tpg, lun,
 				   VIRTIO_SCSI_T_TRANSPORT_RESET, reason);
 unlock:
 	mutex_unlock(&vq->mutex);
-- 
2.25.1

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 09/14] vhost: remove vhost_work_queue
  2023-04-28 16:31 [PATCH 01/14] vhost: add vhost_worker pointer to vhost_virtqueue Mike Christie
                   ` (6 preceding siblings ...)
  2023-04-28 16:31 ` [PATCH 08/14] vhost_scsi: convert to vhost_vq_work_queue Mike Christie
@ 2023-04-28 16:31 ` Mike Christie
  2023-04-28 16:31 ` [PATCH 10/14] vhost_scsi: flush IO vqs then send TMF rsp Mike Christie
                   ` (5 subsequent siblings)
  13 siblings, 0 replies; 18+ messages in thread
From: Mike Christie @ 2023-04-28 16:31 UTC (permalink / raw)
  To: virtualization, mst, sgarzare, jasowang, stefanha

vhost_work_queue is no longer used. Each driver is using the poll or vq
based queueing, so remove vhost_work_queue.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
---
 drivers/vhost/vhost.c | 6 ------
 drivers/vhost/vhost.h | 1 -
 2 files changed, 7 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 331728f58121..adc2678a7b80 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -263,12 +263,6 @@ static void vhost_work_flush_on(struct vhost_worker *worker)
 	wait_for_completion(&flush.wait_event);
 }
 
-void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
-{
-	vhost_work_queue_on(dev->worker, work);
-}
-EXPORT_SYMBOL_GPL(vhost_work_queue);
-
 void vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work)
 {
 	vhost_work_queue_on(vq->worker, work);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index d9b8abbe3a26..ef55fae2517c 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -45,7 +45,6 @@ struct vhost_poll {
 };
 
 void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn);
-void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work);
 
 void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
 		     __poll_t mask, struct vhost_dev *dev,
-- 
2.25.1

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 10/14] vhost_scsi: flush IO vqs then send TMF rsp
  2023-04-28 16:31 [PATCH 01/14] vhost: add vhost_worker pointer to vhost_virtqueue Mike Christie
                   ` (7 preceding siblings ...)
  2023-04-28 16:31 ` [PATCH 09/14] vhost: remove vhost_work_queue Mike Christie
@ 2023-04-28 16:31 ` Mike Christie
  2023-04-28 16:31 ` [PATCH 11/14] vhost: add helper to parse userspace vring state/file Mike Christie
                   ` (4 subsequent siblings)
  13 siblings, 0 replies; 18+ messages in thread
From: Mike Christie @ 2023-04-28 16:31 UTC (permalink / raw)
  To: virtualization, mst, sgarzare, jasowang, stefanha

With one worker we will always send the scsi cmd responses then send the
TMF rsp, because LIO will always complete the scsi cmds first then call
into us to send the TMF response.

With multiple workers, the IO vq workers could be running while the
TMF/ctl vq worker is, so this has us do a flush before completing the TMF
to make sure cmds are completed when its work is later queued and run.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
---
 drivers/vhost/scsi.c  | 21 ++++++++++++++++++---
 drivers/vhost/vhost.c |  6 ++++++
 drivers/vhost/vhost.h |  1 +
 3 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 1668009bd489..2c3cf487cc71 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -1133,12 +1133,27 @@ static void vhost_scsi_tmf_resp_work(struct vhost_work *work)
 {
 	struct vhost_scsi_tmf *tmf = container_of(work, struct vhost_scsi_tmf,
 						  vwork);
-	int resp_code;
+	struct vhost_virtqueue *ctl_vq, *vq;
+	int resp_code, i;
+
+	if (tmf->scsi_resp == TMR_FUNCTION_COMPLETE) {
+		/*
+		 * Flush IO vqs that don't share a worker with the ctl to make
+		 * sure they have sent their responses before us.
+		 */
+		ctl_vq = &tmf->vhost->vqs[VHOST_SCSI_VQ_CTL].vq;
+		for (i = VHOST_SCSI_VQ_IO; i < tmf->vhost->dev.nvqs; i++) {
+			vq = &tmf->vhost->vqs[i].vq;
+
+			if (vhost_vq_is_setup(vq) &&
+			    vq->worker != ctl_vq->worker)
+				vhost_vq_flush(vq);
+		}
 
-	if (tmf->scsi_resp == TMR_FUNCTION_COMPLETE)
 		resp_code = VIRTIO_SCSI_S_FUNCTION_SUCCEEDED;
-	else
+	} else {
 		resp_code = VIRTIO_SCSI_S_FUNCTION_REJECTED;
+	}
 
 	vhost_scsi_send_tmf_resp(tmf->vhost, &tmf->svq->vq, tmf->in_iovs,
 				 tmf->vq_desc, &tmf->resp_iov, resp_code);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index adc2678a7b80..2e27d24634f5 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -275,6 +275,12 @@ void vhost_dev_flush(struct vhost_dev *dev)
 }
 EXPORT_SYMBOL_GPL(vhost_dev_flush);
 
+void vhost_vq_flush(struct vhost_virtqueue *vq)
+{
+	vhost_work_flush_on(vq->worker);
+}
+EXPORT_SYMBOL_GPL(vhost_vq_flush);
+
 /* A lockless hint for busy polling code to exit the loop */
 bool vhost_vq_has_work(struct vhost_virtqueue *vq)
 {
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index ef55fae2517c..395707c680e5 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -53,6 +53,7 @@ int vhost_poll_start(struct vhost_poll *poll, struct file *file);
 void vhost_poll_stop(struct vhost_poll *poll);
 void vhost_poll_queue(struct vhost_poll *poll);
 void vhost_dev_flush(struct vhost_dev *dev);
+void vhost_vq_flush(struct vhost_virtqueue *vq);
 
 struct vhost_log {
 	u64 addr;
-- 
2.25.1

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 11/14] vhost: add helper to parse userspace vring state/file
  2023-04-28 16:31 [PATCH 01/14] vhost: add vhost_worker pointer to vhost_virtqueue Mike Christie
                   ` (8 preceding siblings ...)
  2023-04-28 16:31 ` [PATCH 10/14] vhost_scsi: flush IO vqs then send TMF rsp Mike Christie
@ 2023-04-28 16:31 ` Mike Christie
  2023-04-28 16:31 ` [PATCH 12/14] vhost: replace single worker pointer with xarray Mike Christie
                   ` (3 subsequent siblings)
  13 siblings, 0 replies; 18+ messages in thread
From: Mike Christie @ 2023-04-28 16:31 UTC (permalink / raw)
  To: virtualization, mst, sgarzare, jasowang, stefanha

The next patches add new vhost worker ioctls which will need to get a
vhost_virtqueue from a userspace struct which specifies the vq's index.
This moves the vhost_vring_ioctl code to do this to a helper so it can
be shared.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
---
 drivers/vhost/vhost.c | 29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 2e27d24634f5..930b703eb6b6 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -604,6 +604,27 @@ static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
 	return NULL;
 }
 
+static int vhost_get_vq_from_user(struct vhost_dev *dev, void __user *argp,
+				  struct vhost_virtqueue **vq, u32 *id)
+{
+	u32 __user *idxp = argp;
+	u32 idx;
+	long r;
+
+	r = get_user(idx, idxp);
+	if (r < 0)
+		return r;
+
+	if (idx >= dev->nvqs)
+		return -ENOBUFS;
+
+	idx = array_index_nospec(idx, dev->nvqs);
+
+	*vq = dev->vqs[idx];
+	*id = idx;
+	return 0;
+}
+
 /* Caller should have device mutex */
 long vhost_dev_set_owner(struct vhost_dev *dev)
 {
@@ -1614,21 +1635,15 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
 	struct file *eventfp, *filep = NULL;
 	bool pollstart = false, pollstop = false;
 	struct eventfd_ctx *ctx = NULL;
-	u32 __user *idxp = argp;
 	struct vhost_virtqueue *vq;
 	struct vhost_vring_state s;
 	struct vhost_vring_file f;
 	u32 idx;
 	long r;
 
-	r = get_user(idx, idxp);
+	r = vhost_get_vq_from_user(d, argp, &vq, &idx);
 	if (r < 0)
 		return r;
-	if (idx >= d->nvqs)
-		return -ENOBUFS;
-
-	idx = array_index_nospec(idx, d->nvqs);
-	vq = d->vqs[idx];
 
 	if (ioctl == VHOST_SET_VRING_NUM ||
 	    ioctl == VHOST_SET_VRING_ADDR) {
-- 
2.25.1

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 12/14] vhost: replace single worker pointer with xarray
  2023-04-28 16:31 [PATCH 01/14] vhost: add vhost_worker pointer to vhost_virtqueue Mike Christie
                   ` (9 preceding siblings ...)
  2023-04-28 16:31 ` [PATCH 11/14] vhost: add helper to parse userspace vring state/file Mike Christie
@ 2023-04-28 16:31 ` Mike Christie
  2023-04-28 16:31 ` [PATCH 13/14] vhost: allow userspace to create workers Mike Christie
                   ` (2 subsequent siblings)
  13 siblings, 0 replies; 18+ messages in thread
From: Mike Christie @ 2023-04-28 16:31 UTC (permalink / raw)
  To: virtualization, mst, sgarzare, jasowang, stefanha

The next patch allows userspace to create multiple workers per device,
so this patch replaces the vhost_worker pointer with an xarray so we
can store multiple workers and look them up.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
---
 drivers/vhost/vhost.c | 48 +++++++++++++++++++++++++++++++++----------
 drivers/vhost/vhost.h |  3 ++-
 2 files changed, 39 insertions(+), 12 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 930b703eb6b6..4b0b82292379 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -271,7 +271,11 @@ EXPORT_SYMBOL_GPL(vhost_vq_work_queue);
 
 void vhost_dev_flush(struct vhost_dev *dev)
 {
-	vhost_work_flush_on(dev->worker);
+	struct vhost_worker *worker;
+	unsigned long i;
+
+	xa_for_each(&dev->worker_xa, i, worker)
+		vhost_work_flush_on(worker);
 }
 EXPORT_SYMBOL_GPL(vhost_dev_flush);
 
@@ -489,7 +493,6 @@ void vhost_dev_init(struct vhost_dev *dev,
 	dev->umem = NULL;
 	dev->iotlb = NULL;
 	dev->mm = NULL;
-	dev->worker = NULL;
 	dev->iov_limit = iov_limit;
 	dev->weight = weight;
 	dev->byte_weight = byte_weight;
@@ -499,7 +502,7 @@ void vhost_dev_init(struct vhost_dev *dev,
 	INIT_LIST_HEAD(&dev->read_list);
 	INIT_LIST_HEAD(&dev->pending_list);
 	spin_lock_init(&dev->iotlb_lock);
-
+	xa_init_flags(&dev->worker_xa, XA_FLAGS_ALLOC);
 
 	for (i = 0; i < dev->nvqs; ++i) {
 		vq = dev->vqs[i];
@@ -562,30 +565,46 @@ static void vhost_detach_mm(struct vhost_dev *dev)
 	dev->mm = NULL;
 }
 
-static void vhost_worker_free(struct vhost_dev *dev)
+static void vhost_worker_destroy(struct vhost_dev *dev,
+				 struct vhost_worker *worker)
 {
-	struct vhost_worker *worker = dev->worker;
-
 	if (!worker)
 		return;
 
-	dev->worker = NULL;
 	WARN_ON(!llist_empty(&worker->work_list));
+	xa_erase(&dev->worker_xa, worker->id);
 	vhost_task_stop(worker->vtsk);
 	kfree(worker);
 }
 
+static void vhost_workers_free(struct vhost_dev *dev)
+{
+	struct vhost_worker *worker;
+	unsigned long i;
+
+	if (!dev->use_worker)
+		return;
+	/*
+	 * Free the default worker we created and cleanup workers userspace
+	 * created but couldn't clean up (it forgot or crashed).
+	 */
+	xa_for_each(&dev->worker_xa, i, worker)
+		vhost_worker_destroy(dev, worker);
+	xa_destroy(&dev->worker_xa);
+}
+
 static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
 {
 	struct vhost_worker *worker;
 	struct vhost_task *vtsk;
 	char name[TASK_COMM_LEN];
+	int ret;
+	u32 id;
 
 	worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT);
 	if (!worker)
 		return NULL;
 
-	dev->worker = worker;
 	worker->kcov_handle = kcov_common_handle();
 	init_llist_head(&worker->work_list);
 	snprintf(name, sizeof(name), "vhost-%d", current->pid);
@@ -596,11 +615,18 @@ static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
 
 	worker->vtsk = vtsk;
 	vhost_task_start(vtsk);
+
+	ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL);
+	if (ret < 0)
+		goto stop_worker;
+	worker->id = id;
+
 	return worker;
 
+stop_worker:
+	vhost_task_stop(vtsk);
 free_worker:
 	kfree(worker);
-	dev->worker = NULL;
 	return NULL;
 }
 
@@ -654,7 +680,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
 
 	return 0;
 err_iovecs:
-	vhost_worker_free(dev);
+	vhost_workers_free(dev);
 err_worker:
 	vhost_detach_mm(dev);
 err_mm:
@@ -747,7 +773,7 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
 	dev->iotlb = NULL;
 	vhost_clear_msg(dev);
 	wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM);
-	vhost_worker_free(dev);
+	vhost_workers_free(dev);
 	vhost_detach_mm(dev);
 }
 EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 395707c680e5..2eea20d54134 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -30,6 +30,7 @@ struct vhost_worker {
 	struct vhost_task	*vtsk;
 	struct llist_head	work_list;
 	u64			kcov_handle;
+	u32			id;
 };
 
 /* Poll a file (eventfd or socket) */
@@ -156,7 +157,6 @@ struct vhost_dev {
 	struct vhost_virtqueue **vqs;
 	int nvqs;
 	struct eventfd_ctx *log_ctx;
-	struct vhost_worker *worker;
 	struct vhost_iotlb *umem;
 	struct vhost_iotlb *iotlb;
 	spinlock_t iotlb_lock;
@@ -166,6 +166,7 @@ struct vhost_dev {
 	int iov_limit;
 	int weight;
 	int byte_weight;
+	struct xarray worker_xa;
 	bool use_worker;
 	int (*msg_handler)(struct vhost_dev *dev, u32 asid,
 			   struct vhost_iotlb_msg *msg);
-- 
2.25.1

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 13/14] vhost: allow userspace to create workers
  2023-04-28 16:31 [PATCH 01/14] vhost: add vhost_worker pointer to vhost_virtqueue Mike Christie
                   ` (10 preceding siblings ...)
  2023-04-28 16:31 ` [PATCH 12/14] vhost: replace single worker pointer with xarray Mike Christie
@ 2023-04-28 16:31 ` Mike Christie
  2023-05-17  3:10   ` Jason Wang
  2023-04-28 16:31 ` [PATCH 14/14] vhost_scsi: add support for worker ioctls Mike Christie
  2023-04-28 16:35 ` [PATCH v7 00/14] vhost: multiple worker support michael.christie
  13 siblings, 1 reply; 18+ messages in thread
From: Mike Christie @ 2023-04-28 16:31 UTC (permalink / raw)
  To: virtualization, mst, sgarzare, jasowang, stefanha

For vhost-scsi with 3 vqs or more and a workload that tries to use
them in parallel like:

fio --filename=/dev/sdb  --direct=1 --rw=randrw --bs=4k \
--ioengine=libaio --iodepth=128  --numjobs=3

the single vhost worker thread will become a bottleneck and we are stuck
at around 500K IOPs no matter how many jobs, virtqueues, and CPUs are
used.

To better utilize virtqueues and available CPUs, this patch allows
userspace to create workers and bind them to vqs. You can have N workers
per dev and also share N workers with M vqs on that dev.

This patch adds the interface related code and the next patch will hook
vhost-scsi into it. The patches do not try to hook net and vsock into
the interface because:

1. multiple workers don't seem to help vsock. The problem is that with
only 2 virtqueues we never fully use the existing worker when doing
bidirectional tests. This seems to match vhost-scsi where we don't see
the worker as a bottleneck until 3 virtqueues are used.

2. net already has a way to use multiple workers.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
---
 drivers/vhost/vhost.c            | 145 ++++++++++++++++++++++++++++++-
 drivers/vhost/vhost.h            |   3 +
 include/uapi/linux/vhost.h       |  33 +++++++
 include/uapi/linux/vhost_types.h |  16 ++++
 4 files changed, 196 insertions(+), 1 deletion(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 4b0b82292379..e8f829f35814 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -630,6 +630,80 @@ static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
 	return NULL;
 }
 
+/* Caller must have device mutex */
+static void __vhost_vq_attach_worker(struct vhost_virtqueue *vq,
+				     struct vhost_worker *worker)
+{
+	if (vq->worker)
+		vq->worker->attachment_cnt--;
+	worker->attachment_cnt++;
+	vq->worker = worker;
+}
+
+/**
+ * vhost_vq_attach_worker - set a virtqueue's worker from an ioctl command
+ * @vq: the virtqueue we will set the worker for
+ * @info: the worker userspace has requested us to use
+ *
+ * We only allow userspace to set a virtqueue's worker if it's not active and
+ * polling is not enabled. We also assume drivers supporting this will not be
+ * internally queueing works directly or via calls like vhost_dev_flush at
+ * this time.
+ *
+ * Caller must have device and virtqueue mutex.
+ */
+static int vhost_vq_attach_worker(struct vhost_virtqueue *vq,
+				  struct vhost_vring_worker *info)
+{
+	unsigned long index = info->worker_id;
+	struct vhost_dev *dev = vq->dev;
+	struct vhost_worker *worker;
+
+	if (!dev->use_worker)
+		return -EINVAL;
+
+	if (vhost_vq_get_backend(vq) || vq->kick)
+		return -EBUSY;
+
+	worker = xa_find(&dev->worker_xa, &index, UINT_MAX, XA_PRESENT);
+	if (!worker || worker->id != info->worker_id)
+		return -ENODEV;
+
+	__vhost_vq_attach_worker(vq, worker);
+	return 0;
+}
+
+/* Caller must have device mutex */
+static int vhost_new_worker(struct vhost_dev *dev,
+			    struct vhost_worker_state *info)
+{
+	struct vhost_worker *worker;
+
+	worker = vhost_worker_create(dev);
+	if (!worker)
+		return -ENOMEM;
+
+	info->worker_id = worker->id;
+	return 0;
+}
+
+static int vhost_free_worker(struct vhost_dev *dev,
+			     struct vhost_worker_state *info)
+{
+	unsigned long index = info->worker_id;
+	struct vhost_worker *worker;
+
+	worker = xa_find(&dev->worker_xa, &index, UINT_MAX, XA_PRESENT);
+	if (!worker || worker->id != info->worker_id)
+		return -ENODEV;
+
+	if (worker->attachment_cnt)
+		return -EBUSY;
+
+	vhost_worker_destroy(dev, worker);
+	return 0;
+}
+
 static int vhost_get_vq_from_user(struct vhost_dev *dev, void __user *argp,
 				  struct vhost_virtqueue **vq, u32 *id)
 {
@@ -651,6 +725,75 @@ static int vhost_get_vq_from_user(struct vhost_dev *dev, void __user *argp,
 	return 0;
 }
 
+/* Caller must have device mutex */
+long vhost_worker_ioctl(struct vhost_dev *dev, unsigned int ioctl,
+			void __user *argp)
+{
+	struct vhost_vring_worker ring_worker;
+	struct vhost_worker_state state;
+	struct vhost_virtqueue *vq;
+	long ret;
+	u32 idx;
+
+	if (!dev->use_worker)
+		return -EINVAL;
+
+	if (!vhost_dev_has_owner(dev))
+		return -EINVAL;
+
+	switch (ioctl) {
+	/* dev worker ioctls */
+	case VHOST_NEW_WORKER:
+		ret = vhost_new_worker(dev, &state);
+		if (!ret && copy_to_user(argp, &state, sizeof(state)))
+			ret = -EFAULT;
+		return ret;
+	case VHOST_FREE_WORKER:
+		if (copy_from_user(&state, argp, sizeof(state)))
+			return -EFAULT;
+		return vhost_free_worker(dev, &state);
+	/* vring worker ioctls */
+	case VHOST_ATTACH_VRING_WORKER:
+	case VHOST_GET_VRING_WORKER:
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+
+	ret = vhost_get_vq_from_user(dev, argp, &vq, &idx);
+	if (ret)
+		return ret;
+
+	mutex_lock(&vq->mutex);
+	switch (ioctl) {
+	case VHOST_ATTACH_VRING_WORKER:
+		if (copy_from_user(&ring_worker, argp, sizeof(ring_worker))) {
+			ret = -EFAULT;
+			break;
+		}
+
+		ret = vhost_vq_attach_worker(vq, &ring_worker);
+		if (!ret && copy_to_user(argp, &ring_worker,
+					 sizeof(ring_worker)))
+			ret = -EFAULT;
+		break;
+	case VHOST_GET_VRING_WORKER:
+		ring_worker.index = idx;
+		ring_worker.worker_id = vq->worker->id;
+
+		if (copy_to_user(argp, &ring_worker, sizeof(ring_worker)))
+			ret = -EFAULT;
+		break;
+	default:
+		ret = -ENOIOCTLCMD;
+		break;
+	}
+
+	mutex_unlock(&vq->mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(vhost_worker_ioctl);
+
 /* Caller should have device mutex */
 long vhost_dev_set_owner(struct vhost_dev *dev)
 {
@@ -671,7 +814,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
 			goto err_worker;
 
 		for (i = 0; i < dev->nvqs; i++)
-			dev->vqs[i]->worker = worker;
+			__vhost_vq_attach_worker(dev->vqs[i], worker);
 	}
 
 	err = vhost_dev_alloc_iovecs(dev);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 2eea20d54134..bcb33a2f64f0 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -31,6 +31,7 @@ struct vhost_worker {
 	struct llist_head	work_list;
 	u64			kcov_handle;
 	u32			id;
+	int			attachment_cnt;
 };
 
 /* Poll a file (eventfd or socket) */
@@ -187,6 +188,8 @@ void vhost_dev_cleanup(struct vhost_dev *);
 void vhost_dev_stop(struct vhost_dev *);
 long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, void __user *argp);
 long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp);
+long vhost_worker_ioctl(struct vhost_dev *dev, unsigned int ioctl,
+			void __user *argp);
 bool vhost_vq_access_ok(struct vhost_virtqueue *vq);
 bool vhost_log_access_ok(struct vhost_dev *);
 void vhost_clear_msg(struct vhost_dev *dev);
diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
index 92e1b700b51c..155710585979 100644
--- a/include/uapi/linux/vhost.h
+++ b/include/uapi/linux/vhost.h
@@ -45,6 +45,25 @@
 #define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64)
 /* Specify an eventfd file descriptor to signal on log write. */
 #define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int)
+/* By default, a device gets one vhost_worker that its virtqueues share. This
+ * command allows the owner of the device to create an additional vhost_worker
+ * for the device. It can later be bound to 1 or more of its virtqueues using
+ * the VHOST_ATTACH_VRING_WORKER command.
+ *
+ * This must be called after VHOST_SET_OWNER and the caller must be the owner
+ * of the device. The new thread will inherit caller's cgroups and namespaces,
+ * and will share the caller's memory space. The new thread will also be
+ * counted against the caller's RLIMIT_NPROC value.
+ *
+ * The worker's ID used in other commands will be returned in
+ * vhost_worker_state.
+ */
+#define VHOST_NEW_WORKER _IOR(VHOST_VIRTIO, 0x8, struct vhost_worker_state)
+/* Free a worker created with VHOST_NEW_WORKER if it's not attached to any
+ * virtqueue. If userspace is not able to call this for workers its created,
+ * the kernel will free all the device's workers when the device is closed.
+ */
+#define VHOST_FREE_WORKER _IOW(VHOST_VIRTIO, 0x9, struct vhost_worker_state)
 
 /* Ring setup. */
 /* Set number of descriptors in ring. This parameter can not
@@ -70,6 +89,20 @@
 #define VHOST_VRING_BIG_ENDIAN 1
 #define VHOST_SET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x13, struct vhost_vring_state)
 #define VHOST_GET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x14, struct vhost_vring_state)
+/* Attach a vhost_worker created with VHOST_NEW_WORKER to one of the device's
+ * virtqueues. This must be done before VHOST_SET_VRING_KICK and the driver
+ * specific ioctl to activate the virtqueue (VHOST_SCSI_SET_ENDPOINT,
+ * VHOST_NET_SET_BACKEND, VHOST_VSOCK_SET_RUNNING) has been run.
+ *
+ * This will replace the virtqueue's existing worker. If the replaced worker
+ * is no longer attached to any virtqueues, it can be freed with
+ * VHOST_FREE_WORKER.
+ */
+#define VHOST_ATTACH_VRING_WORKER _IOWR(VHOST_VIRTIO, 0x15,		\
+					struct vhost_vring_worker)
+/* Return the vring worker's ID */
+#define VHOST_GET_VRING_WORKER _IOWR(VHOST_VIRTIO, 0x16,		\
+				     struct vhost_vring_worker)
 
 /* The following ioctls use eventfd file descriptors to signal and poll
  * for events. */
diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h
index c5690a8992d8..d3aad12ad1fa 100644
--- a/include/uapi/linux/vhost_types.h
+++ b/include/uapi/linux/vhost_types.h
@@ -47,6 +47,22 @@ struct vhost_vring_addr {
 	__u64 log_guest_addr;
 };
 
+struct vhost_worker_state {
+	/*
+	 * For VHOST_NEW_WORKER the kernel will return the new vhost_worker id.
+	 * For VHOST_FREE_WORKER this must be set to the id of the vhost_worker
+	 * to free.
+	 */
+	unsigned int worker_id;
+};
+
+struct vhost_vring_worker {
+	/* vring index */
+	unsigned int index;
+	/* The id of the vhost_worker returned from VHOST_NEW_WORKER */
+	unsigned int worker_id;
+};
+
 /* no alignment requirement */
 struct vhost_iotlb_msg {
 	__u64 iova;
-- 
2.25.1

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 14/14] vhost_scsi: add support for worker ioctls
  2023-04-28 16:31 [PATCH 01/14] vhost: add vhost_worker pointer to vhost_virtqueue Mike Christie
                   ` (11 preceding siblings ...)
  2023-04-28 16:31 ` [PATCH 13/14] vhost: allow userspace to create workers Mike Christie
@ 2023-04-28 16:31 ` Mike Christie
  2023-04-28 16:35 ` [PATCH v7 00/14] vhost: multiple worker support michael.christie
  13 siblings, 0 replies; 18+ messages in thread
From: Mike Christie @ 2023-04-28 16:31 UTC (permalink / raw)
  To: virtualization, mst, sgarzare, jasowang, stefanha

This has vhost-scsi support the worker ioctls by calling the
vhost_worker_ioctl helper.

With a single worker, the single thread becomes a bottleneck when trying
to use 3 or more virtqueues like:

fio --filename=/dev/sdb  --direct=1 --rw=randrw --bs=4k \
--ioengine=libaio --iodepth=128  --numjobs=3

With the patches and doing a worker per vq, we can scale to at least
16 vCPUs/vqs (that's my system limit) with the same command fio command
above with numjobs=16:

fio --filename=/dev/sdb  --direct=1 --rw=randrw --bs=4k \
--ioengine=libaio --iodepth=64  --numjobs=16

which gives around 2326K IOPs.

Note that for testing I dropped depth to 64 above because the vhost/virt
layer supports only 1024 total commands per device. And the only tuning I
did was set LIO's emulate_pr to 0 to avoid LIO's PR lock in the main IO
path which becomes an issue at around 12 jobs/virtqueues.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
---
 drivers/vhost/scsi.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 2c3cf487cc71..c83f7f043470 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -1927,6 +1927,14 @@ vhost_scsi_ioctl(struct file *f,
 		if (copy_from_user(&features, featurep, sizeof features))
 			return -EFAULT;
 		return vhost_scsi_set_features(vs, features);
+	case VHOST_NEW_WORKER:
+	case VHOST_FREE_WORKER:
+	case VHOST_ATTACH_VRING_WORKER:
+	case VHOST_GET_VRING_WORKER:
+		mutex_lock(&vs->dev.mutex);
+		r = vhost_worker_ioctl(&vs->dev, ioctl, argp);
+		mutex_unlock(&vs->dev.mutex);
+		return r;
 	default:
 		mutex_lock(&vs->dev.mutex);
 		r = vhost_dev_ioctl(&vs->dev, ioctl, argp);
-- 
2.25.1

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH v7 00/14] vhost: multiple worker support
  2023-04-28 16:31 [PATCH 01/14] vhost: add vhost_worker pointer to vhost_virtqueue Mike Christie
                   ` (12 preceding siblings ...)
  2023-04-28 16:31 ` [PATCH 14/14] vhost_scsi: add support for worker ioctls Mike Christie
@ 2023-04-28 16:35 ` michael.christie
  2023-06-02 11:49   ` Michael S. Tsirkin
  13 siblings, 1 reply; 18+ messages in thread
From: michael.christie @ 2023-04-28 16:35 UTC (permalink / raw)
  To: virtualization, mst, sgarzare, jasowang, stefanha

The following patches were built over Linux's tree. They allow us to
support multiple vhost workers tasks per device. The design is a modified
version of Stefan's original idea where userspace has the kernel create a
worker and we pass back the pid. In this version instead of passing the
pid between user/kernel space we use a worker_id which is just an integer
managed by the vhost driver and we allow userspace to create and free
workers and then attach them to virtqueues at setup time.

All review comments from the past reviews should be handled. If I didn't
reply to a review comment, I agreed with the comment and should have
handled it in this posting. Let me know if I missed one.

Results:
--------

fio jobs        1       2       4       8       12      16
----------------------------------------------------------
1 worker        160k   488k     -       -       -       -
worker per vq   160k   310k    620k    1300k   1836k   2326k


Notes:
0. This used a simple fio command:

fio --filename=/dev/sdb  --direct=1 --rw=randrw --bs=4k \
--ioengine=libaio --iodepth=128  --numjobs=$JOBS_ABOVE

and I used a VM with 16 vCPUs and 16 virtqueues.

1. The patches were tested with LIO's emulate_pr=0 which drops the
LIO PR lock use. This was a bottleneck at around 12 vqs/jobs.

2. Because we have a hard limit of 1024 cmds, if the num jobs * iodepth
was greater than 1024, I would decrease iodepth. So 12 jobs used 85 cmds,
and 16 used 64.

3. The perf issue above at 2 jobs is because when we only have 1 worker
we execute more cmds per vhost_work due to all vqs funneling to one worker.
I have 2 patches that fix this. One just submits from the worker thread
instead of kicking off to a workqueue like how the vhost block patches do.
I'll post this after the worker support is merged. I'm also working on
patches to add back batching during lio completion and do polling on the
submission side.

We will still want the threading patches, because if we batch at the fio
level plus use the vhost threading patches, we can see a big boost like
below. So hopefully doing it at the kernel will allow apps to just work
without having to be smart like fio.

fio using io_uring and batching with the iodepth_batch* settings:

fio jobs        1       2       4       8       12      16
-------------------------------------------------------------
1 worker        494k    520k    -       -       -       -
worker per vq   496k    878k    1542k   2436k   2304k   2590k

V7:
- Add more comments about assumptions.
- Drop refcounting and just use an attachment_cnt variable, so there
is no confusion about when a worker is freed.
- Do an opt-in model, because vsock has an issue where it can queue works
before it's running and it doesn't even need multiple workers, so there
is no point in changing the driver or core code.
- Add back get worker ioctl.
- Broke up last patches to make it easier to read.

V6:
- Rebase against vhost_task patchset.
- Used xa instead of idr.
V5:
- Rebase against user_worker patchset.
- Rebase against flush patchset.
- Redo vhost-scsi tmf flush handling so it doesn't access vq->worker.
V4:
- fix vhost-sock VSOCK_VQ_RX use.
- name functions called directly by ioctl cmd's to match the ioctl cmd.
- break up VHOST_SET_VRING_WORKER into a new, free and attach cmd.
- document worker lifetime, and cgroup, namespace, mm, rlimit
inheritance, make it clear we currently only support sharing within the
device.
- add support to attach workers while IO is running.
- instead of passing a pid_t of the kernel thread, pass a int allocated
by the vhost layer with an idr.

V3:
- fully convert vhost code to use vq based APIs instead of leaving it
half per dev and half per vq.
- rebase against kernel worker API.
- Drop delayed worker creation. We always create the default worker at
VHOST_SET_OWNER time. Userspace can create and bind workers after that.

V2:
- change loop that we take a refcount to the worker in
- replaced pid == -1 with define.
- fixed tabbing/spacing coding style issue
- use hash instead of list to lookup workers.
- I dropped the patch that added an ioctl cmd to get a vq's worker's
pid. I saw we might do a generic netlink interface instead.
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 13/14] vhost: allow userspace to create workers
  2023-04-28 16:31 ` [PATCH 13/14] vhost: allow userspace to create workers Mike Christie
@ 2023-05-17  3:10   ` Jason Wang
  2023-05-18 23:21     ` Mike Christie
  0 siblings, 1 reply; 18+ messages in thread
From: Jason Wang @ 2023-05-17  3:10 UTC (permalink / raw)
  To: Mike Christie; +Cc: mst, stefanha, virtualization

On Sat, Apr 29, 2023 at 12:32 AM Mike Christie
<michael.christie@oracle.com> wrote:
>
> For vhost-scsi with 3 vqs or more and a workload that tries to use
> them in parallel like:
>
> fio --filename=/dev/sdb  --direct=1 --rw=randrw --bs=4k \
> --ioengine=libaio --iodepth=128  --numjobs=3
>
> the single vhost worker thread will become a bottleneck and we are stuck
> at around 500K IOPs no matter how many jobs, virtqueues, and CPUs are
> used.
>
> To better utilize virtqueues and available CPUs, this patch allows
> userspace to create workers and bind them to vqs. You can have N workers
> per dev and also share N workers with M vqs on that dev.
>
> This patch adds the interface related code and the next patch will hook
> vhost-scsi into it. The patches do not try to hook net and vsock into
> the interface because:
>
> 1. multiple workers don't seem to help vsock. The problem is that with
> only 2 virtqueues we never fully use the existing worker when doing
> bidirectional tests. This seems to match vhost-scsi where we don't see
> the worker as a bottleneck until 3 virtqueues are used.
>
> 2. net already has a way to use multiple workers.
>
> Signed-off-by: Mike Christie <michael.christie@oracle.com>
> ---
>  drivers/vhost/vhost.c            | 145 ++++++++++++++++++++++++++++++-
>  drivers/vhost/vhost.h            |   3 +
>  include/uapi/linux/vhost.h       |  33 +++++++
>  include/uapi/linux/vhost_types.h |  16 ++++
>  4 files changed, 196 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 4b0b82292379..e8f829f35814 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -630,6 +630,80 @@ static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
>         return NULL;
>  }
>
> +/* Caller must have device mutex */
> +static void __vhost_vq_attach_worker(struct vhost_virtqueue *vq,
> +                                    struct vhost_worker *worker)
> +{
> +       if (vq->worker)
> +               vq->worker->attachment_cnt--;
> +       worker->attachment_cnt++;
> +       vq->worker = worker;
> +}
> +
> +/**
> + * vhost_vq_attach_worker - set a virtqueue's worker from an ioctl command
> + * @vq: the virtqueue we will set the worker for
> + * @info: the worker userspace has requested us to use
> + *
> + * We only allow userspace to set a virtqueue's worker if it's not active and
> + * polling is not enabled.

I wonder if we can mandate this in the code like check the vq backend
in vhost_vq_work_queue().

 We also assume drivers supporting this will not be
> + * internally queueing works directly or via calls like vhost_dev_flush at
> + * this time.
> + *
> + * Caller must have device and virtqueue mutex.
> + */
> +static int vhost_vq_attach_worker(struct vhost_virtqueue *vq,
> +                                 struct vhost_vring_worker *info)
> +{
> +       unsigned long index = info->worker_id;
> +       struct vhost_dev *dev = vq->dev;
> +       struct vhost_worker *worker;
> +
> +       if (!dev->use_worker)
> +               return -EINVAL;
> +
> +       if (vhost_vq_get_backend(vq) || vq->kick)

It might be worthwhile to have a comment to explain why we need to
check vq->kick here.

This also means the device should not queue work when the backend is NULL.

But I found it is probably not the case for vsock, it calls
vhost_poll_queue() in vhost_transport_cancel_pkt() but
vhost_vsock_stop() doesn't wait before doing vhost_vq_set_backend(vq,
NULL);

Net seems to be fine since it waits for ubufs to be completed in
vhost_net_set_backend().

Can we make things easier by migrating the work_list? I also worry if
there are other corner cases which makes me think how hard it is if we
can just support those ioctls after the backend is set?


> +               return -EBUSY;
> +
> +       worker = xa_find(&dev->worker_xa, &index, UINT_MAX, XA_PRESENT);
> +       if (!worker || worker->id != info->worker_id)
> +               return -ENODEV;
> +
> +       __vhost_vq_attach_worker(vq, worker);
> +       return 0;
> +}
> +
> +/* Caller must have device mutex */
> +static int vhost_new_worker(struct vhost_dev *dev,
> +                           struct vhost_worker_state *info)
> +{
> +       struct vhost_worker *worker;
> +
> +       worker = vhost_worker_create(dev);
> +       if (!worker)
> +               return -ENOMEM;
> +
> +       info->worker_id = worker->id;
> +       return 0;
> +}
> +
> +static int vhost_free_worker(struct vhost_dev *dev,
> +                            struct vhost_worker_state *info)
> +{
> +       unsigned long index = info->worker_id;
> +       struct vhost_worker *worker;
> +
> +       worker = xa_find(&dev->worker_xa, &index, UINT_MAX, XA_PRESENT);
> +       if (!worker || worker->id != info->worker_id)
> +               return -ENODEV;
> +
> +       if (worker->attachment_cnt)
> +               return -EBUSY;
> +
> +       vhost_worker_destroy(dev, worker);
> +       return 0;
> +}
> +
>  static int vhost_get_vq_from_user(struct vhost_dev *dev, void __user *argp,
>                                   struct vhost_virtqueue **vq, u32 *id)
>  {
> @@ -651,6 +725,75 @@ static int vhost_get_vq_from_user(struct vhost_dev *dev, void __user *argp,
>         return 0;
>  }
>
> +/* Caller must have device mutex */
> +long vhost_worker_ioctl(struct vhost_dev *dev, unsigned int ioctl,
> +                       void __user *argp)
> +{
> +       struct vhost_vring_worker ring_worker;
> +       struct vhost_worker_state state;
> +       struct vhost_virtqueue *vq;
> +       long ret;
> +       u32 idx;
> +
> +       if (!dev->use_worker)
> +               return -EINVAL;
> +
> +       if (!vhost_dev_has_owner(dev))
> +               return -EINVAL;
> +
> +       switch (ioctl) {
> +       /* dev worker ioctls */
> +       case VHOST_NEW_WORKER:
> +               ret = vhost_new_worker(dev, &state);
> +               if (!ret && copy_to_user(argp, &state, sizeof(state)))
> +                       ret = -EFAULT;
> +               return ret;
> +       case VHOST_FREE_WORKER:
> +               if (copy_from_user(&state, argp, sizeof(state)))
> +                       return -EFAULT;
> +               return vhost_free_worker(dev, &state);
> +       /* vring worker ioctls */
> +       case VHOST_ATTACH_VRING_WORKER:
> +       case VHOST_GET_VRING_WORKER:
> +               break;
> +       default:
> +               return -ENOIOCTLCMD;
> +       }
> +
> +       ret = vhost_get_vq_from_user(dev, argp, &vq, &idx);
> +       if (ret)
> +               return ret;
> +
> +       mutex_lock(&vq->mutex);
> +       switch (ioctl) {
> +       case VHOST_ATTACH_VRING_WORKER:
> +               if (copy_from_user(&ring_worker, argp, sizeof(ring_worker))) {
> +                       ret = -EFAULT;
> +                       break;
> +               }
> +
> +               ret = vhost_vq_attach_worker(vq, &ring_worker);
> +               if (!ret && copy_to_user(argp, &ring_worker,
> +                                        sizeof(ring_worker)))
> +                       ret = -EFAULT;
> +               break;
> +       case VHOST_GET_VRING_WORKER:
> +               ring_worker.index = idx;
> +               ring_worker.worker_id = vq->worker->id;
> +
> +               if (copy_to_user(argp, &ring_worker, sizeof(ring_worker)))
> +                       ret = -EFAULT;
> +               break;
> +       default:
> +               ret = -ENOIOCTLCMD;
> +               break;
> +       }
> +
> +       mutex_unlock(&vq->mutex);
> +       return ret;
> +}
> +EXPORT_SYMBOL_GPL(vhost_worker_ioctl);
> +
>  /* Caller should have device mutex */
>  long vhost_dev_set_owner(struct vhost_dev *dev)
>  {
> @@ -671,7 +814,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
>                         goto err_worker;
>
>                 for (i = 0; i < dev->nvqs; i++)
> -                       dev->vqs[i]->worker = worker;
> +                       __vhost_vq_attach_worker(dev->vqs[i], worker);
>         }
>
>         err = vhost_dev_alloc_iovecs(dev);
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 2eea20d54134..bcb33a2f64f0 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -31,6 +31,7 @@ struct vhost_worker {
>         struct llist_head       work_list;
>         u64                     kcov_handle;
>         u32                     id;
> +       int                     attachment_cnt;
>  };
>
>  /* Poll a file (eventfd or socket) */
> @@ -187,6 +188,8 @@ void vhost_dev_cleanup(struct vhost_dev *);
>  void vhost_dev_stop(struct vhost_dev *);
>  long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, void __user *argp);
>  long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp);
> +long vhost_worker_ioctl(struct vhost_dev *dev, unsigned int ioctl,
> +                       void __user *argp);
>  bool vhost_vq_access_ok(struct vhost_virtqueue *vq);
>  bool vhost_log_access_ok(struct vhost_dev *);
>  void vhost_clear_msg(struct vhost_dev *dev);
> diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
> index 92e1b700b51c..155710585979 100644
> --- a/include/uapi/linux/vhost.h
> +++ b/include/uapi/linux/vhost.h
> @@ -45,6 +45,25 @@
>  #define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64)
>  /* Specify an eventfd file descriptor to signal on log write. */
>  #define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int)
> +/* By default, a device gets one vhost_worker that its virtqueues share. This
> + * command allows the owner of the device to create an additional vhost_worker
> + * for the device. It can later be bound to 1 or more of its virtqueues using
> + * the VHOST_ATTACH_VRING_WORKER command.
> + *
> + * This must be called after VHOST_SET_OWNER and the caller must be the owner
> + * of the device. The new thread will inherit caller's cgroups and namespaces,
> + * and will share the caller's memory space. The new thread will also be
> + * counted against the caller's RLIMIT_NPROC value.

This makes me think if we should destroy and re-create those after
VHOST_RESET_OWNER?

Thanks

> + *
> + * The worker's ID used in other commands will be returned in
> + * vhost_worker_state.
> + */
> +#define VHOST_NEW_WORKER _IOR(VHOST_VIRTIO, 0x8, struct vhost_worker_state)
> +/* Free a worker created with VHOST_NEW_WORKER if it's not attached to any
> + * virtqueue. If userspace is not able to call this for workers it has created,
> + * the kernel will free all the device's workers when the device is closed.
> + */
> +#define VHOST_FREE_WORKER _IOW(VHOST_VIRTIO, 0x9, struct vhost_worker_state)
>
>  /* Ring setup. */
>  /* Set number of descriptors in ring. This parameter can not
> @@ -70,6 +89,20 @@
>  #define VHOST_VRING_BIG_ENDIAN 1
>  #define VHOST_SET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x13, struct vhost_vring_state)
>  #define VHOST_GET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x14, struct vhost_vring_state)
> +/* Attach a vhost_worker created with VHOST_NEW_WORKER to one of the device's
> + * virtqueues. This must be done before VHOST_SET_VRING_KICK and the driver
> + * specific ioctl to activate the virtqueue (VHOST_SCSI_SET_ENDPOINT,
> + * VHOST_NET_SET_BACKEND, VHOST_VSOCK_SET_RUNNING) has been run.
> + *
> + * This will replace the virtqueue's existing worker. If the replaced worker
> + * is no longer attached to any virtqueues, it can be freed with
> + * VHOST_FREE_WORKER.
> + */
> +#define VHOST_ATTACH_VRING_WORKER _IOWR(VHOST_VIRTIO, 0x15,            \
> +                                       struct vhost_vring_worker)
> +/* Return the vring worker's ID */
> +#define VHOST_GET_VRING_WORKER _IOWR(VHOST_VIRTIO, 0x16,               \
> +                                    struct vhost_vring_worker)
>
>  /* The following ioctls use eventfd file descriptors to signal and poll
>   * for events. */
> diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h
> index c5690a8992d8..d3aad12ad1fa 100644
> --- a/include/uapi/linux/vhost_types.h
> +++ b/include/uapi/linux/vhost_types.h
> @@ -47,6 +47,22 @@ struct vhost_vring_addr {
>         __u64 log_guest_addr;
>  };
>
> +struct vhost_worker_state {
> +       /*
> +        * For VHOST_NEW_WORKER the kernel will return the new vhost_worker id.
> +        * For VHOST_FREE_WORKER this must be set to the id of the vhost_worker
> +        * to free.
> +        */
> +       unsigned int worker_id;
> +};
> +
> +struct vhost_vring_worker {
> +       /* vring index */
> +       unsigned int index;
> +       /* The id of the vhost_worker returned from VHOST_NEW_WORKER */
> +       unsigned int worker_id;
> +};
> +
>  /* no alignment requirement */
>  struct vhost_iotlb_msg {
>         __u64 iova;
> --
> 2.25.1
>

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 13/14] vhost: allow userspace to create workers
  2023-05-17  3:10   ` Jason Wang
@ 2023-05-18 23:21     ` Mike Christie
  0 siblings, 0 replies; 18+ messages in thread
From: Mike Christie @ 2023-05-18 23:21 UTC (permalink / raw)
  To: Jason Wang; +Cc: mst, stefanha, virtualization

On 5/16/23 10:10 PM, Jason Wang wrote:
> On Sat, Apr 29, 2023 at 12:32 AM Mike Christie
> <michael.christie@oracle.com> wrote:
>>
>> For vhost-scsi with 3 vqs or more and a workload that tries to use
>> them in parallel like:
>>
>> fio --filename=/dev/sdb  --direct=1 --rw=randrw --bs=4k \
>> --ioengine=libaio --iodepth=128  --numjobs=3
>>
>> the single vhost worker thread will become a bottleneck and we are stuck
>> at around 500K IOPs no matter how many jobs, virtqueues, and CPUs are
>> used.
>>
>> To better utilize virtqueues and available CPUs, this patch allows
>> userspace to create workers and bind them to vqs. You can have N workers
>> per dev and also share N workers with M vqs on that dev.
>>
>> This patch adds the interface related code and the next patch will hook
>> vhost-scsi into it. The patches do not try to hook net and vsock into
>> the interface because:
>>
>> 1. multiple workers don't seem to help vsock. The problem is that with
>> only 2 virtqueues we never fully use the existing worker when doing
>> bidirectional tests. This seems to match vhost-scsi where we don't see
>> the worker as a bottleneck until 3 virtqueues are used.
>>
>> 2. net already has a way to use multiple workers.
>>
>> Signed-off-by: Mike Christie <michael.christie@oracle.com>
>> ---
>>  drivers/vhost/vhost.c            | 145 ++++++++++++++++++++++++++++++-
>>  drivers/vhost/vhost.h            |   3 +
>>  include/uapi/linux/vhost.h       |  33 +++++++
>>  include/uapi/linux/vhost_types.h |  16 ++++
>>  4 files changed, 196 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>> index 4b0b82292379..e8f829f35814 100644
>> --- a/drivers/vhost/vhost.c
>> +++ b/drivers/vhost/vhost.c
>> @@ -630,6 +630,80 @@ static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
>>         return NULL;
>>  }
>>
>> +/* Caller must have device mutex */
>> +static void __vhost_vq_attach_worker(struct vhost_virtqueue *vq,
>> +                                    struct vhost_worker *worker)
>> +{
>> +       if (vq->worker)
>> +               vq->worker->attachment_cnt--;
>> +       worker->attachment_cnt++;
>> +       vq->worker = worker;
>> +}
>> +
>> +/**
>> + * vhost_vq_attach_worker - set a virtqueue's worker from an ioctl command
>> + * @vq: the virtqueue we will set the worker for
>> + * @info: the worker userspace has requested us to use
>> + *
>> + * We only allow userspace to set a virtqueue's worker if it's not active and
>> + * polling is not enabled.
> 
> I wonder if we can mandate this in the code like check the vq backend
> in vhost_vq_work_queue().

I'll look into it. However, for the vsock case we actually do want to
queue the work even though there is no backend set yet. It sort of just
keeps queueing works until VHOST_VSOCK_SET_RUNNING is executed. When that's
done it will run the works that have been queueing up.


> 
>  We also assume drivers supporting this will not be
>> + * internally queueing works directly or via calls like vhost_dev_flush at
>> + * this time.
>> + *
>> + * Caller must have device and virtqueue mutex.
>> + */
>> +static int vhost_vq_attach_worker(struct vhost_virtqueue *vq,
>> +                                 struct vhost_vring_worker *info)
>> +{
>> +       unsigned long index = info->worker_id;
>> +       struct vhost_dev *dev = vq->dev;
>> +       struct vhost_worker *worker;
>> +
>> +       if (!dev->use_worker)
>> +               return -EINVAL;
>> +
>> +       if (vhost_vq_get_backend(vq) || vq->kick)
> 
> It might be worthwhile to have a comment to explain why we need to
> check vq->kick here.
> 
> This also means the device should not queue work when the backend is NULL.
> 
> But I found it is probably not the case for vsock, it calls
> vhost_poll_queue() in vhost_transport_cancel_pkt() but
> vhost_vsock_stop() doesn't wait before doing vhost_vq_set_backend(vq,
> NULL);
Yeah, there was another case where vhost_transport_send_pkt can call
vhost_work_queue before the backend is set.

I ended up doing the opt in method though. I did a test to convert vsock
and a worker for the recv queue and one for the xmit queue doesn't help.
It was like with vhost-scsi where with just 2 virtqueues, one worker could
handle the load. So I thought there is no point in adding extra code for
vsock if it wasn't going to be used.


> 
> Net seems to be fine since it waits for ubufs to be completed in
> vhost_net_set_backend().
> 
> Can we make things easier by migrating the work_list? I also worry if
> there are other corner cases which makes me think how hard it is if we
> can just support those ioctls after the backend is set?


It's not too difficult to support attaching/detaching workers from queues
when IO is running. One of the versions I posted used RCU. However, it adds
complexity and I didn't have a use case so I didn't want to add code that most
likely wouldn't be used.


>>  void vhost_clear_msg(struct vhost_dev *dev);
>> diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
>> index 92e1b700b51c..155710585979 100644
>> --- a/include/uapi/linux/vhost.h
>> +++ b/include/uapi/linux/vhost.h
>> @@ -45,6 +45,25 @@
>>  #define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64)
>>  /* Specify an eventfd file descriptor to signal on log write. */
>>  #define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int)
>> +/* By default, a device gets one vhost_worker that its virtqueues share. This
>> + * command allows the owner of the device to create an additional vhost_worker
>> + * for the device. It can later be bound to 1 or more of its virtqueues using
>> + * the VHOST_ATTACH_VRING_WORKER command.
>> + *
>> + * This must be called after VHOST_SET_OWNER and the caller must be the owner
>> + * of the device. The new thread will inherit caller's cgroups and namespaces,
>> + * and will share the caller's memory space. The new thread will also be
>> + * counted against the caller's RLIMIT_NPROC value.
> 
> This makes me think if we should destroy and re-create those after
> VHOST_RESET_OWNER?

Do you mean what happens if we did something like we destroy the cgroup after
VHOST_RESET_OWNER? You have to move the threads in the cgroup to another
cgroup before destroying it, so it depends on what's going on at the time.

For example if you are asking about a racey kind of case like we have moved the
parent/qemu thread to new cgroup, then we do VHOST_NEW_WORKER then the new thread
will end up in the new group with the parent. The existing threads might still be
in the old cgroup until you have finished moving them.


_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v7 00/14] vhost: multiple worker support
  2023-04-28 16:35 ` [PATCH v7 00/14] vhost: multiple worker support michael.christie
@ 2023-06-02 11:49   ` Michael S. Tsirkin
  0 siblings, 0 replies; 18+ messages in thread
From: Michael S. Tsirkin @ 2023-06-02 11:49 UTC (permalink / raw)
  To: michael.christie; +Cc: stefanha, virtualization

Hi Mike,

On Fri, Apr 28, 2023 at 11:35:20AM -0500, michael.christie@oracle.com wrote:
> The following patches were built over Linux's tree. They allow us to
> support multiple vhost workers tasks per device. The design is a modified
> version of Stefan's original idea where userspace has the kernel create a
> worker and we pass back the pid. In this version instead of passing the
> pid between user/kernel space we use a worker_id which is just an integer
> managed by the vhost driver and we allow userspace to create and free
> workers and then attach them to virtqueues at setup time.
> 
> All review comments from the past reviews should be handled. If I didn't
> reply to a review comment, I agreed with the comment and should have
> handled it in this posting. Let me know if I missed one.
> 
> Results:
> --------
> 
> fio jobs        1       2       4       8       12      16
> ----------------------------------------------------------
> 1 worker        160k   488k     -       -       -       -
> worker per vq   160k   310k    620k    1300k   1836k   2326k
> 
> 
> Notes:
> 0. This used a simple fio command:
> 
> fio --filename=/dev/sdb  --direct=1 --rw=randrw --bs=4k \
> --ioengine=libaio --iodepth=128  --numjobs=$JOBS_ABOVE
> 
> and I used a VM with 16 vCPUs and 16 virtqueues.
> 
> 1. The patches were tested with LIO's emulate_pr=0 which drops the
> LIO PR lock use. This was a bottleneck at around 12 vqs/jobs.
> 
> 2. Because we have a hard limit of 1024 cmds, if the num jobs * iodepth
> was greater than 1024, I would decrease iodepth. So 12 jobs used 85 cmds,
> and 16 used 64.
> 
> 3. The perf issue above at 2 jobs is because when we only have 1 worker
> we execute more cmds per vhost_work due to all vqs funneling to one worker.
> I have 2 patches that fix this. One is just submit from the worker thread
> instead of kicking off to a workqueue like how the vhost block patches do.
> I'll post this after the worker support is merged. I'm also working on
> patches to add back batching during lio completion and do polling on the
> submission side.
> 
> We will still want the threading patches, because if we batch at the fio
> level plus use the vhost theading patches, we can see a big boost like
> below. So hopefully doing it at the kernel will allow apps to just work
> without having to be smart like fio.
> 
> fio using io_uring and batching with the iodepth_batch* settings:
> 
> fio jobs        1       2       4       8       12      16
> -------------------------------------------------------------
> 1 worker        494k    520k    -       -       -       -
> worker per vq   496k    878k    1542k   2436k   2304k   2590k


Could you rebase on latest rc and repost please?
Thanks!

> V7:
> - Add more comments about assumptions.
> - Drop refcounting and just use an attachment_cnt variable, so there
> is no confusion about when a worker is freed.
> - Do an opt-in model, because vsock has an issue where it can queue works
> before it's running and it doesn't even need multiple workers, so there
> is no point in changing the driver or core code.
> - Add back get worker ioctl.
> - Broke up last patches to make it easier to read.
> 
> V6:
> - Rebase against vhost_task patchset.
> - Used xa instead of idr.
> V5:
> - Rebase against user_worker patchset.
> - Rebase against flush patchset.
> - Redo vhost-scsi tmf flush handling so it doesn't access vq->worker.
> V4:
> - fix vhost-sock VSOCK_VQ_RX use.
> - name functions called directly by ioctl cmd's to match the ioctl cmd.
> - break up VHOST_SET_VRING_WORKER into a new, free and attach cmd.
> - document worker lifetime, and cgroup, namespace, mm, rlimit
> inheritance, make it clear we currently only support sharing within the
> device.
> - add support to attach workers while IO is running.
> - instead of passing a pid_t of the kernel thread, pass an int allocated
> by the vhost layer with an idr.
> 
> V3:
> - fully convert vhost code to use vq based APIs instead of leaving it
> half per dev and half per vq.
> - rebase against kernel worker API.
> - Drop delayed worker creation. We always create the default worker at
> VHOST_SET_OWNER time. Userspace can create and bind workers after that.
> 
> V2:
> - change loop that we take a refcount to the worker in
> - replaced pid == -1 with define.
> - fixed tabbing/spacing coding style issue
> - use hash instead of list to lookup workers.
> - I dropped the patch that added an ioctl cmd to get a vq's worker's
> pid. I saw we might do a generic netlink interface instead.

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2023-06-02 11:50 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-04-28 16:31 [PATCH 01/14] vhost: add vhost_worker pointer to vhost_virtqueue Mike Christie
2023-04-28 16:31 ` [PATCH 02/14] vhost, vhost_net: add helper to check if vq has work Mike Christie
2023-04-28 16:31 ` [PATCH 03/14] vhost: take worker or vq instead of dev for queueing Mike Christie
2023-04-28 16:31 ` [PATCH 04/14] vhost: take worker or vq instead of dev for flushing Mike Christie
2023-04-28 16:31 ` [PATCH 05/14] vhost: convert poll work to be vq based Mike Christie
2023-04-28 16:31 ` [PATCH 06/14] vhost_sock: convert to vhost_vq_work_queue Mike Christie
2023-04-28 16:31 ` [PATCH 07/14] vhost_scsi: make SCSI cmd completion per vq Mike Christie
2023-04-28 16:31 ` [PATCH 08/14] vhost_scsi: convert to vhost_vq_work_queue Mike Christie
2023-04-28 16:31 ` [PATCH 09/14] vhost: remove vhost_work_queue Mike Christie
2023-04-28 16:31 ` [PATCH 10/14] vhost_scsi: flush IO vqs then send TMF rsp Mike Christie
2023-04-28 16:31 ` [PATCH 11/14] vhost: add helper to parse userspace vring state/file Mike Christie
2023-04-28 16:31 ` [PATCH 12/14] vhost: replace single worker pointer with xarray Mike Christie
2023-04-28 16:31 ` [PATCH 13/14] vhost: allow userspace to create workers Mike Christie
2023-05-17  3:10   ` Jason Wang
2023-05-18 23:21     ` Mike Christie
2023-04-28 16:31 ` [PATCH 14/14] vhost_scsi: add support for worker ioctls Mike Christie
2023-04-28 16:35 ` [PATCH v7 00/14] vhost: multiple worker support michael.christie
2023-06-02 11:49   ` Michael S. Tsirkin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).