netdev.vger.kernel.org archive mirror
* [RFC PATCH 0/1] vhost: Reduce TX used buffer signal for performance
@ 2010-10-27 21:00 Shirley Ma
  2010-10-27 21:05 ` Shirley Ma
  0 siblings, 1 reply; 4+ messages in thread
From: Shirley Ma @ 2010-10-27 21:00 UTC (permalink / raw)
  To: mst@redhat.com, David Miller; +Cc: netdev, kvm, linux-kernel

This patch changes vhost TX used-buffer signaling to the guest from
one-by-one to batches of up to 3/4 of the ring size. I tried several
batch sizes (4, 16, 1/4 ring, 1/2 ring) and found that the large batch
works best for message sizes between 256 bytes and 4K in the netperf
TCP_STREAM test, so 3/4 of the ring size was chosen as the signaling
threshold.
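
For illustration only, here is a minimal standalone sketch of the
batching idea (the constants, names, and simulation loop are mine, not
taken from the patch; the actual patch collects completions in
vq->pend and flushes them with vhost_add_used_and_signal_n()):

/* Sketch: queue TX completions and signal the guest only once the
 * batch reaches 3/4 of the ring size, instead of once per buffer. */
#include <stdio.h>

#define RING_SIZE	256
#define BATCH		(RING_SIZE - (RING_SIZE >> 2))	/* 3/4 of the ring */

static int pending[BATCH];	/* heads of completed TX buffers */
static int num_pending;

/* stand-in for vhost_add_used_and_signal_n(): one guest notification */
static void signal_guest(const int *heads, int n)
{
	printf("signal guest: %d used buffers (first head %d)\n", n, heads[0]);
}

/* called once per completed TX buffer */
static void complete_tx(int head)
{
	pending[num_pending++] = head;
	if (num_pending == BATCH) {	/* 3/4 of the ring is pending */
		signal_guest(pending, num_pending);
		num_pending = 0;
	}
}

int main(void)
{
	int head;

	for (head = 0; head < 1000; head++)
		complete_tx(head % RING_SIZE);
	if (num_pending)		/* flush whatever is left over */
		signal_guest(pending, num_pending);
	return 0;
}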

I tested both UDP and TCP performance with a 2-vCPU guest. The
60-second netperf runs below show guest-to-host performance.

TCP_STREAM

Message size	Guest CPU(%)	BW (Mb/s)
		before:after	before:after

256		57.84:58.42	1678.47:1908.75
512		68.68:60.21	1844.18:3387.33
1024		68.01:58.70	1945.14:3384.72
2048		65.36:54.25	2342.45:3799.31
4096		63.25:54.62	3307.11:4451.78
8192		59.57:57.89	6038.64:6694.04

UDP_STREAM

Message size	Guest CPU(%)	BW (Mb/s)
		before:after	before:after

1024		49.64:26.69	1161.0:1687.6
2048		49.88:29.25	2326.8:2850.9
4096		49.59:29.15	3871.1:4880.3
8192		46.09:32.66	6822.9:7825.1
16K		42.90:34.96	11347.1:11767.4

For large message sizes, the 60-second results remain almost the same;
I guess signaling does not play a big role in large-message
transmission.

Shirley

* Re: [RFC PATCH 0/1] vhost: Reduce TX used buffer signal for performance
  2010-10-27 21:00 [RFC PATCH 0/1] vhost: Reduce TX used buffer signal for performance Shirley Ma
@ 2010-10-27 21:05 ` Shirley Ma
  2010-10-28  8:57   ` Stefan Hajnoczi
  0 siblings, 1 reply; 4+ messages in thread
From: Shirley Ma @ 2010-10-27 21:05 UTC (permalink / raw)
  To: mst@redhat.com; +Cc: David Miller, netdev, kvm, linux-kernel

This patch changes vhost TX used-buffer signaling to the guest from
one-by-one to batches of up to 3/4 of the vring size. The change
improves both bandwidth and CPU utilization for TX message sizes from
256 bytes to 8K without introducing any regression.

Signed-off-by: Shirley Ma <xma@us.ibm.com>
---

 drivers/vhost/net.c   |   19 ++++++++++++++++++-
 drivers/vhost/vhost.c |   31 +++++++++++++++++++++++++++++++
 drivers/vhost/vhost.h |    3 +++
 3 files changed, 52 insertions(+), 1 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 4b4da5b..bd1ba71 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -198,7 +198,24 @@ static void handle_tx(struct vhost_net *net)
 		if (err != len)
 			pr_debug("Truncated TX packet: "
 				 " len %d != %zd\n", err, len);
-		vhost_add_used_and_signal(&net->dev, vq, head, 0);
+		/*
+		 * if no pending buffer size allocate, signal used buffer
+		 * one by one, otherwise, signal used buffer when reaching
+		 * 3/4 ring size to reduce CPU utilization.
+		 */
+		if (unlikely(vq->pend))
+			vhost_add_used_and_signal(&net->dev, vq, head, 0);
+		else {
+			vq->pend[vq->num_pend].id = head;
+			vq->pend[vq->num_pend].len = 0;
+			++vq->num_pend;
+			if (vq->num_pend == (vq->num - (vq->num >> 2))) {
+				vhost_add_used_and_signal_n(&net->dev, vq,
+							    vq->pend,
+							    vq->num_pend);
+				vq->num_pend = 0;
+			}
+		}
 		total_len += len;
 		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
 			vhost_poll_queue(&vq->poll);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 94701ff..47696d2 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -170,6 +170,16 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vq->call_ctx = NULL;
 	vq->call = NULL;
 	vq->log_ctx = NULL;
+	/* signal pending used buffers */
+	if (vq->pend) {
+		if (vq->num_pend != 0) {
+			vhost_add_used_and_signal_n(dev, vq, vq->pend,
+						    vq->num_pend);
+			vq->num_pend = 0;
+		}
+		kfree(vq->pend);
+	}
+	vq->pend = NULL;
 }
 
 static int vhost_worker(void *data)
@@ -273,7 +283,13 @@ long vhost_dev_init(struct vhost_dev *dev,
 		dev->vqs[i].heads = NULL;
 		dev->vqs[i].dev = dev;
 		mutex_init(&dev->vqs[i].mutex);
+		dev->vqs[i].num_pend = 0;
+		dev->vqs[i].pend = NULL;
 		vhost_vq_reset(dev, dev->vqs + i);
+		/* signal 3/4 of ring size used buffers */
+		dev->vqs[i].pend = kmalloc((dev->vqs[i].num -
+					   (dev->vqs[i].num >> 2)) *
+					   sizeof *vq->peed, GFP_KERNEL);
 		if (dev->vqs[i].handle_kick)
 			vhost_poll_init(&dev->vqs[i].poll,
 					dev->vqs[i].handle_kick, POLLIN, dev);
@@ -599,6 +615,21 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
 			r = -EINVAL;
 			break;
 		}
+		if (vq->num != s.num) {
+			/* signal used buffers first */
+			if (vq->pend) {
+				if (vq->num_pend != 0) {
+					vhost_add_used_and_signal_n(vq->dev, vq,
+								    vq->pend,
+								    vq->num_pend);
+					vq->num_pend = 0;
+				}
+				kfree(vq->pend);
+			}
+			/* realloc pending used buffers size */
+			vq->pend = kmalloc((s.num - (s.num >> 2)) *
+					   sizeof *vq->pend, GFP_KERNEL);
+		}
 		vq->num = s.num;
 		break;
 	case VHOST_SET_VRING_BASE:
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 073d06a..78949c0 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -108,6 +108,9 @@ struct vhost_virtqueue {
 	/* Log write descriptors */
 	void __user *log_base;
 	struct vhost_log *log;
+	/* delay multiple used buffers to signal once */
+	int num_pend;
+	struct vring_used_elem *pend;
 };
 
 struct vhost_dev {

* Re: [RFC PATCH 0/1] vhost: Reduce TX used buffer signal for performance
  2010-10-27 21:05 ` Shirley Ma
@ 2010-10-28  8:57   ` Stefan Hajnoczi
  2010-10-28  8:59     ` Stefan Hajnoczi
  0 siblings, 1 reply; 4+ messages in thread
From: Stefan Hajnoczi @ 2010-10-28  8:57 UTC (permalink / raw)
  To: Shirley Ma; +Cc: mst@redhat.com, David Miller, netdev, kvm, linux-kernel

On Wed, Oct 27, 2010 at 10:05 PM, Shirley Ma <mashirle@us.ibm.com> wrote:
> This patch changes vhost TX used-buffer signaling to the guest from
> one-by-one to batches of up to 3/4 of the vring size. The change
> improves both bandwidth and CPU utilization for TX message sizes from
> 256 bytes to 8K without introducing any regression.

Any concerns about introducing latency or does the guest not care when
TX completions come in?

> Signed-off-by: Shirley Ma <xma@us.ibm.com>
> ---
>
>  drivers/vhost/net.c   |   19 ++++++++++++++++++-
>  drivers/vhost/vhost.c |   31 +++++++++++++++++++++++++++++++
>  drivers/vhost/vhost.h |    3 +++
>  3 files changed, 52 insertions(+), 1 deletions(-)
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 4b4da5b..bd1ba71 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -198,7 +198,24 @@ static void handle_tx(struct vhost_net *net)
>                if (err != len)
>                        pr_debug("Truncated TX packet: "
>                                 " len %d != %zd\n", err, len);
> -               vhost_add_used_and_signal(&net->dev, vq, head, 0);
> +               /*
> +                * if no pending buffer size allocate, signal used buffer
> +                * one by one, otherwise, signal used buffer when reaching
> +                * 3/4 ring size to reduce CPU utilization.
> +                */
> +               if (unlikely(vq->pend))
> +                       vhost_add_used_and_signal(&net->dev, vq, head, 0);
> +               else {
> +                       vq->pend[vq->num_pend].id = head;

I don't understand the logic here: if !vq->pend then we assign to
vq->pend[vq->num_pend].
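
(My guess at the intended check, based on the rest of the patch -- only
fall back to per-buffer signaling when the batch array failed to
allocate:

	if (unlikely(!vq->pend)) {
		/* no batch array: signal this used buffer immediately */
		vhost_add_used_and_signal(&net->dev, vq, head, 0);
	} else {
		/* batch it; signal once 3/4 of the ring is pending */
		vq->pend[vq->num_pend].id = head;
		vq->pend[vq->num_pend].len = 0;
		if (++vq->num_pend == (vq->num - (vq->num >> 2))) {
			vhost_add_used_and_signal_n(&net->dev, vq, vq->pend,
						    vq->num_pend);
			vq->num_pend = 0;
		}
	}
)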

> +                       vq->pend[vq->num_pend].len = 0;
> +                       ++vq->num_pend;
> +                       if (vq->num_pend == (vq->num - (vq->num >> 2))) {
> +                               vhost_add_used_and_signal_n(&net->dev, vq,
> +                                                           vq->pend,
> +                                                           vq->num_pend);
> +                               vq->num_pend = 0;
> +                       }
> +               }
>                total_len += len;
>                if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
>                        vhost_poll_queue(&vq->poll);
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 94701ff..47696d2 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -170,6 +170,16 @@ static void vhost_vq_reset(struct vhost_dev *dev,
>        vq->call_ctx = NULL;
>        vq->call = NULL;
>        vq->log_ctx = NULL;
> +       /* signal pending used buffers */
> +       if (vq->pend) {
> +               if (vq->num_pend != 0) {
> +                       vhost_add_used_and_signal_n(dev, vq, vq->pend,
> +                                                   vq->num_pend);
> +                       vq->num_pend = 0;
> +               }
> +               kfree(vq->pend);
> +       }
> +       vq->pend = NULL;
>  }
>
>  static int vhost_worker(void *data)
> @@ -273,7 +283,13 @@ long vhost_dev_init(struct vhost_dev *dev,
>                dev->vqs[i].heads = NULL;
>                dev->vqs[i].dev = dev;
>                mutex_init(&dev->vqs[i].mutex);
> +               dev->vqs[i].num_pend = 0;
> +               dev->vqs[i].pend = NULL;
>                vhost_vq_reset(dev, dev->vqs + i);
> +               /* signal 3/4 of ring size used buffers */
> +               dev->vqs[i].pend = kmalloc((dev->vqs[i].num -
> +                                          (dev->vqs[i].num >> 2)) *
> +                                          sizeof *vq->peed, GFP_KERNEL);

Has this patch been compile tested?  vq->peed?

Stefan

* Re: [RFC PATCH 0/1] vhost: Reduce TX used buffer signal for performance
  2010-10-28  8:57   ` Stefan Hajnoczi
@ 2010-10-28  8:59     ` Stefan Hajnoczi
  0 siblings, 0 replies; 4+ messages in thread
From: Stefan Hajnoczi @ 2010-10-28  8:59 UTC (permalink / raw)
  To: Shirley Ma; +Cc: mst@redhat.com, David Miller, netdev, kvm, linux-kernel

On Thu, Oct 28, 2010 at 9:57 AM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
Just read the patch 1/1 discussion and it looks like you're already on
it.  Sorry for the noise.

Stefan
