netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC PATCH 1/1] vhost: TX used buffer guest signal accumulation
@ 2010-10-27 21:58 Shirley Ma
  2010-10-28  4:40 ` Shirley Ma
  0 siblings, 1 reply; 20+ messages in thread
From: Shirley Ma @ 2010-10-27 21:58 UTC (permalink / raw)
  To: mst@redhat.com; +Cc: David Miller, netdev, kvm, linux-kernel

This patch changes vhost TX used buffer guest signaling from one by
one to 3/4 of the vring size. This change improves vhost TX bandwidth
and CPU utilization performance for 256 to 8K message sizes without
inducing any regression.


Signed-off-by: Shirley Ma <xma@us.ibm.com>
---
 drivers/vhost/net.c   |   20 +++++++++++++++++++-
 drivers/vhost/vhost.c |   31 +++++++++++++++++++++++++++++++
 drivers/vhost/vhost.h |    3 +++
 3 files changed, 53 insertions(+), 1 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 4b4da5b..45e07cd 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -128,6 +128,7 @@ static void handle_tx(struct vhost_net *net)
 	int err, wmem;
 	size_t hdr_size;
 	struct socket *sock;
+	int max_pend = vq->num - (vq->num >> 2); 
 
 	sock = rcu_dereference_check(vq->private_data,
 				     lockdep_is_held(&vq->mutex));
@@ -198,7 +199,24 @@ static void handle_tx(struct vhost_net *net)
 		if (err != len)
 			pr_debug("Truncated TX packet: "
 				 " len %d != %zd\n", err, len);
-		vhost_add_used_and_signal(&net->dev, vq, head, 0);
+		/*
+		 * if no pending buffer size allocate, signal used buffer
+		 * one by one, otherwise, signal used buffer when reaching
+		 * 3/4 ring size to reduce CPU utilization.
+		 */
+		if (unlikely(vq->pend))
+			vhost_add_used_and_signal(&net->dev, vq, head, 0);
+		else {
+			vq->pend[vq->num_pend].id = head;
+			vq->pend[vq->num_pend].len = 0;
+			++vq->num_pend;
+			if (vq->num_pend == max_pend) {
+				vhost_add_used_and_signal_n(&net->dev, vq,
+							    vq->pend,
+							    vq->num_pend);
+				vq->num_pend = 0;
+			}
+		}
 		total_len += len;
 		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
 			vhost_poll_queue(&vq->poll);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 94701ff..9486a25 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -170,6 +170,16 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vq->call_ctx = NULL;
 	vq->call = NULL;
 	vq->log_ctx = NULL;
+	/* signal pending used buffers */
+	if (vq->pend) {
+		if (vq->num_pend != 0) {
+			vhost_add_used_and_signal_n(dev, vq, vq->pend,
+						    vq->num_pend);
+			vq->num_pend = 0;
+		}
+		kfree(vq->pend);
+	}
+	vq->pend = NULL;
 }
 
 static int vhost_worker(void *data)
@@ -273,7 +283,13 @@ long vhost_dev_init(struct vhost_dev *dev,
 		dev->vqs[i].heads = NULL;
 		dev->vqs[i].dev = dev;
 		mutex_init(&dev->vqs[i].mutex);
+		dev->vqs[i].num_pend = 0;
+		dev->vqs[i].pend = NULL;
 		vhost_vq_reset(dev, dev->vqs + i);
+		/* signal 3/4 of ring size used buffers */
+		dev->vqs[i].pend = kmalloc((dev->vqs[i].num -
+					   (dev->vqs[i].num >> 2)) *
+					   sizeof *vq->pend, GFP_KERNEL);
 		if (dev->vqs[i].handle_kick)
 			vhost_poll_init(&dev->vqs[i].poll,
 					dev->vqs[i].handle_kick, POLLIN, dev);
@@ -599,6 +615,21 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
 			r = -EINVAL;
 			break;
 		}
+		if (vq->num != s.num) {
+			/* signal used buffers first */
+			if (vq->pend) {
+				if (vq->num_pend != 0) {
+					vhost_add_used_and_signal_n(vq->dev, vq,
+								    vq->pend,
+								    vq->num_pend);
+					vq->num_pend = 0;
+				}
+				kfree(vq->pend);
+			}
+			/* realloc pending used buffers size */
+			vq->pend = kmalloc((s.num - (s.num >> 2)) *
+					   sizeof *vq->pend, GFP_KERNEL);
+		}
 		vq->num = s.num;
 		break;
 	case VHOST_SET_VRING_BASE:
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 073d06a..78949c0 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -108,6 +108,9 @@ struct vhost_virtqueue {
 	/* Log write descriptors */
 	void __user *log_base;
 	struct vhost_log *log;
+	/* delay multiple used buffers to signal once */
+	int num_pend;
+	struct vring_used_elem *pend;
 };
 
 struct vhost_dev {



^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH 1/1] vhost: TX used buffer guest signal accumulation
  2010-10-27 21:58 [RFC PATCH 1/1] vhost: TX used buffer guest signal accumulation Shirley Ma
@ 2010-10-28  4:40 ` Shirley Ma
  2010-10-28  5:20   ` Michael S. Tsirkin
  0 siblings, 1 reply; 20+ messages in thread
From: Shirley Ma @ 2010-10-28  4:40 UTC (permalink / raw)
  To: mst@redhat.com; +Cc: David Miller, netdev, kvm, linux-kernel

Resubmit this patch to fix some minor errors (whitespace, typo).

Signed-off-by: Shirley Ma <xma@us.ibm.com>
---
 drivers/vhost/net.c   |   20 +++++++++++++++++++-
 drivers/vhost/vhost.c |   32 ++++++++++++++++++++++++++++++++
 drivers/vhost/vhost.h |    3 +++
 3 files changed, 54 insertions(+), 1 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 4b4da5b..3eb8016 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -128,6 +128,7 @@ static void handle_tx(struct vhost_net *net)
 	int err, wmem;
 	size_t hdr_size;
 	struct socket *sock;
+	int max_pend = vq->num - (vq->num >> 2);
 
 	sock = rcu_dereference_check(vq->private_data,
 				     lockdep_is_held(&vq->mutex));
@@ -198,7 +199,24 @@ static void handle_tx(struct vhost_net *net)
 		if (err != len)
 			pr_debug("Truncated TX packet: "
 				 " len %d != %zd\n", err, len);
-		vhost_add_used_and_signal(&net->dev, vq, head, 0);
+		/*
+		 * if no pending buffer size allocate, signal used buffer
+		 * one by one, otherwise, signal used buffer when reaching
+		 * 3/4 ring size to reduce CPU utilization.
+		 */
+		if (unlikely(vq->pend))
+			vhost_add_used_and_signal(&net->dev, vq, head, 0);
+		else {
+			vq->pend[vq->num_pend].id = head;
+			vq->pend[vq->num_pend].len = 0;
+			++vq->num_pend;
+			if (vq->num_pend == max_pend) {
+				vhost_add_used_and_signal_n(&net->dev, vq,
+							    vq->pend,
+							    vq->num_pend);
+				vq->num_pend = 0;
+			}
+		}
 		total_len += len;
 		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
 			vhost_poll_queue(&vq->poll);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 94701ff..f2f3288 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -170,6 +170,16 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vq->call_ctx = NULL;
 	vq->call = NULL;
 	vq->log_ctx = NULL;
+	/* signal pending used buffers */
+	if (vq->pend) {
+		if (vq->num_pend != 0) {
+			vhost_add_used_and_signal_n(dev, vq, vq->pend,
+						    vq->num_pend);
+			vq->num_pend = 0;
+		}
+		kfree(vq->pend);
+	}
+	vq->pend = NULL;
 }
 
 static int vhost_worker(void *data)
@@ -273,7 +283,14 @@ long vhost_dev_init(struct vhost_dev *dev,
 		dev->vqs[i].heads = NULL;
 		dev->vqs[i].dev = dev;
 		mutex_init(&dev->vqs[i].mutex);
+		dev->vqs[i].num_pend = 0;
+		dev->vqs[i].pend = NULL;
 		vhost_vq_reset(dev, dev->vqs + i);
+		/* signal 3/4 of ring size used buffers */
+		dev->vqs[i].pend = kmalloc((dev->vqs[i].num -
+					   (dev->vqs[i].num >> 2)) *
+					   sizeof *dev->vqs[i].pend,
+					   GFP_KERNEL);
 		if (dev->vqs[i].handle_kick)
 			vhost_poll_init(&dev->vqs[i].poll,
 					dev->vqs[i].handle_kick, POLLIN, dev);
@@ -599,6 +616,21 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
 			r = -EINVAL;
 			break;
 		}
+		if (vq->num != s.num) {
+			/* signal used buffers first */
+			if (vq->pend) {
+				if (vq->num_pend != 0) {
+					vhost_add_used_and_signal_n(vq->dev, vq,
+								    vq->pend,
+								    vq->num_pend);
+					vq->num_pend = 0;
+				}
+				kfree(vq->pend);
+			}
+			/* realloc pending used buffers size */
+			vq->pend = kmalloc((s.num - (s.num >> 2)) *
+					   sizeof *vq->pend, GFP_KERNEL);
+		}
 		vq->num = s.num;
 		break;
 	case VHOST_SET_VRING_BASE:
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 073d06a..78949c0 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -108,6 +108,9 @@ struct vhost_virtqueue {
 	/* Log write descriptors */
 	void __user *log_base;
 	struct vhost_log *log;
+	/* delay multiple used buffers to signal once */
+	int num_pend;
+	struct vring_used_elem *pend;
 };
 
 struct vhost_dev {



^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH 1/1] vhost: TX used buffer guest signal accumulation
  2010-10-28  4:40 ` Shirley Ma
@ 2010-10-28  5:20   ` Michael S. Tsirkin
  2010-10-28 15:24     ` Shirley Ma
                       ` (2 more replies)
  0 siblings, 3 replies; 20+ messages in thread
From: Michael S. Tsirkin @ 2010-10-28  5:20 UTC (permalink / raw)
  To: Shirley Ma; +Cc: David Miller, netdev, kvm, linux-kernel

On Wed, Oct 27, 2010 at 09:40:04PM -0700, Shirley Ma wrote:
> Resubmit this patch for fixing some minor error (white space, typo).
> 
> Signed-off-by: Shirley Ma <xma@us.ibm.com>

My concern is that this can delay signalling for an unlimited time.
Could you pls test this with guests that do not have
2b5bbe3b8bee8b38bdc27dd9c0270829b6eb7eeb
b0c39dbdc204006ef3558a66716ff09797619778
that is 2.6.31 and older?

This seems to be slightly out of spec, even though
for TX, signals are less important.

Two ideas:
1. How about writing out used, just delaying the signal?
   This way we don't have to queue separately.
2. How about flushing out queued stuff before we exit
   the handle_tx loop? That would address most of
   the spec issue.

> ---
>  drivers/vhost/net.c   |   20 +++++++++++++++++++-
>  drivers/vhost/vhost.c |   32 ++++++++++++++++++++++++++++++++
>  drivers/vhost/vhost.h |    3 +++
>  3 files changed, 54 insertions(+), 1 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 4b4da5b..3eb8016 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -128,6 +128,7 @@ static void handle_tx(struct vhost_net *net)
>  	int err, wmem;
>  	size_t hdr_size;
>  	struct socket *sock;
> +	int max_pend = vq->num - (vq->num >> 2);
>  
>  	sock = rcu_dereference_check(vq->private_data,
>  				     lockdep_is_held(&vq->mutex));
> @@ -198,7 +199,24 @@ static void handle_tx(struct vhost_net *net)
>  		if (err != len)
>  			pr_debug("Truncated TX packet: "
>  				 " len %d != %zd\n", err, len);
> -		vhost_add_used_and_signal(&net->dev, vq, head, 0);
> +		/*
> +		 * if no pending buffer size allocate, signal used buffer
> +		 * one by one, otherwise, signal used buffer when reaching
> +		 * 3/4 ring size to reduce CPU utilization.
> +		 */
> +		if (unlikely(vq->pend))
> +			vhost_add_used_and_signal(&net->dev, vq, head, 0);
> +		else {
> +			vq->pend[vq->num_pend].id = head;
> +			vq->pend[vq->num_pend].len = 0;
> +			++vq->num_pend;
> +			if (vq->num_pend == max_pend) {
> +				vhost_add_used_and_signal_n(&net->dev, vq,
> +							    vq->pend,
> +							    vq->num_pend);
> +				vq->num_pend = 0;
> +			}
> +		}
>  		total_len += len;
>  		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
>  			vhost_poll_queue(&vq->poll);
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 94701ff..f2f3288 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -170,6 +170,16 @@ static void vhost_vq_reset(struct vhost_dev *dev,
>  	vq->call_ctx = NULL;
>  	vq->call = NULL;
>  	vq->log_ctx = NULL;
> +	/* signal pending used buffers */
> +	if (vq->pend) {
> +		if (vq->num_pend != 0) {
> +			vhost_add_used_and_signal_n(dev, vq, vq->pend,
> +						    vq->num_pend);
> +			vq->num_pend = 0;
> +		}
> +		kfree(vq->pend);
> +	}
> +	vq->pend = NULL;
>  }
>  
>  static int vhost_worker(void *data)
> @@ -273,7 +283,14 @@ long vhost_dev_init(struct vhost_dev *dev,
>  		dev->vqs[i].heads = NULL;
>  		dev->vqs[i].dev = dev;
>  		mutex_init(&dev->vqs[i].mutex);
> +		dev->vqs[i].num_pend = 0;
> +		dev->vqs[i].pend = NULL;
>  		vhost_vq_reset(dev, dev->vqs + i);
> +		/* signal 3/4 of ring size used buffers */
> +		dev->vqs[i].pend = kmalloc((dev->vqs[i].num -
> +					   (dev->vqs[i].num >> 2)) *
> +					   sizeof *dev->vqs[i].pend,
> +					   GFP_KERNEL);
>  		if (dev->vqs[i].handle_kick)
>  			vhost_poll_init(&dev->vqs[i].poll,
>  					dev->vqs[i].handle_kick, POLLIN, dev);
> @@ -599,6 +616,21 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
>  			r = -EINVAL;
>  			break;
>  		}
> +		if (vq->num != s.num) {
> +			/* signal used buffers first */
> +			if (vq->pend) {
> +				if (vq->num_pend != 0) {
> +					vhost_add_used_and_signal_n(vq->dev, vq,
> +								    vq->pend,
> +								    vq->num_pend);
> +					vq->num_pend = 0;
> +				}
> +				kfree(vq->pend);
> +			}
> +			/* realloc pending used buffers size */
> +			vq->pend = kmalloc((s.num - (s.num >> 2)) *
> +					   sizeof *vq->pend, GFP_KERNEL);
> +		}
>  		vq->num = s.num;
>  		break;
>  	case VHOST_SET_VRING_BASE:
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 073d06a..78949c0 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -108,6 +108,9 @@ struct vhost_virtqueue {
>  	/* Log write descriptors */
>  	void __user *log_base;
>  	struct vhost_log *log;
> +	/* delay multiple used buffers to signal once */
> +	int num_pend;
> +	struct vring_used_elem *pend;
>  };
>  
>  struct vhost_dev {
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH 1/1] vhost: TX used buffer guest signal accumulation
  2010-10-28  5:20   ` Michael S. Tsirkin
@ 2010-10-28 15:24     ` Shirley Ma
  2010-10-28 17:14     ` Shirley Ma
  2010-10-28 19:32     ` Shirley Ma
  2 siblings, 0 replies; 20+ messages in thread
From: Shirley Ma @ 2010-10-28 15:24 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: David Miller, netdev, kvm, linux-kernel

On Thu, 2010-10-28 at 07:20 +0200, Michael S. Tsirkin wrote:
> My concern is this can delay signalling for unlimited time.
> Could you pls test this with guests that do not have
> 2b5bbe3b8bee8b38bdc27dd9c0270829b6eb7eeb
> b0c39dbdc204006ef3558a66716ff09797619778
> that is 2.6.31 and older?

I will test it out.

> This seems to be slighltly out of spec, even though
> for TX, signals are less important.
> Two ideas:
> 1. How about writing out used, just delaying the signal?
>    This way we don't have to queue separately.
> 2. How about flushing out queued stuff before we exit
>    the handle_tx loop? That would address most of
>    the spec issue. 

I will modify the patch to test out the performance for both 1 & 2
approaches.

Thanks
Shirley




^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH 1/1] vhost: TX used buffer guest signal accumulation
  2010-10-28  5:20   ` Michael S. Tsirkin
  2010-10-28 15:24     ` Shirley Ma
@ 2010-10-28 17:14     ` Shirley Ma
  2010-10-29  8:10       ` Michael S. Tsirkin
  2010-10-28 19:32     ` Shirley Ma
  2 siblings, 1 reply; 20+ messages in thread
From: Shirley Ma @ 2010-10-28 17:14 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: David Miller, netdev, kvm, linux-kernel


> Two ideas:
> 1. How about writing out used, just delaying the signal?
>    This way we don't have to queue separately.

This improves performance somewhat, but not as much as delaying
both used and signal. Delaying used buffers combines multiple small
copies into one large copy, which saves more CPU utilization and
increases some BW.

> 2. How about flushing out queued stuff before we exit
>    the handle_tx loop? That would address most of
>    the spec issue. 

The performance is almost as same as the previous patch. I will resubmit
the modified one, adding vhost_add_used_and_signal_n after handle_tx
loop for processing pending queue.

This patch was a part of modified macvtap zero copy which I haven't
submitted yet. I found this helped vhost TX in general. This pending
queue will be used by DMA done later, so I put it in vq instead of a
local variable in handle_tx.

Thanks
Shirley


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH 1/1] vhost: TX used buffer guest signal accumulation
  2010-10-28  5:20   ` Michael S. Tsirkin
  2010-10-28 15:24     ` Shirley Ma
  2010-10-28 17:14     ` Shirley Ma
@ 2010-10-28 19:32     ` Shirley Ma
  2010-10-28 20:13       ` Shirley Ma
  2010-10-29  8:03       ` Michael S. Tsirkin
  2 siblings, 2 replies; 20+ messages in thread
From: Shirley Ma @ 2010-10-28 19:32 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: David Miller, netdev, kvm, linux-kernel

On Thu, 2010-10-28 at 07:20 +0200, Michael S. Tsirkin wrote:
> My concern is this can delay signalling for unlimited time.
> Could you pls test this with guests that do not have
> 2b5bbe3b8bee8b38bdc27dd9c0270829b6eb7eeb
> b0c39dbdc204006ef3558a66716ff09797619778
> that is 2.6.31 and older? 

The patch only delays signaling for an unlimited time when there is no
TX packet to transmit. I thought TX signaling only notifies the guest to
release the used buffers; is there anything else besides this?

I tested rhel5u5 guest (2.6.18 kernel), it works fine. I checked the two
commits log, I don't think this patch could cause any issue w/o these
two patches.

Also I found a big TX regression for old guest and new guest. For old
guest, I am able to get almost 11Gb/s for 2K message size, but for the
new guest kernel, I can only get 3.5 Gb/s with the patch and same host.
I will dig it why.

thanks
Shirley


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH 1/1] vhost: TX used buffer guest signal accumulation
  2010-10-28 19:32     ` Shirley Ma
@ 2010-10-28 20:13       ` Shirley Ma
  2010-10-28 21:04         ` Sridhar Samudrala
  2010-10-29  8:12         ` Michael S. Tsirkin
  2010-10-29  8:03       ` Michael S. Tsirkin
  1 sibling, 2 replies; 20+ messages in thread
From: Shirley Ma @ 2010-10-28 20:13 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: David Miller, netdev, kvm, linux-kernel

On Thu, 2010-10-28 at 12:32 -0700, Shirley Ma wrote:
> Also I found a big TX regression for old guest and new guest. For old
> guest, I am able to get almost 11Gb/s for 2K message size, but for the
> new guest kernel, I can only get 3.5 Gb/s with the patch and same
> host.
> I will dig it why. 

The regression is from the guest kernel, not from this patch. I tested the
2.6.31 kernel; its performance is less than 2Gb/s for 2K message size already.
I will resubmit the patch for review.

I will start testing from the 2.6.30 kernel to figure out when the TX
regression was introduced in virtio_net. Any suggestion on which guest
kernels I should test to pinpoint this regression?

Thanks
Shirley


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH 1/1] vhost: TX used buffer guest signal accumulation
  2010-10-28 20:13       ` Shirley Ma
@ 2010-10-28 21:04         ` Sridhar Samudrala
  2010-10-28 21:40           ` Shirley Ma
  2010-10-29  8:12         ` Michael S. Tsirkin
  1 sibling, 1 reply; 20+ messages in thread
From: Sridhar Samudrala @ 2010-10-28 21:04 UTC (permalink / raw)
  To: Shirley Ma; +Cc: Michael S. Tsirkin, David Miller, netdev, kvm, linux-kernel

On Thu, 2010-10-28 at 13:13 -0700, Shirley Ma wrote:
> On Thu, 2010-10-28 at 12:32 -0700, Shirley Ma wrote:
> > Also I found a big TX regression for old guest and new guest. For old
> > guest, I am able to get almost 11Gb/s for 2K message size, but for the
> > new guest kernel, I can only get 3.5 Gb/s with the patch and same
> > host.
> > I will dig it why. 
> 
> The regression is from guest kernel, not from this patch. Tested 2.6.31
> kernel, it's performance is less than 2Gb/s for 2K message size already.
> I will resubmit the patch for review. 
> 
> I will start to test from 2.6.30 kernel to figure it when TX regression
> induced in virtio_net. Any suggestion which guest kernel I should test
> to figure out this regression?

It would be some change in virtio-net driver that may have improved the
latency of small messages which in turn would have reduced the bandwidth
as TCP could not accumulate and send large packets.

Thanks
Sridhar


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH 1/1] vhost: TX used buffer guest signal accumulation
  2010-10-28 21:04         ` Sridhar Samudrala
@ 2010-10-28 21:40           ` Shirley Ma
  2010-10-29  8:11             ` Michael S. Tsirkin
  0 siblings, 1 reply; 20+ messages in thread
From: Shirley Ma @ 2010-10-28 21:40 UTC (permalink / raw)
  To: Sridhar Samudrala
  Cc: Michael S. Tsirkin, David Miller, netdev, kvm, linux-kernel

On Thu, 2010-10-28 at 14:04 -0700, Sridhar Samudrala wrote:
> It would be some change in virtio-net driver that may have improved
> the
> latency of small messages which in turn would have reduced the
> bandwidth
> as TCP could not accumulate and send large packets.

I will check out any latency improvement patches in virtio_net. If that's
the case, would it be good to have some tunable parameter to benefit
both BW and latency workloads?

Shirley 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH 1/1] vhost: TX used buffer guest signal accumulation
  2010-10-28 19:32     ` Shirley Ma
  2010-10-28 20:13       ` Shirley Ma
@ 2010-10-29  8:03       ` Michael S. Tsirkin
  1 sibling, 0 replies; 20+ messages in thread
From: Michael S. Tsirkin @ 2010-10-29  8:03 UTC (permalink / raw)
  To: Shirley Ma; +Cc: David Miller, netdev, kvm, linux-kernel

On Thu, Oct 28, 2010 at 12:32:35PM -0700, Shirley Ma wrote:
> On Thu, 2010-10-28 at 07:20 +0200, Michael S. Tsirkin wrote:
> > My concern is this can delay signalling for unlimited time.
> > Could you pls test this with guests that do not have
> > 2b5bbe3b8bee8b38bdc27dd9c0270829b6eb7eeb
> > b0c39dbdc204006ef3558a66716ff09797619778
> > that is 2.6.31 and older? 
> 
> The patch only induces delay signaling unlimited time when there is no
> TX packet to transmit. I thought TX signaling only noticing guest to
> release the used buffers, anything else beside this?

Right, that's it I think. For newer kernels we orphan the skb
on xmit so we don't care that much about completing them.
This does rely on an undocumented assumption about guest
behaviour though.

> I tested rhel5u5 guest (2.6.18 kernel), it works fine. I checked the two
> commits log, I don't think this patch could cause any issue w/o these
> two patches.
> 
> Also I found a big TX regression for old guest and new guest. For old
> guest, I am able to get almost 11Gb/s for 2K message size, but for the
> new guest kernel, I can only get 3.5 Gb/s with the patch and same host.
> I will dig it why.
> 
> thanks
> Shirley

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH 1/1] vhost: TX used buffer guest signal accumulation
  2010-10-28 17:14     ` Shirley Ma
@ 2010-10-29  8:10       ` Michael S. Tsirkin
  2010-10-29 15:43         ` Shirley Ma
  0 siblings, 1 reply; 20+ messages in thread
From: Michael S. Tsirkin @ 2010-10-29  8:10 UTC (permalink / raw)
  To: Shirley Ma; +Cc: David Miller, netdev, kvm, linux-kernel

On Thu, Oct 28, 2010 at 10:14:22AM -0700, Shirley Ma wrote:
> 
> > Two ideas:
> > 1. How about writing out used, just delaying the signal?
> >    This way we don't have to queue separately.
> 
> This improves some performance, but not as good as delaying
> both used and signal. Since delaying used buffers combining
> multiple small copies to a large copy, which saves more CPU
> utilization and increased some BW.

Hmm. I don't yet understand. We are still doing copies into the per-vq
buffer, and the data copied is really small.  Is it about cache line
bounces?  Could you try figuring it out?

> > 2. How about flushing out queued stuff before we exit
> >    the handle_tx loop? That would address most of
> >    the spec issue. 
> 
> The performance is almost as same as the previous patch. I will resubmit
> the modified one, adding vhost_add_used_and_signal_n after handle_tx
> loop for processing pending queue.
> 
> This patch was a part of modified macvtap zero copy which I haven't
> submitted yet. I found this helped vhost TX in general. This pending
> queue will be used by DMA done later, so I put it in vq instead of a
> local variable in handle_tx.
> 
> Thanks
> Shirley

BTW why do we need another array? Isn't heads field exactly what we need
here?


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH 1/1] vhost: TX used buffer guest signal accumulation
  2010-10-28 21:40           ` Shirley Ma
@ 2010-10-29  8:11             ` Michael S. Tsirkin
  0 siblings, 0 replies; 20+ messages in thread
From: Michael S. Tsirkin @ 2010-10-29  8:11 UTC (permalink / raw)
  To: Shirley Ma; +Cc: Sridhar Samudrala, David Miller, netdev, kvm, linux-kernel

On Thu, Oct 28, 2010 at 02:40:50PM -0700, Shirley Ma wrote:
> On Thu, 2010-10-28 at 14:04 -0700, Sridhar Samudrala wrote:
> > It would be some change in virtio-net driver that may have improved
> > the
> > latency of small messages which in turn would have reduced the
> > bandwidth
> > as TCP could not accumulate and send large packets.
> 
> I will check out any latency improvement patch in virtio_net. If that's
> the case, whether it is good to have some tunable parameter to benefit
> both BW and latency workload?
> 
> Shirley 

No, we need it to work well automatically somehow.

-- 
MST

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH 1/1] vhost: TX used buffer guest signal accumulation
  2010-10-28 20:13       ` Shirley Ma
  2010-10-28 21:04         ` Sridhar Samudrala
@ 2010-10-29  8:12         ` Michael S. Tsirkin
  1 sibling, 0 replies; 20+ messages in thread
From: Michael S. Tsirkin @ 2010-10-29  8:12 UTC (permalink / raw)
  To: Shirley Ma; +Cc: David Miller, netdev, kvm, linux-kernel

On Thu, Oct 28, 2010 at 01:13:55PM -0700, Shirley Ma wrote:
> On Thu, 2010-10-28 at 12:32 -0700, Shirley Ma wrote:
> > Also I found a big TX regression for old guest and new guest. For old
> > guest, I am able to get almost 11Gb/s for 2K message size, but for the
> > new guest kernel, I can only get 3.5 Gb/s with the patch and same
> > host.
> > I will dig it why. 
> 
> The regression is from guest kernel, not from this patch. Tested 2.6.31
> kernel, it's performance is less than 2Gb/s for 2K message size already.
> I will resubmit the patch for review. 
> 
> I will start to test from 2.6.30 kernel to figure it when TX regression
> induced in virtio_net. Any suggestion which guest kernel I should test
> to figure out this regression?
> 
> Thanks
> Shirley

git bisect 2.6.31 2.6.30
and go from here.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH 1/1] vhost: TX used buffer guest signal accumulation
  2010-10-29  8:10       ` Michael S. Tsirkin
@ 2010-10-29 15:43         ` Shirley Ma
  2010-10-30 20:06           ` Michael S. Tsirkin
  0 siblings, 1 reply; 20+ messages in thread
From: Shirley Ma @ 2010-10-29 15:43 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: David Miller, netdev, kvm, linux-kernel

On Fri, 2010-10-29 at 10:10 +0200, Michael S. Tsirkin wrote:
> Hmm. I don't yet understand. We are still doing copies into the per-vq
> buffer, and the data copied is really small.  Is it about cache line
> bounces?  Could you try figuring it out?

A per-vq buffer is much less expensive than 3 put_copy() calls. I will
collect the profiling data to show that.

> > > 2. How about flushing out queued stuff before we exit
> > >    the handle_tx loop? That would address most of
> > >    the spec issue. 
> > 
> > The performance is almost as same as the previous patch. I will
> resubmit
> > the modified one, adding vhost_add_used_and_signal_n after handle_tx
> > loop for processing pending queue.
> > 
> > This patch was a part of modified macvtap zero copy which I haven't
> > submitted yet. I found this helped vhost TX in general. This pending
> > queue will be used by DMA done later, so I put it in vq instead of a
> > local variable in handle_tx.
> > 
> > Thanks
> > Shirley
> 
> BTW why do we need another array? Isn't heads field exactly what we
> need
> here?

The head field is only for up to 32; the more used buffers that are added
and signaled together, the better the performance is in the test results.
That was one of the reasons I didn't use heads. The other reason was that
I used these buffers for pending DMA-done in the macvtap zero copy patch.
It could be up to vq->num in the worst case.

Thanks
Shirley


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH 1/1] vhost: TX used buffer guest signal accumulation
  2010-10-29 15:43         ` Shirley Ma
@ 2010-10-30 20:06           ` Michael S. Tsirkin
  2010-11-01 20:17             ` Shirley Ma
  0 siblings, 1 reply; 20+ messages in thread
From: Michael S. Tsirkin @ 2010-10-30 20:06 UTC (permalink / raw)
  To: Shirley Ma; +Cc: David Miller, netdev, kvm, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 2192 bytes --]

On Fri, Oct 29, 2010 at 08:43:08AM -0700, Shirley Ma wrote:
> On Fri, 2010-10-29 at 10:10 +0200, Michael S. Tsirkin wrote:
> > Hmm. I don't yet understand. We are still doing copies into the per-vq
> > buffer, and the data copied is really small.  Is it about cache line
> > bounces?  Could you try figuring it out?
> 
> per-vq buffer is much less expensive than 3 put_copy() call. I will
> collect the profiling data to show that.

What about __put_user? Maybe the access checks are the ones
that add the cost here? I attach patches to strip access checks:
they are not needed as we do them on setup time already, anyway.
Can you try them out and see if performance is improved for you please?
On top of this, we will need to add some scheme to accumulate signals,
but that is a separate issue.

> > > > 2. How about flushing out queued stuff before we exit
> > > >    the handle_tx loop? That would address most of
> > > >    the spec issue. 
> > > 
> > > The performance is almost as same as the previous patch. I will
> > resubmit
> > > the modified one, adding vhost_add_used_and_signal_n after handle_tx
> > > loop for processing pending queue.
> > > 
> > > This patch was a part of modified macvtap zero copy which I haven't
> > > submitted yet. I found this helped vhost TX in general. This pending
> > > queue will be used by DMA done later, so I put it in vq instead of a
> > > local variable in handle_tx.
> > > 
> > > Thanks
> > > Shirley
> > 
> > BTW why do we need another array? Isn't heads field exactly what we
> > need
> > here?
> 
> head field is only for up to 32, the more used buffers add and signal
> accumulated the better performance is from test results.

I think we should separate the used update and signalling.  Interrupts
are expensive so I can believe accumulating even up to 100 of them
helps. But used head copies are already prety cheap. If we cut the
overhead by x32, that should make them almost free?

> That's was one
> of the reason I didn't use heads. The other reason was I used these
> buffer for pending dma done in mavctap zero copy patch. It could be up
> to vq->num in worse case.

We can always increase that, not an issue.

> Thanks
> Shirley

[-- Attachment #2: 1 --]
[-- Type: text/plain, Size: 809 bytes --]

commit f983547ed65a90c73b3e5eeee707041cc04acb0e
Author: Michael S. Tsirkin <mst@redhat.com>
Date:   Tue Sep 21 14:18:01 2010 +0200

    vhost: copy_to_user -> __copy_to_user
    
    We do access_ok checks at setup time, so we don't need to
    redo them on each access.
    
    Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 54ba561..5109bc0 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1245,7 +1245,7 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq,
 
 	start = vq->last_used_idx % vq->num;
 	used = vq->used->ring + start;
-	if (copy_to_user(used, heads, count * sizeof *used)) {
+	if (__copy_to_user(used, heads, count * sizeof *used)) {
 		vq_err(vq, "Failed to write used");
 		return -EFAULT;
 	}

[-- Attachment #3: 2 --]
[-- Type: text/plain, Size: 2889 bytes --]

commit 85337783cb9487246ed067592e50a5ee800e2683
Author: Michael S. Tsirkin <mst@redhat.com>
Date:   Sun Sep 19 15:56:30 2010 +0200

    vhost: get/put_user -> __get/__put_user
    
    We do access_ok checks on all ring values on an ioctl,
    so we don't need to redo them on each access.
    
    Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 3440197..54ba561 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1082,7 +1082,7 @@ int vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
 
 	/* Check it isn't doing very strange things with descriptor numbers. */
 	last_avail_idx = vq->last_avail_idx;
-	if (unlikely(get_user(vq->avail_idx, &vq->avail->idx))) {
+	if (unlikely(__get_user(vq->avail_idx, &vq->avail->idx))) {
 		vq_err(vq, "Failed to access avail idx at %p\n",
 		       &vq->avail->idx);
 		return -EFAULT;
@@ -1103,8 +1103,8 @@ int vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
 
 	/* Grab the next descriptor number they're advertising, and increment
 	 * the index we've seen. */
-	if (unlikely(get_user(head,
-			      &vq->avail->ring[last_avail_idx % vq->num]))) {
+	if (unlikely(__get_user(head,
+				&vq->avail->ring[last_avail_idx % vq->num]))) {
 		vq_err(vq, "Failed to read head: idx %d address %p\n",
 		       last_avail_idx,
 		       &vq->avail->ring[last_avail_idx % vq->num]);
@@ -1203,17 +1203,17 @@ int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
 	/* The virtqueue contains a ring of used buffers.  Get a pointer to the
 	 * next entry in that used ring. */
 	used = &vq->used->ring[vq->last_used_idx % vq->num];
-	if (put_user(head, &used->id)) {
+	if (__put_user(head, &used->id)) {
 		vq_err(vq, "Failed to write used id");
 		return -EFAULT;
 	}
-	if (put_user(len, &used->len)) {
+	if (__put_user(len, &used->len)) {
 		vq_err(vq, "Failed to write used len");
 		return -EFAULT;
 	}
 	/* Make sure buffer is written before we update index. */
 	smp_wmb();
-	if (put_user(vq->last_used_idx + 1, &vq->used->idx)) {
+	if (__put_user(vq->last_used_idx + 1, &vq->used->idx)) {
 		vq_err(vq, "Failed to increment used idx");
 		return -EFAULT;
 	}
@@ -1306,7 +1306,7 @@ void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 	 * interrupts. */
 	smp_mb();
 
-	if (get_user(flags, &vq->avail->flags)) {
+	if (__get_user(flags, &vq->avail->flags)) {
 		vq_err(vq, "Failed to get flags");
 		return;
 	}
@@ -1357,7 +1357,7 @@ bool vhost_enable_notify(struct vhost_virtqueue *vq)
 	/* They could have slipped one in as we were doing that: make
 	 * sure it's written, then check again. */
 	smp_mb();
-	r = get_user(avail_idx, &vq->avail->idx);
+	r = __get_user(avail_idx, &vq->avail->idx);
 	if (r) {
 		vq_err(vq, "Failed to check avail idx at %p: %d\n",
 		       &vq->avail->idx, r);

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH 1/1] vhost: TX used buffer guest signal accumulation
  2010-10-30 20:06           ` Michael S. Tsirkin
@ 2010-11-01 20:17             ` Shirley Ma
  2010-11-03 10:48               ` Michael S. Tsirkin
  0 siblings, 1 reply; 20+ messages in thread
From: Shirley Ma @ 2010-11-01 20:17 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: David Miller, netdev, kvm, linux-kernel

On Sat, 2010-10-30 at 22:06 +0200, Michael S. Tsirkin wrote:
> On Fri, Oct 29, 2010 at 08:43:08AM -0700, Shirley Ma wrote:
> > On Fri, 2010-10-29 at 10:10 +0200, Michael S. Tsirkin wrote:
> > > Hmm. I don't yet understand. We are still doing copies into the
> per-vq
> > > buffer, and the data copied is really small.  Is it about cache
> line
> > > bounces?  Could you try figuring it out?
> > 
> > per-vq buffer is much less expensive than 3 put_copy() call. I will
> > collect the profiling data to show that.
> 
> What about __put_user? Maybe the access checks are the ones
> that add the cost here? I attach patches to strip access checks:
> they are not needed as we do them on setup time already, anyway.
> Can you try them out and see if performance is improved for you
> please?
> On top of this, we will need to add some scheme to accumulate signals,
> but that is a separate issue.

Yes, moving from put_user/get_user to __put_user/__get_user does improve
the performance by removing the checking.

My concern here is whether checking only in set up would be sufficient
for security? Would be there is a case guest could corrupt the ring
later? If not, that's OK.

> > > > > 2. How about flushing out queued stuff before we exit
> > > > >    the handle_tx loop? That would address most of
> > > > >    the spec issue. 
> > > > 
> > > > The performance is almost as same as the previous patch. I will
> > > resubmit
> > > > the modified one, adding vhost_add_used_and_signal_n after
> handle_tx
> > > > loop for processing pending queue.
> > > > 
> > > > This patch was a part of modified macvtap zero copy which I
> haven't
> > > > submitted yet. I found this helped vhost TX in general. This
> pending
> > > > queue will be used by DMA done later, so I put it in vq instead
> of a
> > > > local variable in handle_tx.
> > > > 
> > > > Thanks
> > > > Shirley
> > > 
> > > BTW why do we need another array? Isn't heads field exactly what
> we
> > > need
> > > here?
> > 
> > head field is only for up to 32, the more used buffers add and
> signal
> > accumulated the better performance is from test results.
> 
> I think we should separate the used update and signalling.  Interrupts
> are expensive so I can believe accumulating even up to 100 of them
> helps. But used head copies are already pretty cheap. If we cut the
> overhead by x32, that should make them almost free?

I can separate the used update and signaling to see the best
performance.

> > That was one
> > of the reasons I didn't use heads. The other reason was I used these
> > buffer for pending dma done in macvtap zero copy patch. It could be
> up
> > to vq->num in worse case.
> 
> We can always increase that, not an issue. 

Good, I will change heads up to vq->num and use it.

Thanks
Shirley


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH 1/1] vhost: TX used buffer guest signal accumulation
  2010-11-01 20:17             ` Shirley Ma
@ 2010-11-03 10:48               ` Michael S. Tsirkin
  2010-11-04  5:38                 ` Shirley Ma
  0 siblings, 1 reply; 20+ messages in thread
From: Michael S. Tsirkin @ 2010-11-03 10:48 UTC (permalink / raw)
  To: Shirley Ma; +Cc: David Miller, netdev, kvm, linux-kernel

On Mon, Nov 01, 2010 at 01:17:53PM -0700, Shirley Ma wrote:
> On Sat, 2010-10-30 at 22:06 +0200, Michael S. Tsirkin wrote:
> > On Fri, Oct 29, 2010 at 08:43:08AM -0700, Shirley Ma wrote:
> > > On Fri, 2010-10-29 at 10:10 +0200, Michael S. Tsirkin wrote:
> > > > Hmm. I don't yet understand. We are still doing copies into the
> > per-vq
> > > > buffer, and the data copied is really small.  Is it about cache
> > line
> > > > bounces?  Could you try figuring it out?
> > > 
> > > per-vq buffer is much less expensive than 3 put_copy() call. I will
> > > collect the profiling data to show that.
> > 
> > What about __put_user? Maybe the access checks are the ones
> > that add the cost here? I attach patches to strip access checks:
> > they are not needed as we do them on setup time already, anyway.
> > Can you try them out and see if performance is improved for you
> > please?
> > On top of this, we will need to add some scheme to accumulate signals,
> > but that is a separate issue.
> 
> Yes, moving from put_user/get_user to __put_user/__get_user does improve
> the performance by removing the checking.

I mean in practice, you see a benefit from this patch?

> My concern here is whether checking only in set up would be sufficient
> for security?

It better be sufficient because the checks that put_user does
are not effective when run from the kernel thread, anyway.

> Would be there is a case guest could corrupt the ring
> later? If not, that's OK.

You mean change the pointer after it's checked?
If you see such a case, please holler.

> > > > > > 2. How about flushing out queued stuff before we exit
> > > > > >    the handle_tx loop? That would address most of
> > > > > >    the spec issue. 
> > > > > 
> > > > > The performance is almost as same as the previous patch. I will
> > > > resubmit
> > > > > the modified one, adding vhost_add_used_and_signal_n after
> > handle_tx
> > > > > loop for processing pending queue.
> > > > > 
> > > > > This patch was a part of modified macvtap zero copy which I
> > haven't
> > > > > submitted yet. I found this helped vhost TX in general. This
> > pending
> > > > > queue will be used by DMA done later, so I put it in vq instead
> > of a
> > > > > local variable in handle_tx.
> > > > > 
> > > > > Thanks
> > > > > Shirley
> > > > 
> > > > BTW why do we need another array? Isn't heads field exactly what
> > we
> > > > need
> > > > here?
> > > 
> > > head field is only for up to 32, the more used buffers add and
> > signal
> > > accumulated the better performance is from test results.
> > 
> > I think we should separate the used update and signalling.  Interrupts
> > are expensive so I can believe accumulating even up to 100 of them
> > helps. But used head copies are already prety cheap. If we cut the
> > overhead by x32, that should make them almost free?
> 
> I can separate the used update and signaling to see the best
> performance.
> 
> > > That was one
> > > of the reasons I didn't use heads. The other reason was I used these
> > > buffer for pending dma done in macvtap zero copy patch. It could be
> > up
> > > to vq->num in worse case.
> > 
> > We can always increase that, not an issue. 
> 
> Good, I will change heads up to vq->num and use it.
> 
> Thanks
> Shirley

To clarify: the combination of __put_user and separate
signalling is giving the same performance benefit as your
patch?

I am mostly concerned with adding code that seems to help
speed for reasons we don't completely understand, because
then we might break the optimization easily without noticing.

-- 
MST

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH 1/1] vhost: TX used buffer guest signal accumulation
  2010-11-03 10:48               ` Michael S. Tsirkin
@ 2010-11-04  5:38                 ` Shirley Ma
  2010-11-04  9:30                   ` Michael S. Tsirkin
  0 siblings, 1 reply; 20+ messages in thread
From: Shirley Ma @ 2010-11-04  5:38 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: David Miller, netdev, kvm, linux-kernel

On Wed, 2010-11-03 at 12:48 +0200, Michael S. Tsirkin wrote:
> I mean in practice, you see a benefit from this patch?

Yes, I tested it. It does benefit the performance.

> > My concern here is whether checking only in set up would be
> sufficient
> > for security?
> 
> It better be sufficient because the checks that put_user does
> are not effective when run from the kernel thread, anyway.
> 
> > Would be there is a case guest could corrupt the ring
> > later? If not, that's OK.
> 
> You mean change the pointer after it's checked?
> If you see such a case, please holler.

I wonder about it, not a such case in mind.

> To clarify: the combination of __put_user and separate
> signalling is giving the same performance benefit as your
> patch?

Yes, it has similar performance, but I haven't finished all message
sizes comparison yet.

> I am mostly concerned with adding code that seems to help
> speed for reasons we don't completely understand, because
> then we might break the optimization easily without noticing.

I don't think the patch I submitted would break up anything. It just
reduced the cost of per used buffer 3 put_user() calls and guest
signaling from one to one to many to one.

Thanks
Shirley


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH 1/1] vhost: TX used buffer guest signal accumulation
  2010-11-04  5:38                 ` Shirley Ma
@ 2010-11-04  9:30                   ` Michael S. Tsirkin
  2010-11-04 21:37                     ` Shirley Ma
  0 siblings, 1 reply; 20+ messages in thread
From: Michael S. Tsirkin @ 2010-11-04  9:30 UTC (permalink / raw)
  To: Shirley Ma; +Cc: David Miller, netdev, kvm, linux-kernel

On Wed, Nov 03, 2010 at 10:38:46PM -0700, Shirley Ma wrote:
> On Wed, 2010-11-03 at 12:48 +0200, Michael S. Tsirkin wrote:
> > I mean in practice, you see a benefit from this patch?
> 
> Yes, I tested it. It does benefit the performance.
> 
> > > My concern here is whether checking only in set up would be
> > sufficient
> > > for security?
> > 
> > It better be sufficient because the checks that put_user does
> > are not effective when run from the kernel thread, anyway.
> > 
> > > Would be there is a case guest could corrupt the ring
> > > later? If not, that's OK.
> > 
> > You mean change the pointer after it's checked?
> > If you see such a case, please holler.
> 
> I wonder about it, not a such case in mind.
> 
> > To clarify: the combination of __put_user and separate
> > signalling is giving the same performance benefit as your
> > patch?
> 
> Yes, it has similar performance, but I haven't finished all message
> sizes comparison yet.
> 
> > I am mostly concerned with adding code that seems to help
> > speed for reasons we don't completely understand, because
> > then we might break the optimization easily without noticing.
> 
> I don't think the patch I submitted would break up anything.

No, I just meant that when a patch gives some benefit, I'd like
to understand where the benefit comes from so that I don't
break it later.

> It just
> reduced the cost of per used buffer 3 put_user() calls and guest
> signaling from one to one to many to one.

One thing to note is that deferred signalling needs to be
benchmarked with old guests which don't orphan skbs on xmit
(or disable orphaning in both networking stack and virtio-net).

> 
> Thanks
> Shirley

OK, so I guess I'll queue the __put_user etc patches for net-next, on top of this
I think a patch which defers signalling would be nice to have,
then we can figure out whether a separate heads array still has benefits
for non zero copy case: if yes what they are, if no whether it should be
used for zero copy only or for both non-zero copy and zero copy.

Makes sense?

-- 
MST

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH 1/1] vhost: TX used buffer guest signal accumulation
  2010-11-04  9:30                   ` Michael S. Tsirkin
@ 2010-11-04 21:37                     ` Shirley Ma
  0 siblings, 0 replies; 20+ messages in thread
From: Shirley Ma @ 2010-11-04 21:37 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: David Miller, netdev, kvm, linux-kernel

On Thu, 2010-11-04 at 11:30 +0200, Michael S. Tsirkin wrote:
> One thing to note is that deferred signalling needs to be
> benchmarked with old guests which don't orphan skbs on xmit
> (or disable orphaning in both networking stack and virtio-net).

Yes, we need run more test.

> 
> OK, so I guess I'll queue the __put_user etc patches for net-next, on
> top of this
> I think a patch which defers signalling would be nice to have,
> then we can figure out whether a separate heads array still has
> benefits
> for non zero copy case: if yes what they are, if no whether it should
> be
> used for zero copy only or for both non-zero copy and zero copy.
> 
> Makes sense? 

Agree.

Shirley

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2010-11-04 21:37 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-10-27 21:58 [RFC PATCH 1/1] vhost: TX used buffer guest signal accumulation Shirley Ma
2010-10-28  4:40 ` Shirley Ma
2010-10-28  5:20   ` Michael S. Tsirkin
2010-10-28 15:24     ` Shirley Ma
2010-10-28 17:14     ` Shirley Ma
2010-10-29  8:10       ` Michael S. Tsirkin
2010-10-29 15:43         ` Shirley Ma
2010-10-30 20:06           ` Michael S. Tsirkin
2010-11-01 20:17             ` Shirley Ma
2010-11-03 10:48               ` Michael S. Tsirkin
2010-11-04  5:38                 ` Shirley Ma
2010-11-04  9:30                   ` Michael S. Tsirkin
2010-11-04 21:37                     ` Shirley Ma
2010-10-28 19:32     ` Shirley Ma
2010-10-28 20:13       ` Shirley Ma
2010-10-28 21:04         ` Sridhar Samudrala
2010-10-28 21:40           ` Shirley Ma
2010-10-29  8:11             ` Michael S. Tsirkin
2010-10-29  8:12         ` Michael S. Tsirkin
2010-10-29  8:03       ` Michael S. Tsirkin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).