netdev.vger.kernel.org archive mirror
* Re: [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
@ 2010-10-06 13:34 Michael S. Tsirkin
  2010-10-06 13:34 ` [PATCH 1/2] vhost: put mm after thread stop Michael S. Tsirkin
                   ` (3 more replies)
  0 siblings, 4 replies; 11+ messages in thread
From: Michael S. Tsirkin @ 2010-10-06 13:34 UTC (permalink / raw)
  To: Krishna Kumar; +Cc: rusty, davem, kvm, arnd, netdev, avi, anthony

On Fri, Sep 17, 2010 at 03:33:07PM +0530, Krishna Kumar wrote:
> For 1 TCP netperf, I ran 7 iterations and summed it. Explanation
> for degradation for 1 stream case:

I thought about possible RX/TX contention reasons, and I realized that
we get/put the mm counter all the time.  So I wrote the following: I
haven't seen any performance gain from this in the single-queue case,
but maybe it will help multiqueue?

Thanks,

Michael S. Tsirkin (2):
  vhost: put mm after thread stop
  vhost-net: batch use/unuse mm

 drivers/vhost/net.c   |    7 -------
 drivers/vhost/vhost.c |   16 ++++++++++------
 2 files changed, 10 insertions(+), 13 deletions(-)

-- 
1.7.3-rc1

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 1/2] vhost: put mm after thread stop
  2010-10-06 13:34 [v2 RFC PATCH 0/4] Implement multiqueue virtio-net Michael S. Tsirkin
@ 2010-10-06 13:34 ` Michael S. Tsirkin
  2010-10-06 13:34 ` [PATCH 2/2] vhost-net: batch use/unuse mm Michael S. Tsirkin
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 11+ messages in thread
From: Michael S. Tsirkin @ 2010-10-06 13:34 UTC (permalink / raw)
  To: Krishna Kumar; +Cc: rusty, davem, kvm, arnd, netdev, avi, anthony

makes it possible to batch use/unuse mm

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vhost/vhost.c |    9 ++++-----
 1 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 677d112..8b9d474 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -207,7 +207,7 @@ static int vhost_worker(void *data)
 		if (work) {
 			__set_current_state(TASK_RUNNING);
 			work->fn(work);
-			if (n++) {
+			if (dev->nvqs <= ++n) {
 				__set_current_state(TASK_RUNNING);
 				schedule();
 				n = 0;
@@ -409,15 +409,14 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
 	/* No one will access memory at this point */
 	kfree(dev->memory);
 	dev->memory = NULL;
-	if (dev->mm)
-		mmput(dev->mm);
-	dev->mm = NULL;
-
 	WARN_ON(!list_empty(&dev->work_list));
 	if (dev->worker) {
 		kthread_stop(dev->worker);
 		dev->worker = NULL;
 	}
+	if (dev->mm)
+		mmput(dev->mm);
+	dev->mm = NULL;
 }
 
 static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
-- 
1.7.3-rc1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 2/2] vhost-net: batch use/unuse mm
  2010-10-06 13:34 [v2 RFC PATCH 0/4] Implement multiqueue virtio-net Michael S. Tsirkin
  2010-10-06 13:34 ` [PATCH 1/2] vhost: put mm after thread stop Michael S. Tsirkin
@ 2010-10-06 13:34 ` Michael S. Tsirkin
  2010-10-06 17:02 ` [v2 RFC PATCH 0/4] Implement multiqueue virtio-net Krishna Kumar2
  2010-10-11  7:21 ` Krishna Kumar2
  3 siblings, 0 replies; 11+ messages in thread
From: Michael S. Tsirkin @ 2010-10-06 13:34 UTC (permalink / raw)
  To: Krishna Kumar; +Cc: rusty, davem, kvm, arnd, netdev, avi, anthony

Move use/unuse mm to vhost.c, which makes it possible to batch these
operations.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vhost/net.c   |    7 -------
 drivers/vhost/vhost.c |    7 ++++++-
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 271678e..ff02ea4 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -10,7 +10,6 @@
 #include <linux/eventfd.h>
 #include <linux/vhost.h>
 #include <linux/virtio_net.h>
-#include <linux/mmu_context.h>
 #include <linux/miscdevice.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
@@ -136,7 +135,6 @@ static void handle_tx(struct vhost_net *net)
 		return;
 	}
 
-	use_mm(net->dev.mm);
 	mutex_lock(&vq->mutex);
 	vhost_disable_notify(vq);
 
@@ -197,7 +195,6 @@ static void handle_tx(struct vhost_net *net)
 	}
 
 	mutex_unlock(&vq->mutex);
-	unuse_mm(net->dev.mm);
 }
 
 static int peek_head_len(struct sock *sk)
@@ -302,7 +299,6 @@ static void handle_rx_big(struct vhost_net *net)
 	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
 		return;
 
-	use_mm(net->dev.mm);
 	mutex_lock(&vq->mutex);
 	vhost_disable_notify(vq);
 	hdr_size = vq->vhost_hlen;
@@ -381,7 +377,6 @@ static void handle_rx_big(struct vhost_net *net)
 	}
 
 	mutex_unlock(&vq->mutex);
-	unuse_mm(net->dev.mm);
 }
 
 /* Expects to be always run from workqueue - which acts as
@@ -413,7 +408,6 @@ static void handle_rx_mergeable(struct vhost_net *net)
 	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
 		return;
 
-	use_mm(net->dev.mm);
 	mutex_lock(&vq->mutex);
 	vhost_disable_notify(vq);
 	vhost_hlen = vq->vhost_hlen;
@@ -490,7 +484,6 @@ static void handle_rx_mergeable(struct vhost_net *net)
 	}
 
 	mutex_unlock(&vq->mutex);
-	unuse_mm(net->dev.mm);
 }
 
 static void handle_rx(struct vhost_net *net)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 8b9d474..c83d1c2 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -15,6 +15,7 @@
 #include <linux/vhost.h>
 #include <linux/virtio_net.h>
 #include <linux/mm.h>
+#include <linux/mmu_context.h>
 #include <linux/miscdevice.h>
 #include <linux/mutex.h>
 #include <linux/rcupdate.h>
@@ -179,6 +180,8 @@ static int vhost_worker(void *data)
 	unsigned uninitialized_var(seq);
 	int n = 0;
 
+	use_mm(dev->mm);
+
 	for (;;) {
 		/* mb paired w/ kthread_stop */
 		set_current_state(TASK_INTERRUPTIBLE);
@@ -193,7 +196,7 @@ static int vhost_worker(void *data)
 		if (kthread_should_stop()) {
 			spin_unlock_irq(&dev->work_lock);
 			__set_current_state(TASK_RUNNING);
-			return 0;
+			break;
 		}
 		if (!list_empty(&dev->work_list)) {
 			work = list_first_entry(&dev->work_list,
@@ -218,6 +221,8 @@ static int vhost_worker(void *data)
 		}
 
 	}
+	unuse_mm(dev->mm);
+	return 0;
 }
 
 /* Helper to allocate iovec buffers for all vqs. */
-- 
1.7.3-rc1

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
  2010-10-06 13:34 [v2 RFC PATCH 0/4] Implement multiqueue virtio-net Michael S. Tsirkin
  2010-10-06 13:34 ` [PATCH 1/2] vhost: put mm after thread stop Michael S. Tsirkin
  2010-10-06 13:34 ` [PATCH 2/2] vhost-net: batch use/unuse mm Michael S. Tsirkin
@ 2010-10-06 17:02 ` Krishna Kumar2
  2010-10-11  7:21 ` Krishna Kumar2
  3 siblings, 0 replies; 11+ messages in thread
From: Krishna Kumar2 @ 2010-10-06 17:02 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: anthony, arnd, avi, davem, kvm, netdev, rusty

"Michael S. Tsirkin" <mst@redhat.com> wrote on 10/06/2010 07:04:31 PM:

> "Michael S. Tsirkin" <mst@redhat.com>
> 10/06/2010 07:04 PM
>
> To
>
> Krishna Kumar2/India/IBM@IBMIN
>
> cc
>
> rusty@rustcorp.com.au, davem@davemloft.net, kvm@vger.kernel.org,
> arnd@arndb.de, netdev@vger.kernel.org, avi@redhat.com,
anthony@codemonkey.ws
>
> Subject
>
> Re: [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
>
> On Fri, Sep 17, 2010 at 03:33:07PM +0530, Krishna Kumar wrote:
> > For 1 TCP netperf, I ran 7 iterations and summed it. Explanation
> > for degradation for 1 stream case:
>
> I thought about possible RX/TX contention reasons, and I realized that
> we get/put the mm counter all the time.  So I wrote the following: I
> haven't seen any performance gain from this in the single-queue case,
> but maybe it will help multiqueue?

Great! I am on vacation tomorrow, but will test with this patch
tomorrow night.

Thanks,

- KK


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
  2010-10-06 13:34 [v2 RFC PATCH 0/4] Implement multiqueue virtio-net Michael S. Tsirkin
                   ` (2 preceding siblings ...)
  2010-10-06 17:02 ` [v2 RFC PATCH 0/4] Implement multiqueue virtio-net Krishna Kumar2
@ 2010-10-11  7:21 ` Krishna Kumar2
  2010-10-12 17:09   ` Michael S. Tsirkin
  3 siblings, 1 reply; 11+ messages in thread
From: Krishna Kumar2 @ 2010-10-11  7:21 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: anthony, arnd, avi, davem, kvm, netdev, rusty

"Michael S. Tsirkin" <mst@redhat.com> wrote on 10/06/2010 07:04:31 PM:

> On Fri, Sep 17, 2010 at 03:33:07PM +0530, Krishna Kumar wrote:
> > For 1 TCP netperf, I ran 7 iterations and summed it. Explanation
> > for degradation for 1 stream case:
>
> I thought about possible RX/TX contention reasons, and I realized that
> we get/put the mm counter all the time.  So I wrote the following: I
> haven't seen any performance gain from this in the single-queue case,
> but maybe it will help multiqueue?

Sorry for the delay, I was sick the last couple of days. The results
with your patch are (%'s over the original code):

Code               BW%       CPU%       RemoteCPU
MQ     (#txq=16)   31.4%     38.42%     6.41%
MQ+MST (#txq=16)   28.3%     18.9%      -10.77%

The patch helps CPU utilization but didn't help the single-stream
drop.

Thanks,

- KK


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
  2010-10-11  7:21 ` Krishna Kumar2
@ 2010-10-12 17:09   ` Michael S. Tsirkin
  2010-10-14  7:58     ` Krishna Kumar2
  0 siblings, 1 reply; 11+ messages in thread
From: Michael S. Tsirkin @ 2010-10-12 17:09 UTC (permalink / raw)
  To: Krishna Kumar2; +Cc: anthony, arnd, avi, davem, kvm, netdev, rusty

On Mon, Oct 11, 2010 at 12:51:27PM +0530, Krishna Kumar2 wrote:
> "Michael S. Tsirkin" <mst@redhat.com> wrote on 10/06/2010 07:04:31 PM:
> 
> > On Fri, Sep 17, 2010 at 03:33:07PM +0530, Krishna Kumar wrote:
> > > For 1 TCP netperf, I ran 7 iterations and summed it. Explanation
> > > for degradation for 1 stream case:
> >
> > I thought about possible RX/TX contention reasons, and I realized that
> > we get/put the mm counter all the time.  So I wrote the following: I
> > haven't seen any performance gain from this in the single-queue case,
> > but maybe it will help multiqueue?
> 
> Sorry for the delay, I was sick the last couple of days. The results
> with your patch are (%'s over the original code):
> 
> Code               BW%       CPU%       RemoteCPU
> MQ     (#txq=16)   31.4%     38.42%     6.41%
> MQ+MST (#txq=16)   28.3%     18.9%      -10.77%
> 
> The patch helps CPU utilization but didn't help the single-stream
> drop.
> 
> Thanks,

What other shared TX/RX locks are there?  In your setup, is the same
macvtap socket structure used for RX and TX?  If yes, this will create
cacheline bounces, as sk_wmem_alloc/sk_rmem_alloc share a cache line;
there might also be contention on the lock in the sk_sleep waitqueue.
Anything else?
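
Just to illustrate the kind of bounce I have in mind, here is a
standalone userspace sketch (not vhost code; the 64-byte line size,
iteration count and struct layout are assumptions): two threads bump
two counters that either share a cache line, the way
sk_wmem_alloc/sk_rmem_alloc would, or sit on separate lines.

/* build: gcc -O2 -pthread bounce.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <time.h>

#define ITERS 50000000UL

struct shared { atomic_long tx; atomic_long rx; };               /* one line  */
struct padded { atomic_long tx; char pad[64]; atomic_long rx; }; /* two lines */

static void *bump(void *ctr)
{
	unsigned long i;

	for (i = 0; i < ITERS; i++)
		atomic_fetch_add((atomic_long *)ctr, 1);
	return NULL;
}

static void run(const char *name, atomic_long *a, atomic_long *b)
{
	struct timespec t0, t1;
	pthread_t th1, th2;

	clock_gettime(CLOCK_MONOTONIC, &t0);
	pthread_create(&th1, NULL, bump, a);
	pthread_create(&th2, NULL, bump, b);
	pthread_join(th1, NULL);
	pthread_join(th2, NULL);
	clock_gettime(CLOCK_MONOTONIC, &t1);
	printf("%-12s %.2f s\n", name,
	       (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9);
}

int main(void)
{
	static struct shared s;
	static struct padded p;

	run("shared line:", &s.tx, &s.rx);	/* expect this one slower */
	run("split lines:", &p.tx, &p.rx);
	return 0;
}

If the shared-line run is measurably slower for you, the same effect
applies whenever RX and TX hammer one socket's accounting fields.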

-- 
MST

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
  2010-10-12 17:09   ` Michael S. Tsirkin
@ 2010-10-14  7:58     ` Krishna Kumar2
  2010-10-14  8:17       ` Michael S. Tsirkin
  0 siblings, 1 reply; 11+ messages in thread
From: Krishna Kumar2 @ 2010-10-14  7:58 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: anthony, arnd, avi, davem, kvm, netdev, rusty

"Michael S. Tsirkin" <mst@redhat.com> wrote on 10/12/2010 10:39:07 PM:

> > Sorry for the delay, I was sick the last couple of days. The results
> > with your patch are (%'s over the original code):
> >
> > Code               BW%       CPU%       RemoteCPU
> > MQ     (#txq=16)   31.4%     38.42%     6.41%
> > MQ+MST (#txq=16)   28.3%     18.9%      -10.77%
> >
> > The patch helps CPU utilization but didn't help the single-stream
> > drop.
> >
> > Thanks,
>
> > What other shared TX/RX locks are there?  In your setup, is the same
> > macvtap socket structure used for RX and TX?  If yes, this will create
> > cacheline bounces, as sk_wmem_alloc/sk_rmem_alloc share a cache line;
> > there might also be contention on the lock in the sk_sleep waitqueue.
> > Anything else?

The patch does not introduce any locking (in either vhost or virtio-net).
The single-stream drop is due to different vhost threads handling the
RX/TX traffic.

I added a (fuzzy) heuristic to determine if more than one flow
is being used on the device, and if not, use vhost[0] for both
tx and rx (vhost_poll_queue figures this out before waking up
the suitable vhost thread).  Testing shows that single-stream
performance is as good as with the original code.

__________________________________________________________________________
		       #txqs = 2 (#vhosts = 3)
#     BW1     BW2   (%)       CPU1    CPU2 (%)       RCPU1   RCPU2 (%)
__________________________________________________________________________
1     77344   74973 (-3.06)   172     143 (-16.86)   358     324 (-9.49)
2     20924   21107 (.87)     107     103 (-3.73)    220     217 (-1.36)
4     21629   32911 (52.16)   214     391 (82.71)    446     616 (38.11)
8     21678   34359 (58.49)   428     845 (97.42)    892     1286 (44.17)
16    22046   34401 (56.04)   841     1677 (99.40)   1785    2585 (44.81)
24    22396   35117 (56.80)   1272    2447 (92.37)   2667    3863 (44.84)
32    22750   35158 (54.54)   1719    3233 (88.07)   3569    5143 (44.10)
40    23041   35345 (53.40)   2219    3970 (78.90)   4478    6410 (43.14)
48    23209   35219 (51.74)   2707    4685 (73.06)   5386    7684 (42.66)
64    23215   35209 (51.66)   3639    6195 (70.23)   7206    10218 (41.79)
80    23443   35179 (50.06)   4633    7625 (64.58)   9051    12745 (40.81)
96    24006   36108 (50.41)   5635    9096 (61.41)   10864   15283 (40.67)
128   23601   35744 (51.45)   7475    12104 (61.92)  14495   20405 (40.77)
__________________________________________________________________________
SUM:     BW: (37.6)     CPU: (69.0)     RCPU: (41.2)

__________________________________________________________________________
		       #txqs = 8 (#vhosts = 5)
#     BW1     BW2    (%)      CPU1     CPU2 (%)      RCPU1     RCPU2 (%)
__________________________________________________________________________
1     77344   75341 (-2.58)   172     171 (-.58)     358     356 (-.55)
2     20924   26872 (28.42)   107     135 (26.16)    220     262 (19.09)
4     21629   33594 (55.31)   214     394 (84.11)    446     615 (37.89)
8     21678   39714 (83.19)   428     949 (121.72)   892     1358 (52.24)
16    22046   39879 (80.88)   841     1791 (112.96)  1785    2737 (53.33)
24    22396   38436 (71.61)   1272    2111 (65.95)   2667    3453 (29.47)
32    22750   38776 (70.44)   1719    3594 (109.07)  3569    5421 (51.89)
40    23041   38023 (65.02)   2219    4358 (96.39)   4478    6507 (45.31)
48    23209   33811 (45.68)   2707    4047 (49.50)   5386    6222 (15.52)
64    23215   30212 (30.13)   3639    3858 (6.01)    7206    5819 (-19.24)
80    23443   34497 (47.15)   4633    7214 (55.70)   9051    10776 (19.05)
96    24006   30990 (29.09)   5635    5731 (1.70)    10864   8799 (-19.00)
128   23601   29413 (24.62)   7475    7804 (4.40)    14495   11638 (-19.71)
__________________________________________________________________________
SUM:     BW: (40.1)     CPU: (35.7)     RCPU: (4.1)
_______________________________________________________________________________


The SD numbers are also good (same table as before, but SD
instead of CPU):

__________________________________________________________________________
		       #txqs = 2 (#vhosts = 3)
#     BW%       SD1     SD2 (%)        RSD1     RSD2 (%)
__________________________________________________________________________
1     -3.06     5       4 (-20.00)     21       19 (-9.52)
2     .87       6       6 (0)          27       27 (0)
4     52.16     26      32 (23.07)     108      103 (-4.62)
8     58.49     103     146 (41.74)    431      445 (3.24)
16    56.04     407     514 (26.28)    1729     1586 (-8.27)
24    56.80     934     1161 (24.30)   3916     3665 (-6.40)
32    54.54     1668    2160 (29.49)   6925     6872 (-.76)
40    53.40     2655    3317 (24.93)   10712    10707 (-.04)
48    51.74     3920    4486 (14.43)   15598    14715 (-5.66)
64    51.66     7096    8250 (16.26)   28099    27211 (-3.16)
80    50.06     11240   12586 (11.97)  43913    42070 (-4.19)
96    50.41     16342   16976 (3.87)   63017    57048 (-9.47)
128   51.45     29254   32069 (9.62)   113451   108113 (-4.70)
__________________________________________________________________________
SUM:     BW: (37.6)     SD: (10.9)     RSD: (-5.3)

__________________________________________________________________________
		       #txqs = 8 (#vhosts = 5)
#     BW%       SD1     SD2 (%)         RSD1     RSD2 (%)
__________________________________________________________________________
1     -2.58     5       5 (0)           21       21 (0)
2     28.42     6       6 (0)           27       25 (-7.40)
4     55.31     26      32 (23.07)      108      102 (-5.55)
8     83.19     103     128 (24.27)     431      368 (-14.61)
16    80.88     407     593 (45.70)     1729     1814 (4.91)
24    71.61     934     965 (3.31)      3916     3156 (-19.40)
32    70.44     1668    3232 (93.76)    6925     9752 (40.82)
40    65.02     2655    5134 (93.37)    10712    15340 (43.20)
48    45.68     3920    4592 (17.14)    15598    14122 (-9.46)
64    30.13     7096    3928 (-44.64)   28099    11880 (-57.72)
80    47.15     11240   18389 (63.60)   43913    55154 (25.59)
96    29.09     16342   21695 (32.75)   63017    66892 (6.14)
128   24.62     29254   36371 (24.32)   113451   109219 (-3.73)
__________________________________________________________________________
SUM:     BW: (40.1)     SD: (29.0)     RSD: (0)

This approach works nicely for both single and multiple stream.
Does this look good?

Thanks,

- KK


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
  2010-10-14  7:58     ` Krishna Kumar2
@ 2010-10-14  8:17       ` Michael S. Tsirkin
  2010-10-14  9:04         ` Krishna Kumar2
       [not found]         ` <OFEC86A094.39835EBF-ON652577BC.002F9AAF-652577BC.003186B5@LocalDomain>
  0 siblings, 2 replies; 11+ messages in thread
From: Michael S. Tsirkin @ 2010-10-14  8:17 UTC (permalink / raw)
  To: Krishna Kumar2; +Cc: anthony, arnd, avi, davem, kvm, netdev, rusty

On Thu, Oct 14, 2010 at 01:28:58PM +0530, Krishna Kumar2 wrote:
> "Michael S. Tsirkin" <mst@redhat.com> wrote on 10/12/2010 10:39:07 PM:
> 
> > > Sorry for the delay, I was sick the last couple of days. The results
> > > with your patch are (%'s over the original code):
> > >
> > > Code               BW%       CPU%       RemoteCPU
> > > MQ     (#txq=16)   31.4%     38.42%     6.41%
> > > MQ+MST (#txq=16)   28.3%     18.9%      -10.77%
> > >
> > > The patch helps CPU utilization but didn't help the single-stream
> > > drop.
> > >
> > > Thanks,
> >
> > What other shared TX/RX locks are there?  In your setup, is the same
> > macvtap socket structure used for RX and TX?  If yes, this will create
> > cacheline bounces, as sk_wmem_alloc/sk_rmem_alloc share a cache line;
> > there might also be contention on the lock in the sk_sleep waitqueue.
> > Anything else?
> 
> The patch does not introduce any locking (in either vhost or virtio-net).
> The single-stream drop is due to different vhost threads handling the
> RX/TX traffic.
>
> I added a (fuzzy) heuristic to determine if more than one flow
> is being used on the device, and if not, use vhost[0] for both
> tx and rx (vhost_poll_queue figures this out before waking up
> the suitable vhost thread).  Testing shows that single-stream
> performance is as good as with the original code.

...

> This approach works nicely for both single and multiple stream.
> Does this look good?
> 
> Thanks,
> 
> - KK

Yes, but I guess it depends on the heuristic :) What's the logic?

-- 
MST

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
  2010-10-14  8:17       ` Michael S. Tsirkin
@ 2010-10-14  9:04         ` Krishna Kumar2
       [not found]         ` <OFEC86A094.39835EBF-ON652577BC.002F9AAF-652577BC.003186B5@LocalDomain>
  1 sibling, 0 replies; 11+ messages in thread
From: Krishna Kumar2 @ 2010-10-14  9:04 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: anthony, arnd, avi, davem, kvm, netdev, rusty

> "Michael S. Tsirkin" <mst@redhat.com>
> > > What other shared TX/RX locks are there?  In your setup, is the same
> > > macvtap socket structure used for RX and TX?  If yes, this will create
> > > cacheline bounces, as sk_wmem_alloc/sk_rmem_alloc share a cache line;
> > > there might also be contention on the lock in the sk_sleep waitqueue.
> > > Anything else?
> >
> > The patch does not introduce any locking (in either vhost or virtio-net).
> > The single-stream drop is due to different vhost threads handling the
> > RX/TX traffic.
> >
> > I added a (fuzzy) heuristic to determine if more than one flow
> > is being used on the device, and if not, use vhost[0] for both
> > tx and rx (vhost_poll_queue figures this out before waking up
> > the suitable vhost thread).  Testing shows that single-stream
> > performance is as good as with the original code.
>
> ...
>
> > This approach works nicely for both single and multiple stream.
> > Does this look good?
> >
> > Thanks,
> >
> > - KK
>
> Yes, but I guess it depends on the heuristic :) What's the logic?

I define how recently a txq was used. If 0 or 1 txqs were used
recently, use vq[0] (which also handles rx). Otherwise, use
multiple txqs (vq[1-n]). The code is:

/*
 * Algorithm for selecting vq:
 *
 * Condition                                    Return
 * RX vq                                        vq[0]
 * If all txqs unused                           vq[0]
 * If one txq used, and new txq is same         vq[0]
 * If one txq used, and new txq is different    vq[vq->qnum]
 * If > 1 txqs used                             vq[vq->qnum]
 *      Where "used" means the txq was used in the last 'n' jiffies.
 *
 * Note: locking is not required as an update race will only result in
 * a different worker being woken up.
 */
static inline struct vhost_virtqueue *vhost_find_vq(struct vhost_poll *poll)
{
	if (poll->vq->qnum) {
		struct vhost_dev *dev = poll->vq->dev;
		struct vhost_virtqueue *vq = &dev->vqs[0];
		unsigned long max_time = jiffies - 5; /* Some macro needed */
		unsigned long *table = dev->jiffies;
		int i, used = 0;

		for (i = 0; i < dev->nvqs - 1; i++) {
			if (time_after_eq(table[i], max_time) && ++used > 1) {
				vq = poll->vq;
				break;
			}
		}
		table[poll->vq->qnum - 1] = jiffies;
		return vq;
	}

	/* RX is handled by the same worker thread */
	return poll->vq;
}

void vhost_poll_queue(struct vhost_poll *poll)
{
        struct vhost_virtqueue *vq = vhost_find_vq(poll);

        vhost_work_queue(vq, &poll->work);
}
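
For reference, a standalone userspace model of the same selection
logic (not the actual patch: the fake clock, queue count and the
txq-to-vq numbering are assumptions of the model), which shows a
single flow sticking to vq[0] and a second flow pushing both flows
onto their own vqs after one tick:

#include <stdio.h>

#define NTXQ	8	/* modelled number of tx queues        */
#define WINDOW	5	/* "recently used" = last WINDOW ticks */

static unsigned long now = 1000;	/* stands in for jiffies     */
static unsigned long table[NTXQ];	/* last use time of each txq */

/* return 0 for the shared vq[0], or the txq's own vq number */
static int pick_vq(int txq)
{
	unsigned long max_time = now - WINDOW;
	int i, used = 0, vq = 0;

	for (i = 0; i < NTXQ; i++) {
		if (table[i] >= max_time && ++used > 1) {
			vq = txq + 1;	/* model: txq i maps to vq[i+1] */
			break;
		}
	}
	table[txq] = now;
	return vq;
}

int main(void)
{
	int t;

	/* a single flow on txq 0 keeps landing on vq[0] */
	for (t = 0; t < 5; t++, now++)
		printf("tick %2d: txq 0 -> vq[%d]\n", t, pick_vq(0));

	/* a second flow on txq 3: after one tick both get their own vq */
	for (; t < 10; t++, now++) {
		printf("tick %2d: txq 0 -> vq[%d]\n", t, pick_vq(0));
		printf("tick %2d: txq 3 -> vq[%d]\n", t, pick_vq(3));
	}
	return 0;
}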

Since poll batches packets, find_vq does not seem to add much
to the CPU utilization (or BW). I am sure that code can be
optimized much better.

The results I sent in my last mail were without your use_mm
patch, and the only tuning was to make vhost threads run on
only cpus 0-3 (though the performance is good even without
that). I will test it later today with the use_mm patch too.

Thanks,

- KK


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
       [not found]         ` <OFEC86A094.39835EBF-ON652577BC.002F9AAF-652577BC.003186B5@LocalDomain>
@ 2010-10-14 12:17           ` Krishna Kumar2
       [not found]           ` <OF0BDA6B3A.F673A449-ON652577BC.00422911-652577BC.0043474B@LocalDomain>
  1 sibling, 0 replies; 11+ messages in thread
From: Krishna Kumar2 @ 2010-10-14 12:17 UTC (permalink / raw)
  To: Krishna Kumar2
  Cc: anthony, arnd, avi, davem, kvm, Michael S. Tsirkin, netdev, rusty

Krishna Kumar2/India/IBM wrote on 10/14/2010 02:34:01 PM:

> void vhost_poll_queue(struct vhost_poll *poll)
> {
>         struct vhost_virtqueue *vq = vhost_find_vq(poll);
>
>         vhost_work_queue(vq, &poll->work);
> }
>
> Since poll batches packets, find_vq does not seem to add much
> to the CPU utilization (or BW). I am sure that code can be
> optimized much better.
>
> The results I sent in my last mail were without your use_mm
> patch, and the only tuning was to make vhost threads run on
> only cpus 0-3 (though the performance is good even without
> that). I will test it later today with the use_mm patch too.

There's a significant reduction in CPU/SD utilization with your
patch. Following is the performance of ORG vs MQ+mm patch:

_________________________________________________
               Org vs MQ+mm patch txq=2
#     BW%     CPU/RCPU%         SD/RSD%
_________________________________________________
1     2.26    -1.16    .27      -20.00  0
2     35.07   29.90    21.81     0      -11.11
4     55.03   84.57    37.66     26.92  -4.62
8     73.16   118.69   49.21     45.63  -.46
16    77.43   98.81    47.89     24.07  -7.80
24    71.59   105.18   48.44     62.84  18.18
32    70.91   102.38   47.15     49.22  8.54
40    63.26   90.58    41.00     85.27  37.33
48    45.25   45.99    11.23     14.31  -12.91
64    42.78   41.82    5.50      .43    -25.12
80    31.40   7.31     -18.69    15.78  -11.93
96    27.60   7.79     -18.54    17.39  -10.98
128   23.46   -11.89   -34.41    -.41   -25.53
_________________________________________________
BW: 40.2  CPU/RCPU: 29.9,-2.2   SD/RSD: 12.0,-15.6


Following is the performance of MQ vs MQ+mm patch:
_____________________________________________________
            MQ vs MQ+mm patch
#     BW%      CPU%       RCPU%    SD%      RSD%
_____________________________________________________
1      4.98    -.58       .84      -20.00    0
2      5.17     2.96      2.29      0       -4.00
4     -.18      .25      -.16       3.12     .98
8     -5.47    -1.36     -1.98      17.18    16.57
16    -1.90    -6.64     -3.54     -14.83   -12.12
24    -.01      23.63     14.65     57.61    46.64
32     .27     -3.19      -3.11    -22.98   -22.91
40    -1.06    -2.96      -2.96    -4.18    -4.10
48    -.28     -2.34      -3.71    -2.41    -3.81
64     9.71     33.77      30.65    81.44    77.09
80    -10.69    -31.07    -31.70   -29.22   -29.88
96    -1.14     5.98       .56     -11.57   -16.14
128   -.93     -15.60     -18.31   -19.89   -22.65
_____________________________________________________
  BW: 0   CPU/RCPU: -4.2,-6.1  SD/RSD: -13.1,-15.6
_____________________________________________________

Each test case runs for 60 secs, summed over two runs (except
when the number of netperf sessions is 1, which has 7 runs
of 10 secs each); numcpus=4, numtxqs=8, etc. No tuning
other than tasksetting each vhost thread to cpus 0-3.

Thanks,

- KK


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
       [not found]           ` <OF0BDA6B3A.F673A449-ON652577BC.00422911-652577BC.0043474B@LocalDomain>
@ 2010-10-14 12:47             ` Krishna Kumar2
  0 siblings, 0 replies; 11+ messages in thread
From: Krishna Kumar2 @ 2010-10-14 12:47 UTC (permalink / raw)
  To: Krishna Kumar2
  Cc: anthony, arnd, avi, davem, kvm, Michael S. Tsirkin, netdev, rusty

Krishna Kumar2/India/IBM wrote on 10/14/2010 05:47:54 PM:

Sorry, it should read "txq=8" below.

- KK

> There's a significant reduction in CPU/SD utilization with your
> patch. Following is the performance of ORG vs MQ+mm patch:
>
> _________________________________________________
>                Org vs MQ+mm patch txq=2
> #     BW%     CPU/RCPU%         SD/RSD%
> _________________________________________________
> 1     2.26    -1.16    .27      -20.00  0
> 2     35.07   29.90    21.81     0      -11.11
> 4     55.03   84.57    37.66     26.92  -4.62
> 8     73.16   118.69   49.21     45.63  -.46
> 16    77.43   98.81    47.89     24.07  -7.80
> 24    71.59   105.18   48.44     62.84  18.18
> 32    70.91   102.38   47.15     49.22  8.54
> 40    63.26   90.58    41.00     85.27  37.33
> 48    45.25   45.99    11.23     14.31  -12.91
> 64    42.78   41.82    5.50      .43    -25.12
> 80    31.40   7.31     -18.69    15.78  -11.93
> 96    27.60   7.79     -18.54    17.39  -10.98
> 128   23.46   -11.89   -34.41    -.41   -25.53
> _________________________________________________
> BW: 40.2  CPU/RCPU: 29.9,-2.2   SD/RSD: 12.0,-15.6
>
> Following is the performance of MQ vs MQ+mm patch:
> _____________________________________________________
>             MQ vs MQ+mm patch
> #     BW%      CPU%       RCPU%    SD%      RSD%
> _____________________________________________________
> 1      4.98    -.58       .84      -20.00    0
> 2      5.17     2.96      2.29      0       -4.00
> 4     -.18      .25      -.16       3.12     .98
> 8     -5.47    -1.36     -1.98      17.18    16.57
> 16    -1.90    -6.64     -3.54     -14.83   -12.12
> 24    -.01      23.63     14.65     57.61    46.64
> 32     .27     -3.19      -3.11    -22.98   -22.91
> 40    -1.06    -2.96      -2.96    -4.18    -4.10
> 48    -.28     -2.34      -3.71    -2.41    -3.81
> 64     9.71     33.77      30.65    81.44    77.09
> 80    -10.69    -31.07    -31.70   -29.22   -29.88
> 96    -1.14     5.98       .56     -11.57   -16.14
> 128   -.93     -15.60     -18.31   -19.89   -22.65
> _____________________________________________________
>   BW: 0   CPU/RCPU: -4.2,-6.1  SD/RSD: -13.1,-15.6
> _____________________________________________________
>
> Each test case runs for 60 secs, summed over two runs (except
> when the number of netperf sessions is 1, which has 7 runs
> of 10 secs each); numcpus=4, numtxqs=8, etc. No tuning
> other than tasksetting each vhost thread to cpus 0-3.
>
> Thanks,
>
> - KK


^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2010-10-14 12:47 UTC | newest]

Thread overview: 11+ messages:
2010-10-06 13:34 [v2 RFC PATCH 0/4] Implement multiqueue virtio-net Michael S. Tsirkin
2010-10-06 13:34 ` [PATCH 1/2] vhost: put mm after thread stop Michael S. Tsirkin
2010-10-06 13:34 ` [PATCH 2/2] vhost-net: batch use/unuse mm Michael S. Tsirkin
2010-10-06 17:02 ` [v2 RFC PATCH 0/4] Implement multiqueue virtio-net Krishna Kumar2
2010-10-11  7:21 ` Krishna Kumar2
2010-10-12 17:09   ` Michael S. Tsirkin
2010-10-14  7:58     ` Krishna Kumar2
2010-10-14  8:17       ` Michael S. Tsirkin
2010-10-14  9:04         ` Krishna Kumar2
     [not found]         ` <OFEC86A094.39835EBF-ON652577BC.002F9AAF-652577BC.003186B5@LocalDomain>
2010-10-14 12:17           ` Krishna Kumar2
     [not found]           ` <OF0BDA6B3A.F673A449-ON652577BC.00422911-652577BC.0043474B@LocalDomain>
2010-10-14 12:47             ` Krishna Kumar2
