* [PATCH 1/2] vhost: put mm after thread stop
From: Michael S. Tsirkin @ 2010-10-06 13:34 UTC
To: Krishna Kumar; +Cc: rusty, davem, kvm, arnd, netdev, avi, anthony
Put the mm only after the worker thread has been stopped, since the next
patch makes the worker use the mm for its entire lifetime. This makes it
possible to batch use/unuse mm.
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
drivers/vhost/vhost.c | 9 ++++-----
1 files changed, 4 insertions(+), 5 deletions(-)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 677d112..8b9d474 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -207,7 +207,7 @@ static int vhost_worker(void *data)
if (work) {
__set_current_state(TASK_RUNNING);
work->fn(work);
- if (n++) {
+ if (dev->nvqs <= ++n) {
__set_current_state(TASK_RUNNING);
schedule();
n = 0;
@@ -409,15 +409,14 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
/* No one will access memory at this point */
kfree(dev->memory);
dev->memory = NULL;
- if (dev->mm)
- mmput(dev->mm);
- dev->mm = NULL;
-
WARN_ON(!list_empty(&dev->work_list));
if (dev->worker) {
kthread_stop(dev->worker);
dev->worker = NULL;
}
+ if (dev->mm)
+ mmput(dev->mm);
+ dev->mm = NULL;
}
static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
--
1.7.3-rc1
* [PATCH 2/2] vhost-net: batch use/unuse mm
From: Michael S. Tsirkin @ 2010-10-06 13:34 UTC
To: Krishna Kumar; +Cc: rusty, davem, kvm, arnd, netdev, avi, anthony
Move use/unuse mm from the packet handlers in net.c to the worker
thread in vhost.c, which makes it possible to batch these operations.
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
drivers/vhost/net.c | 7 -------
drivers/vhost/vhost.c | 7 ++++++-
2 files changed, 6 insertions(+), 8 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 271678e..ff02ea4 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -10,7 +10,6 @@
#include <linux/eventfd.h>
#include <linux/vhost.h>
#include <linux/virtio_net.h>
-#include <linux/mmu_context.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
@@ -136,7 +135,6 @@ static void handle_tx(struct vhost_net *net)
return;
}
- use_mm(net->dev.mm);
mutex_lock(&vq->mutex);
vhost_disable_notify(vq);
@@ -197,7 +195,6 @@ static void handle_tx(struct vhost_net *net)
}
mutex_unlock(&vq->mutex);
- unuse_mm(net->dev.mm);
}
static int peek_head_len(struct sock *sk)
@@ -302,7 +299,6 @@ static void handle_rx_big(struct vhost_net *net)
if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
return;
- use_mm(net->dev.mm);
mutex_lock(&vq->mutex);
vhost_disable_notify(vq);
hdr_size = vq->vhost_hlen;
@@ -381,7 +377,6 @@ static void handle_rx_big(struct vhost_net *net)
}
mutex_unlock(&vq->mutex);
- unuse_mm(net->dev.mm);
}
/* Expects to be always run from workqueue - which acts as
@@ -413,7 +408,6 @@ static void handle_rx_mergeable(struct vhost_net *net)
if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
return;
- use_mm(net->dev.mm);
mutex_lock(&vq->mutex);
vhost_disable_notify(vq);
vhost_hlen = vq->vhost_hlen;
@@ -490,7 +484,6 @@ static void handle_rx_mergeable(struct vhost_net *net)
}
mutex_unlock(&vq->mutex);
- unuse_mm(net->dev.mm);
}
static void handle_rx(struct vhost_net *net)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 8b9d474..c83d1c2 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -15,6 +15,7 @@
#include <linux/vhost.h>
#include <linux/virtio_net.h>
#include <linux/mm.h>
+#include <linux/mmu_context.h>
#include <linux/miscdevice.h>
#include <linux/mutex.h>
#include <linux/rcupdate.h>
@@ -179,6 +180,8 @@ static int vhost_worker(void *data)
unsigned uninitialized_var(seq);
int n = 0;
+ use_mm(dev->mm);
+
for (;;) {
/* mb paired w/ kthread_stop */
set_current_state(TASK_INTERRUPTIBLE);
@@ -193,7 +196,7 @@ static int vhost_worker(void *data)
if (kthread_should_stop()) {
spin_unlock_irq(&dev->work_lock);
__set_current_state(TASK_RUNNING);
- return 0;
+ break;
}
if (!list_empty(&dev->work_list)) {
work = list_first_entry(&dev->work_list,
@@ -218,6 +221,8 @@ static int vhost_worker(void *data)
}
}
+ unuse_mm(dev->mm);
+ return 0;
}
/* Helper to allocate iovec buffers for all vqs. */
--
1.7.3-rc1
* Re: [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
From: Krishna Kumar2 @ 2010-10-06 17:02 UTC
To: Michael S. Tsirkin; +Cc: anthony, arnd, avi, davem, kvm, netdev, rusty
"Michael S. Tsirkin" <mst@redhat.com> wrote on 10/06/2010 07:04:31 PM:
> "Michael S. Tsirkin" <mst@redhat.com>
> 10/06/2010 07:04 PM
>
> To
>
> Krishna Kumar2/India/IBM@IBMIN
>
> cc
>
> rusty@rustcorp.com.au, davem@davemloft.net, kvm@vger.kernel.org,
> arnd@arndb.de, netdev@vger.kernel.org, avi@redhat.com,
anthony@codemonkey.ws
>
> Subject
>
> Re: [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
>
> On Fri, Sep 17, 2010 at 03:33:07PM +0530, Krishna Kumar wrote:
> > For 1 TCP netperf, I ran 7 iterations and summed it. Explanation
> > for degradation for 1 stream case:
>
> I thought about possible RX/TX contention reasons, and I realized that
> we get/put the mm counter all the time. So I wrote the following: I
> haven't seen any performance gain from this in a single queue case, but
> maybe this will help multiqueue?
Great! I am on vacation tomorrow, but will test with this patch
tomorrow night.
Thanks,
- KK
* Re: [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
From: Krishna Kumar2 @ 2010-10-11 7:21 UTC
To: Michael S. Tsirkin; +Cc: anthony, arnd, avi, davem, kvm, netdev, rusty
"Michael S. Tsirkin" <mst@redhat.com> wrote on 10/06/2010 07:04:31 PM:
> On Fri, Sep 17, 2010 at 03:33:07PM +0530, Krishna Kumar wrote:
> > For 1 TCP netperf, I ran 7 iterations and summed it. Explanation
> > for degradation for 1 stream case:
>
> I thought about possible RX/TX contention reasons, and I realized that
> we get/put the mm counter all the time. So I wrote the following: I
> haven't seen any performance gain from this in a single queue case, but
> maybe this will help multiqueue?
Sorry for the delay, I was sick last couple of days. The results
with your patch are (%'s over original code):
Code BW% CPU% RemoteCPU
MQ (#txq=16) 31.4% 38.42% 6.41%
MQ+MST (#txq=16) 28.3% 18.9% -10.77%
The patch helps CPU utilization but didn't help single stream
drop.
Thanks,
- KK
* Re: [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
From: Michael S. Tsirkin @ 2010-10-12 17:09 UTC
To: Krishna Kumar2; +Cc: anthony, arnd, avi, davem, kvm, netdev, rusty
On Mon, Oct 11, 2010 at 12:51:27PM +0530, Krishna Kumar2 wrote:
> "Michael S. Tsirkin" <mst@redhat.com> wrote on 10/06/2010 07:04:31 PM:
>
> > On Fri, Sep 17, 2010 at 03:33:07PM +0530, Krishna Kumar wrote:
> > > For 1 TCP netperf, I ran 7 iterations and summed it. Explanation
> > > for degradation for 1 stream case:
> >
> > I thought about possible RX/TX contention reasons, and I realized that
> > we get/put the mm counter all the time. So I wrote the following: I
> > haven't seen any performance gain from this in a single queue case, but
> > maybe this will help multiqueue?
>
> Sorry for the delay, I was sick last couple of days. The results
> with your patch are (%'s over original code):
>
> Code BW% CPU% RemoteCPU
> MQ (#txq=16) 31.4% 38.42% 6.41%
> MQ+MST (#txq=16) 28.3% 18.9% -10.77%
>
> The patch helps CPU utilization but didn't help single stream
> drop.
>
> Thanks,
What other shared TX/RX locks are there? In your setup, is the same
macvtap socket structure used for RX and TX? If yes this will create
cacheline bounces as sk_wmem_alloc/sk_rmem_alloc share a cache line,
there might also be contention on the lock in sk_sleep waitqueue.
Anything else?
--
MST
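For reference, a rough sketch of the false-sharing pattern Michael is
describing; the struct and field names below are purely illustrative and
do not reflect the actual struct sock layout:

        #include <linux/cache.h>
        #include <linux/types.h>

        /* If the TX and RX paths run on different CPUs but write counters
         * that share a cache line, that line bounces between the CPUs: */
        struct counters_shared {
                atomic_t wmem_alloc;    /* written by the TX path */
                atomic_t rmem_alloc;    /* written by the RX path, same line */
        };

        /* A common mitigation is to push such fields onto separate cache
         * lines, e.g. with ____cacheline_aligned_in_smp: */
        struct counters_padded {
                atomic_t wmem_alloc ____cacheline_aligned_in_smp;
                atomic_t rmem_alloc ____cacheline_aligned_in_smp;
        };

Whether separating the counters would actually help the macvtap case is
exactly the open question above.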
* Re: [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
From: Krishna Kumar2 @ 2010-10-14 7:58 UTC
To: Michael S. Tsirkin; +Cc: anthony, arnd, avi, davem, kvm, netdev, rusty
"Michael S. Tsirkin" <mst@redhat.com> wrote on 10/12/2010 10:39:07 PM:
> > Sorry for the delay, I was sick last couple of days. The results
> > with your patch are (%'s over original code):
> >
> > Code BW% CPU% RemoteCPU
> > MQ (#txq=16) 31.4% 38.42% 6.41%
> > MQ+MST (#txq=16) 28.3% 18.9% -10.77%
> >
> > The patch helps CPU utilization but didn't help single stream
> > drop.
> >
> > Thanks,
>
> What other shared TX/RX locks are there? In your setup, is the same
> macvtap socket structure used for RX and TX? If yes this will create
> cacheline bounces as sk_wmem_alloc/sk_rmem_alloc share a cache line,
> there might also be contention on the lock in sk_sleep waitqueue.
> Anything else?
The patch is not introducing any locking (both vhost and virtio-net).
The single stream drop is due to different vhost threads handling the
RX/TX traffic.
I added a heuristic (fuzzy) to determine if more than one flow
is being used on the device, and if not, use vhost[0] for both
tx and rx (vhost_poll_queue figures this out before waking up
the suitable vhost thread). Testing shows that single stream
performance is as good as the original code.
__________________________________________________________________________
#txqs = 2 (#vhosts = 3)
# BW1 BW2 (%) CPU1 CPU2 (%) RCPU1 RCPU2 (%)
__________________________________________________________________________
1 77344 74973 (-3.06) 172 143 (-16.86) 358 324 (-9.49)
2 20924 21107 (.87) 107 103 (-3.73) 220 217 (-1.36)
4 21629 32911 (52.16) 214 391 (82.71) 446 616 (38.11)
8 21678 34359 (58.49) 428 845 (97.42) 892 1286 (44.17)
16 22046 34401 (56.04) 841 1677 (99.40) 1785 2585 (44.81)
24 22396 35117 (56.80) 1272 2447 (92.37) 2667 3863 (44.84)
32 22750 35158 (54.54) 1719 3233 (88.07) 3569 5143 (44.10)
40 23041 35345 (53.40) 2219 3970 (78.90) 4478 6410 (43.14)
48 23209 35219 (51.74) 2707 4685 (73.06) 5386 7684 (42.66)
64 23215 35209 (51.66) 3639 6195 (70.23) 7206 10218 (41.79)
80 23443 35179 (50.06) 4633 7625 (64.58) 9051 12745 (40.81)
96 24006 36108 (50.41) 5635 9096 (61.41) 10864 15283 (40.67)
128 23601 35744 (51.45) 7475 12104 (61.92) 14495 20405 (40.77)
__________________________________________________________________________
SUM: BW: (37.6) CPU: (69.0) RCPU: (41.2)
__________________________________________________________________________
#txqs = 8 (#vhosts = 5)
# BW1 BW2 (%) CPU1 CPU2 (%) RCPU1 RCPU2 (%)
__________________________________________________________________________
1 77344 75341 (-2.58) 172 171 (-.58) 358 356 (-.55)
2 20924 26872 (28.42) 107 135 (26.16) 220 262 (19.09)
4 21629 33594 (55.31) 214 394 (84.11) 446 615 (37.89)
8 21678 39714 (83.19) 428 949 (121.72) 892 1358 (52.24)
16 22046 39879 (80.88) 841 1791 (112.96) 1785 2737 (53.33)
24 22396 38436 (71.61) 1272 2111 (65.95) 2667 3453 (29.47)
32 22750 38776 (70.44) 1719 3594 (109.07) 3569 5421 (51.89)
40 23041 38023 (65.02) 2219 4358 (96.39) 4478 6507 (45.31)
48 23209 33811 (45.68) 2707 4047 (49.50) 5386 6222 (15.52)
64 23215 30212 (30.13) 3639 3858 (6.01) 7206 5819 (-19.24)
80 23443 34497 (47.15) 4633 7214 (55.70) 9051 10776 (19.05)
96 24006 30990 (29.09) 5635 5731 (1.70) 10864 8799 (-19.00)
128 23601 29413 (24.62) 7475 7804 (4.40) 14495 11638 (-19.71)
__________________________________________________________________________
SUM: BW: (40.1) CPU: (35.7) RCPU: (4.1)
_______________________________________________________________________________
The SD numbers are also good (same table as before, but with SD
instead of CPU):
__________________________________________________________________________
#txqs = 2 (#vhosts = 3)
# BW% SD1 SD2 (%) RSD1 RSD2 (%)
__________________________________________________________________________
1 -3.06 5 4 (-20.00) 21 19 (-9.52)
2 .87 6 6 (0) 27 27 (0)
4 52.16 26 32 (23.07) 108 103 (-4.62)
8 58.49 103 146 (41.74) 431 445 (3.24)
16 56.04 407 514 (26.28) 1729 1586 (-8.27)
24 56.80 934 1161 (24.30) 3916 3665 (-6.40)
32 54.54 1668 2160 (29.49) 6925 6872 (-.76)
40 53.40 2655 3317 (24.93) 10712 10707 (-.04)
48 51.74 3920 4486 (14.43) 15598 14715 (-5.66)
64 51.66 7096 8250 (16.26) 28099 27211 (-3.16)
80 50.06 11240 12586 (11.97) 43913 42070 (-4.19)
96 50.41 16342 16976 (3.87) 63017 57048 (-9.47)
128 51.45 29254 32069 (9.62) 113451 108113 (-4.70)
__________________________________________________________________________
SUM: BW: (37.6) SD: (10.9) RSD: (-5.3)
__________________________________________________________________________
#txqs = 8 (#vhosts = 5)
# BW% SD1 SD2 (%) RSD1 RSD2 (%)
__________________________________________________________________________
1 -2.58 5 5 (0) 21 21 (0)
2 28.42 6 6 (0) 27 25 (-7.40)
4 55.31 26 32 (23.07) 108 102 (-5.55)
8 83.19 103 128 (24.27) 431 368 (-14.61)
16 80.88 407 593 (45.70) 1729 1814 (4.91)
24 71.61 934 965 (3.31) 3916 3156 (-19.40)
32 70.44 1668 3232 (93.76) 6925 9752 (40.82)
40 65.02 2655 5134 (93.37) 10712 15340 (43.20)
48 45.68 3920 4592 (17.14) 15598 14122 (-9.46)
64 30.13 7096 3928 (-44.64) 28099 11880 (-57.72)
80 47.15 11240 18389 (63.60) 43913 55154 (25.59)
96 29.09 16342 21695 (32.75) 63017 66892 (6.14)
128 24.62 29254 36371 (24.32) 113451 109219 (-3.73)
__________________________________________________________________________
SUM: BW: (40.1) SD: (29.0) RSD: (0)
This approach works nicely for both single and multiple stream.
Does this look good?
Thanks,
- KK
* Re: [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
From: Michael S. Tsirkin @ 2010-10-14 8:17 UTC
To: Krishna Kumar2; +Cc: anthony, arnd, avi, davem, kvm, netdev, rusty
On Thu, Oct 14, 2010 at 01:28:58PM +0530, Krishna Kumar2 wrote:
> "Michael S. Tsirkin" <mst@redhat.com> wrote on 10/12/2010 10:39:07 PM:
>
> > > Sorry for the delay, I was sick last couple of days. The results
> > > with your patch are (%'s over original code):
> > >
> > > Code BW% CPU% RemoteCPU
> > > MQ (#txq=16) 31.4% 38.42% 6.41%
> > > MQ+MST (#txq=16) 28.3% 18.9% -10.77%
> > >
> > > The patch helps CPU utilization but didn't help single stream
> > > drop.
> > >
> > > Thanks,
> >
> > What other shared TX/RX locks are there? In your setup, is the same
> > macvtap socket structure used for RX and TX? If yes this will create
> > cacheline bounces as sk_wmem_alloc/sk_rmem_alloc share a cache line,
> > there might also be contention on the lock in sk_sleep waitqueue.
> > Anything else?
>
> The patch is not introducing any locking (both vhost and virtio-net).
> The single stream drop is due to different vhost threads handling the
> RX/TX traffic.
>
> I added a heuristic (fuzzy) to determine if more than one flow
> is being used on the device, and if not, use vhost[0] for both
> tx and rx (vhost_poll_queue figures this out before waking up
> the suitable vhost thread). Testing shows that single stream
> performance is as good as the original code.
...
> This approach works nicely for both single and multiple stream.
> Does this look good?
>
> Thanks,
>
> - KK
Yes, but I guess it depends on the heuristic :) What's the logic?
--
MST
* Re: [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
From: Krishna Kumar2 @ 2010-10-14 9:04 UTC
To: Michael S. Tsirkin; +Cc: anthony, arnd, avi, davem, kvm, netdev, rusty
> "Michael S. Tsirkin" <mst@redhat.com>
> > > What other shared TX/RX locks are there? In your setup, is the same
> > > macvtap socket structure used for RX and TX? If yes this will create
> > > cacheline bounces as sk_wmem_alloc/sk_rmem_alloc share a cache line,
> > > there might also be contention on the lock in sk_sleep waitqueue.
> > > Anything else?
> >
> > The patch is not introducing any locking (both vhost and virtio-net).
> > The single stream drop is due to different vhost threads handling the
> > RX/TX traffic.
> >
> > I added a heuristic (fuzzy) to determine if more than one flow
> > is being used on the device, and if not, use vhost[0] for both
> > tx and rx (vhost_poll_queue figures this out before waking up
> > the suitable vhost thread). Testing shows that single stream
> > performance is as good as the original code.
>
> ...
>
> > This approach works nicely for both single and multiple stream.
> > Does this look good?
> >
> > Thanks,
> >
> > - KK
>
> Yes, but I guess it depends on the heuristic :) What's the logic?
I define how recently a txq was used. If 0 or 1 txqs were used
recently, use vq[0] (which also handles rx). Otherwise, use multiple
txqs (vq[1-n]). The code is:
/*
 * Algorithm for selecting vq:
 *
 * Condition                                    Return
 * RX vq                                        vq[0]
 * If all txqs unused                           vq[0]
 * If one txq used, and new txq is same         vq[0]
 * If one txq used, and new txq is different    vq[vq->qnum]
 * If > 1 txqs used                             vq[vq->qnum]
 * Where "used" means the txq was used in the last 'n' jiffies.
 *
 * Note: locking is not required as an update race will only result in
 * a different worker being woken up.
 */
static inline struct vhost_virtqueue *vhost_find_vq(struct vhost_poll *poll)
{
        if (poll->vq->qnum) {
                struct vhost_dev *dev = poll->vq->dev;
                struct vhost_virtqueue *vq = &dev->vqs[0];
                unsigned long max_time = jiffies - 5;   /* Some macro needed */
                unsigned long *table = dev->jiffies;
                int i, used = 0;

                for (i = 0; i < dev->nvqs - 1; i++) {
                        if (time_after_eq(table[i], max_time) && ++used > 1) {
                                vq = poll->vq;
                                break;
                        }
                }
                table[poll->vq->qnum - 1] = jiffies;
                return vq;
        }

        /* RX is handled by the same worker thread */
        return poll->vq;
}

void vhost_poll_queue(struct vhost_poll *poll)
{
        struct vhost_virtqueue *vq = vhost_find_vq(poll);

        vhost_work_queue(vq, &poll->work);
}
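Note that this code relies on a per-device array of last-use timestamps
(dev->jiffies) that is not shown in the thread; a minimal placeholder
declaration, with the name and bound chosen purely for illustration,
could look like:

        #define VHOST_MAX_TXQS 16       /* illustrative bound: one slot per TX vq */

        struct vhost_dev {
                /* ... existing vhost_dev fields ... */

                /* jiffies value of the last time each TX vq queued work;
                 * indexed by (qnum - 1) in vhost_find_vq() above */
                unsigned long jiffies[VHOST_MAX_TXQS];
        };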
Since poll batches packets, find_vq does not seem to add much
to the CPU utilization (or BW). I am sure that code can be
optimized much better.
The results I sent in my last mail were without your use_mm
patch, and the only tuning was to make vhost threads run on
only cpus 0-3 (though the performance is good even without
that). I will test it later today with the use_mm patch too.
Thanks,
- KK
* Re: [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
From: Krishna Kumar2 @ 2010-10-14 12:17 UTC
To: Krishna Kumar2
Cc: anthony, arnd, avi, davem, kvm, Michael S. Tsirkin, netdev, rusty
Krishna Kumar2/India/IBM wrote on 10/14/2010 02:34:01 PM:
> void vhost_poll_queue(struct vhost_poll *poll)
> {
> struct vhost_virtqueue *vq = vhost_find_vq(poll);
>
> vhost_work_queue(vq, &poll->work);
> }
>
> Since poll batches packets, find_vq does not seem to add much
> to the CPU utilization (or BW). I am sure that code can be
> optimized much better.
>
> The results I sent in my last mail were without your use_mm
> patch, and the only tuning was to make vhost threads run on
> only cpus 0-3 (though the performance is good even without
> that). I will test it later today with the use_mm patch too.
There's a significant reduction in CPU/SD utilization with your
patch. Following is the performance of ORG vs MQ+mm patch:
_________________________________________________
Org vs MQ+mm patch txq=2
# BW% CPU% RCPU% SD% RSD%
_________________________________________________
1 2.26 -1.16 .27 -20.00 0
2 35.07 29.90 21.81 0 -11.11
4 55.03 84.57 37.66 26.92 -4.62
8 73.16 118.69 49.21 45.63 -.46
16 77.43 98.81 47.89 24.07 -7.80
24 71.59 105.18 48.44 62.84 18.18
32 70.91 102.38 47.15 49.22 8.54
40 63.26 90.58 41.00 85.27 37.33
48 45.25 45.99 11.23 14.31 -12.91
64 42.78 41.82 5.50 .43 -25.12
80 31.40 7.31 -18.69 15.78 -11.93
96 27.60 7.79 -18.54 17.39 -10.98
128 23.46 -11.89 -34.41 -.41 -25.53
_________________________________________________
BW: 40.2 CPU/RCPU: 29.9,-2.2 SD/RSD: 12.0,-15.6
Following is the performance of MQ vs MQ+mm patch:
_____________________________________________________
MQ vs MQ+mm patch
# BW% CPU% RCPU% SD% RSD%
_____________________________________________________
1 4.98 -.58 .84 -20.00 0
2 5.17 2.96 2.29 0 -4.00
4 -.18 .25 -.16 3.12 .98
8 -5.47 -1.36 -1.98 17.18 16.57
16 -1.90 -6.64 -3.54 -14.83 -12.12
24 -.01 23.63 14.65 57.61 46.64
32 .27 -3.19 -3.11 -22.98 -22.91
40 -1.06 -2.96 -2.96 -4.18 -4.10
48 -.28 -2.34 -3.71 -2.41 -3.81
64 9.71 33.77 30.65 81.44 77.09
80 -10.69 -31.07 -31.70 -29.22 -29.88
96 -1.14 5.98 .56 -11.57 -16.14
128 -.93 -15.60 -18.31 -19.89 -22.65
_____________________________________________________
BW: 0 CPU/RCPU: -4.2,-6.1 SD/RSD: -13.1,-15.6
_____________________________________________________
Each test case is for 60 secs, sum over two runs (except
when number of netperf sessions is 1, which has 7 runs
of 10 secs each), numcpus=4, numtxqs=8, etc. No tuning
other than taskset each vhost to cpus 0-3.
Thanks,
- KK
* Re: [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
From: Krishna Kumar2 @ 2010-10-14 12:47 UTC
To: Krishna Kumar2
Cc: anthony, arnd, avi, davem, kvm, Michael S. Tsirkin, netdev, rusty
Krishna Kumar2/India/IBM wrote on 10/14/2010 05:47:54 PM:
Sorry, it should read "txq=8" below.
- KK
> There's a significant reduction in CPU/SD utilization with your
> patch. Following is the performance of ORG vs MQ+mm patch:
>
> _________________________________________________
> Org vs MQ+mm patch txq=2
> # BW% CPU% RCPU% SD% RSD%
> _________________________________________________
> 1 2.26 -1.16 .27 -20.00 0
> 2 35.07 29.90 21.81 0 -11.11
> 4 55.03 84.57 37.66 26.92 -4.62
> 8 73.16 118.69 49.21 45.63 -.46
> 16 77.43 98.81 47.89 24.07 -7.80
> 24 71.59 105.18 48.44 62.84 18.18
> 32 70.91 102.38 47.15 49.22 8.54
> 40 63.26 90.58 41.00 85.27 37.33
> 48 45.25 45.99 11.23 14.31 -12.91
> 64 42.78 41.82 5.50 .43 -25.12
> 80 31.40 7.31 -18.69 15.78 -11.93
> 96 27.60 7.79 -18.54 17.39 -10.98
> 128 23.46 -11.89 -34.41 -.41 -25.53
> _________________________________________________
> BW: 40.2 CPU/RCPU: 29.9,-2.2 SD/RSD: 12.0,-15.6
>
> Following is the performance of MQ vs MQ+mm patch:
> _____________________________________________________
> MQ vs MQ+mm patch
> # BW% CPU% RCPU% SD% RSD%
> _____________________________________________________
> 1 4.98 -.58 .84 -20.00 0
> 2 5.17 2.96 2.29 0 -4.00
> 4 -.18 .25 -.16 3.12 .98
> 8 -5.47 -1.36 -1.98 17.18 16.57
> 16 -1.90 -6.64 -3.54 -14.83 -12.12
> 24 -.01 23.63 14.65 57.61 46.64
> 32 .27 -3.19 -3.11 -22.98 -22.91
> 40 -1.06 -2.96 -2.96 -4.18 -4.10
> 48 -.28 -2.34 -3.71 -2.41 -3.81
> 64 9.71 33.77 30.65 81.44 77.09
> 80 -10.69 -31.07 -31.70 -29.22 -29.88
> 96 -1.14 5.98 .56 -11.57 -16.14
> 128 -.93 -15.60 -18.31 -19.89 -22.65
> _____________________________________________________
> BW: 0 CPU/RCPU: -4.2,-6.1 SD/RSD: -13.1,-15.6
> _____________________________________________________
>
> Each test case is for 60 secs, sum over two runs (except
> when number of netperf sessions is 1, which has 7 runs
> of 10 secs each), numcpus=4, numtxqs=8, etc. No tuning
> other than taskset each vhost to cpus 0-3.
>
> Thanks,
>
> - KK