* [PATCH net-next v2] ipv4: defer fib_compute_spec_dst() call
From: Eric Dumazet @ 2012-07-05 8:30 UTC (permalink / raw)
To: David Miller; +Cc: netdev
From: Eric Dumazet <edumazet@google.com>
ip_options_compile() can avoid calling fib_compute_spec_dst()
by default, and perform the call only if needed.
David suggested to add a helper to make the call only once.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
net/ipv4/ip_options.c | 15 ++++++++++++---
1 file changed, 12 insertions(+), 3 deletions(-)
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 1f02251..a19d647 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -242,6 +242,15 @@ void ip_options_fragment(struct sk_buff *skb)
opt->ts_needtime = 0;
}
+/* helper used by ip_options_compile() to call fib_compute_spec_dst()
+ * at most one time.
+ */
+static void spec_dst_fill(__be32 *spec_dst, struct sk_buff *skb)
+{
+ if (*spec_dst == htonl(INADDR_ANY))
+ *spec_dst = fib_compute_spec_dst(skb);
+}
+
/*
* Verify options and fill pointers in struct options.
* Caller should clear *opt, and set opt->data.
@@ -251,7 +260,7 @@ void ip_options_fragment(struct sk_buff *skb)
int ip_options_compile(struct net *net,
struct ip_options *opt, struct sk_buff *skb)
{
- __be32 spec_dst = (__force __be32) 0;
+ __be32 spec_dst = htonl(INADDR_ANY);
unsigned char *pp_ptr = NULL;
struct rtable *rt = NULL;
unsigned char *optptr;
@@ -260,8 +269,6 @@ int ip_options_compile(struct net *net,
if (skb != NULL) {
rt = skb_rtable(skb);
- if (rt)
- spec_dst = fib_compute_spec_dst(skb);
optptr = (unsigned char *)&(ip_hdr(skb)[1]);
} else
optptr = opt->__data;
@@ -334,6 +341,7 @@ int ip_options_compile(struct net *net,
goto error;
}
if (rt) {
+ spec_dst_fill(&spec_dst, skb);
memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
opt->is_changed = 1;
}
@@ -376,6 +384,7 @@ int ip_options_compile(struct net *net,
}
opt->ts = optptr - iph;
if (rt) {
+ spec_dst_fill(&spec_dst, skb);
memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
timeptr = &optptr[optptr[2]+3];
}
^ permalink raw reply related
* [PATCH] cgroup: fix panic in netprio_cgroup
From: Gao feng @ 2012-07-05 8:31 UTC (permalink / raw)
To: davem; +Cc: netdev, linux-kernel, nhorman, tj, lizefan, Gao feng
we set max_prioidx to the first zero bit index of prioidx_map in
function get_prioidx.
So when we delete a low-index netprio cgroup and then add a new
netprio cgroup, max_prioidx will be set to that low index.
When we then set a high-index cgroup's net_prio.ifpriomap, the function
write_priomap will call update_netdev_tables to allocate memory whose
size is sizeof(struct netprio_map) + sizeof(u32) * (max_prioidx + 1),
so the array that map->priomap points to has max_prioidx + 1 entries,
which is fewer than we actually need.
Fix this by adding a check in get_prioidx: only set max_prioidx when
max_prioidx is lower than the new prioidx.
Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
---
net/core/netprio_cgroup.c | 3 ++-
1 files changed, 2 insertions(+), 1 deletions(-)
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 5b8aa2f..586f7d9 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -50,7 +50,8 @@ static int get_prioidx(u32 *prio)
}
set_bit(prioidx, prioidx_map);
spin_unlock_irqrestore(&prioidx_map_lock, flags);
- atomic_set(&max_prioidx, prioidx);
+ if (atomic_read(&max_prioidx) < prioidx)
+ atomic_set(&max_prioidx, prioidx);
*prio = prioidx;
return 0;
}
--
1.7.7.6
^ permalink raw reply related
* Re: TCP transmit performance regression
From: Eric Dumazet @ 2012-07-05 8:33 UTC (permalink / raw)
To: Ming Lei; +Cc: Network Development, David Miller
In-Reply-To: <CACVXFVNxcdEYd-KmkUe9=8+x_9s-ZVuoM=FfZ=QXa7w_qRiTnw@mail.gmail.com>
On Thu, 2012-07-05 at 16:27 +0800, Ming Lei wrote:
> After some investigation, the problem is caused by enabling
> DEBUG_SLAB, so it is not a regression.
>
Strange, unless your machine is a _very_ slow one maybe ?
>
> Looks no improvement. I still don't know why the window size becomes so
> small even in good situation(disabling DEBUG_SLAB), and the small
> window size will cause almost every tcp data packet acked.
You are probably missing the fact that window scaling is enabled.
If you don't post a pcap, I am afraid we can't really help.
^ permalink raw reply
* Re: TCP transmit performance regression
From: Ming Lei @ 2012-07-05 8:42 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Network Development, David Miller
In-Reply-To: <1341477192.2583.3415.camel@edumazet-glaptop>
[-- Attachment #1: Type: text/plain, Size: 759 bytes --]
On Thu, Jul 5, 2012 at 4:33 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Thu, 2012-07-05 at 16:27 +0800, Ming Lei wrote:
>
>> After some investigation, the problem is caused by enabling
>> DEBUG_SLAB, so it is not a regression.
>>
>
> Strange, unless your machine is a _very_ slow one maybe ?
It is a beagle-xm board, and its cpu is ARMv7, 1GHz.
>
>>
>> Looks no improvement. I still don't know why the window size becomes so
>> small even in good situation(disabling DEBUG_SLAB), and the small
>> window size will cause almost every tcp data packet acked.
>
> You are probably missing the fact that window scaling is enabled.
>
> If you dont post a pcap, I am afraid we cant really help.
See attachment for the pcap trace.
Thanks,
--
Ming Lei
[-- Attachment #2: tcp.pcap --]
[-- Type: application/octet-stream, Size: 97922 bytes --]
^ permalink raw reply
* Re: [PATCH] cgroup: fix panic in netprio_cgroup
From: Eric Dumazet @ 2012-07-05 8:43 UTC (permalink / raw)
To: Gao feng; +Cc: davem, netdev, linux-kernel, nhorman, tj, lizefan
In-Reply-To: <1341477102-16988-1-git-send-email-gaofeng@cn.fujitsu.com>
On Thu, 2012-07-05 at 16:31 +0800, Gao feng wrote:
> we set max_prioidx to the first zero bit index of prioidx_map in
> function get_prioidx.
>
> So when we delete the low index netprio cgroup and adding a new
> netprio cgroup again,the max_prioidx will be set to the low index.
>
> when we set the high index cgroup's net_prio.ifpriomap,the function
> write_priomap will call update_netdev_tables to alloc memory which
> size is sizeof(struct netprio_map) + sizeof(u32) * (max_prioidx + 1),
> so the size of array that map->priomap point to is max_prioidx +1,
> which is low than what we actually need.
>
> fix this by adding check in get_prioidx,only set max_prioidx when
> max_prioidx low than the new prioidx.
>
> Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
> ---
> net/core/netprio_cgroup.c | 3 ++-
> 1 files changed, 2 insertions(+), 1 deletions(-)
>
> diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
> index 5b8aa2f..586f7d9 100644
> --- a/net/core/netprio_cgroup.c
> +++ b/net/core/netprio_cgroup.c
> @@ -50,7 +50,8 @@ static int get_prioidx(u32 *prio)
> }
> set_bit(prioidx, prioidx_map);
> spin_unlock_irqrestore(&prioidx_map_lock, flags);
> - atomic_set(&max_prioidx, prioidx);
> + if (atomic_read(&max_prioidx) < prioidx)
> + atomic_set(&max_prioidx, prioidx);
> *prio = prioidx;
> return 0;
> }
This is still racy.
Please do this before the
spin_unlock_irqrestore(&prioidx_map_lock, flags);
^ permalink raw reply
* Re: [PATCH] cgroup: fix panic in netprio_cgroup
From: David Miller @ 2012-07-05 8:58 UTC (permalink / raw)
To: gaofeng; +Cc: netdev, linux-kernel, nhorman, tj, lizefan
In-Reply-To: <1341477102-16988-1-git-send-email-gaofeng@cn.fujitsu.com>
Why did you post this twice?
Is there a difference between the first patch and the second
one you posted? If so, what is that difference?
^ permalink raw reply
* Re: [PATCH] cgroup: fix panic in netprio_cgroup
From: Gao feng @ 2012-07-05 9:10 UTC (permalink / raw)
To: Eric Dumazet; +Cc: davem, netdev, linux-kernel, nhorman, tj, lizefan
In-Reply-To: <1341477809.2583.3437.camel@edumazet-glaptop>
于 2012年07月05日 16:43, Eric Dumazet 写道:
> On Thu, 2012-07-05 at 16:31 +0800, Gao feng wrote:
>> we set max_prioidx to the first zero bit index of prioidx_map in
>> function get_prioidx.
>>
>> So when we delete the low index netprio cgroup and adding a new
>> netprio cgroup again,the max_prioidx will be set to the low index.
>>
>> when we set the high index cgroup's net_prio.ifpriomap,the function
>> write_priomap will call update_netdev_tables to alloc memory which
>> size is sizeof(struct netprio_map) + sizeof(u32) * (max_prioidx + 1),
>> so the size of array that map->priomap point to is max_prioidx +1,
>> which is low than what we actually need.
>>
>> fix this by adding check in get_prioidx,only set max_prioidx when
>> max_prioidx low than the new prioidx.
>>
>> Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
>> ---
>> net/core/netprio_cgroup.c | 3 ++-
>> 1 files changed, 2 insertions(+), 1 deletions(-)
>>
>> diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
>> index 5b8aa2f..586f7d9 100644
>> --- a/net/core/netprio_cgroup.c
>> +++ b/net/core/netprio_cgroup.c
>> @@ -50,7 +50,8 @@ static int get_prioidx(u32 *prio)
>> }
>> set_bit(prioidx, prioidx_map);
>> spin_unlock_irqrestore(&prioidx_map_lock, flags);
>> - atomic_set(&max_prioidx, prioidx);
>> + if (atomic_read(&max_prioidx) < prioidx)
>> + atomic_set(&max_prioidx, prioidx);
>> *prio = prioidx;
>> return 0;
>> }
>
> This is still racy.
>
> Please do this before the
> spin_unlock_irqrestore(&prioidx_map_lock, flags);
>
Thanks Eric, you are right.
I will fix it and resend.
^ permalink raw reply
* Re: [PATCH] cgroup: fix panic in netprio_cgroup
From: Gao feng @ 2012-07-05 9:15 UTC (permalink / raw)
To: David Miller; +Cc: netdev, linux-kernel, nhorman, tj, lizefan
In-Reply-To: <20120705.015841.2231353345763821829.davem@davemloft.net>
于 2012年07月05日 16:58, David Miller 写道:
>
> Why did you post this twice?
Sorry for the confusion; there is something wrong with my git send-email config.
I sent the first patch but couldn't find it on the mailing list, so I
sent it again.
>
> Is there a difference between the first patch and the second
> one you posted? If so, what is that difference?
there isn't a difference between them.
Sorry again.
Thanks.
^ permalink raw reply
* [PATCH v2] cgroup: fix panic in netprio_cgroup
From: Gao feng @ 2012-07-05 9:28 UTC (permalink / raw)
To: davem; +Cc: netdev, linux-kernel, nhorman, tj, lizefan, eric.dumazet,
Gao feng
we set max_prioidx to the first zero bit index of prioidx_map in
function get_prioidx.
So when we delete a low-index netprio cgroup and then add a new
netprio cgroup, max_prioidx will be set to that low index.
When we then set a high-index cgroup's net_prio.ifpriomap, the function
write_priomap will call update_netdev_tables to allocate memory whose
size is sizeof(struct netprio_map) + sizeof(u32) * (max_prioidx + 1),
so the array that map->priomap points to has max_prioidx + 1 entries,
which is fewer than we actually need.
Fix this by adding a check in get_prioidx: only set max_prioidx when
max_prioidx is lower than the new prioidx.
Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
---
net/core/netprio_cgroup.c | 3 ++-
1 files changed, 2 insertions(+), 1 deletions(-)
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 5b8aa2f..aa907ed 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -49,8 +49,9 @@ static int get_prioidx(u32 *prio)
return -ENOSPC;
}
set_bit(prioidx, prioidx_map);
+ if (atomic_read(&max_prioidx) < prioidx)
+ atomic_set(&max_prioidx, prioidx);
spin_unlock_irqrestore(&prioidx_map_lock, flags);
- atomic_set(&max_prioidx, prioidx);
*prio = prioidx;
return 0;
}
--
1.7.7.6
^ permalink raw reply related
* Re: TCP transmit performance regression
From: Eric Dumazet @ 2012-07-05 9:49 UTC (permalink / raw)
To: Ming Lei; +Cc: Network Development, David Miller
In-Reply-To: <CACVXFVPTXB7t=zwkm+HTgDaF3bA02bzff_52S+UAr51PfpvpCg@mail.gmail.com>
On Thu, 2012-07-05 at 16:42 +0800, Ming Lei wrote:
> On Thu, Jul 5, 2012 at 4:33 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > On Thu, 2012-07-05 at 16:27 +0800, Ming Lei wrote:
> >
> >> After some investigation, the problem is caused by enabling
> >> DEBUG_SLAB, so it is not a regression.
> >>
> >
> > Strange, unless your machine is a _very_ slow one maybe ?
>
> It is a beagle-xm board, and its cpu is ARMv7, 1GHz.
OK, the driver seems buggy; please try the following patch (on both sides if
possible)
drivers/net/usb/smsc95xx.c | 11 ++++-------
1 file changed, 4 insertions(+), 7 deletions(-)
diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c
index b1112e7..0a4ae35 100644
--- a/drivers/net/usb/smsc95xx.c
+++ b/drivers/net/usb/smsc95xx.c
@@ -1084,26 +1084,23 @@ static int smsc95xx_rx_fixup(struct usbnet *dev, struct sk_buff *skb)
if (skb->len == size) {
if (dev->net->features & NETIF_F_RXCSUM)
smsc95xx_rx_csum_offload(skb);
- skb_trim(skb, skb->len - 4); /* remove fcs */
+ __skb_trim(skb, skb->len - 4); /* remove fcs */
skb->truesize = size + sizeof(struct sk_buff);
return 1;
}
- ax_skb = skb_clone(skb, GFP_ATOMIC);
+ ax_skb = netdev_alloc_skb_ip_align(dev->net, size);
if (unlikely(!ax_skb)) {
netdev_warn(dev->net, "Error allocating skb\n");
return 0;
}
- ax_skb->len = size;
- ax_skb->data = packet;
- skb_set_tail_pointer(ax_skb, size);
+ memcpy(skb_put(ax_skb, size), packet, size);
if (dev->net->features & NETIF_F_RXCSUM)
smsc95xx_rx_csum_offload(ax_skb);
- skb_trim(ax_skb, ax_skb->len - 4); /* remove fcs */
- ax_skb->truesize = size + sizeof(struct sk_buff);
+ __skb_trim(ax_skb, ax_skb->len - 4); /* remove fcs */
usbnet_skb_return(dev, ax_skb);
}
^ permalink raw reply related
* Re: [PATCH 0/19] Disconnect neigh from dst_entry
From: David Miller @ 2012-07-05 9:55 UTC (permalink / raw)
To: netdev
In-Reply-To: <20120703.024543.1597240990462633709.davem@davemloft.net>
From: David Miller <davem@davemloft.net>
Date: Tue, 03 Jul 2012 02:45:43 -0700 (PDT)
> This finally severs neighbour table entries from dst_entry enough that
> we no longer depend upon them outside of the individual protocols.
I'm pushing this now to net-next, with three minor changes.
1) I fubar'd the neigh lookup in the sch_teql changes, I needed to
add the following code block to __teql_resolve():
if (dst->dev != dev) {
struct neighbour *mn;
mn = __neigh_lookup_errno(n->tbl, n->primary_key, dev);
neigh_release(n);
if (IS_ERR(mn))
return PTR_ERR(mn);
n = mn;
}
2) I adjusted the comment in the neigh backlog handler of
neigh_update() to read as follows:
/* Why not just use 'neigh' as-is? The problem is that
* things such as shaper, eql, and sch_teql can end up
* using alternative, different, neigh objects to output
* the packet in the output path. So what we need to do
* here is re-lookup the top-level neigh in the path so
* we can reinject the packet there.
*/
3) The redirect network event needs to also pass in the path
destination address so that we can have it available for
all callers of t3_l2t_get().
^ permalink raw reply
* Re: [PATCH next-next] ppp: change default for incoming protocol filter to NPMODE_DROP
From: David Miller @ 2012-07-05 10:00 UTC (permalink / raw)
To: bcrl; +Cc: netdev, linux-ppp
In-Reply-To: <20120704013258.GA26225@kvack.org>
From: Benjamin LaHaise <bcrl@kvack.org>
Date: Tue, 3 Jul 2012 21:32:58 -0400
> By default, the ppp_generic code initializes the npmode array that filters
> incoming packet to accept packets for all protocols. This behaviour is
> incorrect, as it results in packets for protocols that an older version
> of a PPP implementation may not be aware of to be incorrectly accepted.
> This behaviour is visible, for example, when sending IPv6 packets across a
> ppp link where pppd has only been configured to use IPv4.
>
> This change should be safe since pppd will correctly set the protocols it
> negotiates to NPMODE_PASS as the appropriate protocols transition to an Up
> state.
>
> Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
As far as I can tell, this has been this way for a very long time.
Therefore it is the application's responsibility to adjust the filters
to suit its needs, and we really can't make such adjustments to this
behavior.
^ permalink raw reply
* Re: TCP transmit performance regression
From: David Miller @ 2012-07-05 10:02 UTC (permalink / raw)
To: eric.dumazet; +Cc: tom.leiming, netdev
In-Reply-To: <1341481760.2583.3579.camel@edumazet-glaptop>
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 05 Jul 2012 11:49:20 +0200
> - ax_skb->data = packet;
That's really scary.
^ permalink raw reply
* Re: [PATCH net-next v2] ipv4: defer fib_compute_spec_dst() call
From: David Miller @ 2012-07-05 10:03 UTC (permalink / raw)
To: eric.dumazet; +Cc: netdev
In-Reply-To: <1341477009.2583.3406.camel@edumazet-glaptop>
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 05 Jul 2012 10:30:09 +0200
> From: Eric Dumazet <edumazet@google.com>
>
> ip_options_compile() can avoid calling fib_compute_spec_dst()
> by default, and perform the call only if needed.
>
> David suggested to add a helper to make the call only once.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
Applied, thanks a lot Eric.
^ permalink raw reply
* Re: [PATCH net-next 00/11] default maximal number of RSS queues in mq drivers
From: David Miller @ 2012-07-05 10:07 UTC (permalink / raw)
To: yuvalmin
Cc: netdev, eilong, divy, ogerlitz, jdmason, anirban.chakraborty,
jitendra.kalsaria, ron.mercer, jeffrey.t.kirsher, mason, gallatin,
sathya.perla, subbu.seetharaman, ajit.khaparde, mcarlson, mchan,
eric.dumazet, bhutchings
In-Reply-To: <1341148740-7375-1-git-send-email-yuvalmin@broadcom.com>
From: "Yuval Mintz" <yuvalmin@broadcom.com>
Date: Sun, 1 Jul 2012 16:18:49 +0300
> Different vendors support different number of RSS queues by default. Today,
> there exists an ethtool API through which users can change the number of
> channels their driver supports; This enables us to pursue the goal of using
> a default number of RSS queues in various multi-queue drivers.
>
> This patch intends to achieve the above default by upper-limiting the number
> of interrupts multi-queue drivers request (by default, not via the new API)
> in correlation with the number of CPUs on the machine.
Applied to net-next, thanks a lot.
^ permalink raw reply
* Re: [PATCH net-next 1/2] ipv6: remove unnecessary codes in tcp_ipv6.c
From: David Miller @ 2012-07-05 10:13 UTC (permalink / raw)
To: eric.dumazet; +Cc: roy.qing.li, netdev
In-Reply-To: <1341220067.5269.37.camel@edumazet-glaptop>
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 02 Jul 2012 11:07:47 +0200
> On Mon, 2012-07-02 at 11:18 +0800, roy.qing.li@gmail.com wrote:
>> From: RongQing.Li <roy.qing.li@gmail.com>
>>
>> opt always equals np->opts, so it is meaningless to define opt, and
>> check if opt does not equal np->opts and then try to free opt.
>>
>> Signed-off-by: RongQing.Li <roy.qing.li@gmail.com>
>> ---
>> net/ipv6/tcp_ipv6.c | 16 +++-------------
>> 1 files changed, 3 insertions(+), 13 deletions(-)
>
> Acked-by: Eric Dumazet <edumazet@google.com>
Ok I now understand better why these changes are correct,
applied.
^ permalink raw reply
* Re: [PATCH net-next 2/2] dccp: remove unnecessary codes in ipv6.c
From: David Miller @ 2012-07-05 10:13 UTC (permalink / raw)
To: eric.dumazet; +Cc: roy.qing.li, netdev
In-Reply-To: <1341220130.5269.38.camel@edumazet-glaptop>
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 02 Jul 2012 11:08:50 +0200
> On Mon, 2012-07-02 at 11:19 +0800, roy.qing.li@gmail.com wrote:
>> From: RongQing.Li <roy.qing.li@gmail.com>
>>
>> opt always equals np->opts, so it is meaningless to define opt, and
>> check if opt does not equal np->opts and then try to free opt.
>>
>> Signed-off-by: RongQing.Li <roy.qing.li@gmail.com>
>> ---
>
> Acked-by: Eric Dumazet <edumazet@google.com>
Also applied, thank you.
^ permalink raw reply
* Re: [PATCH net-next] 6lowpan: revert 'reuse eth_mac_addr()'
From: David Miller @ 2012-07-05 10:13 UTC (permalink / raw)
To: alex.bluesman.smirnov; +Cc: netdev, danny.kukawka, dbaryshkov
In-Reply-To: <1341208726-2793-1-git-send-email-alex.bluesman.smirnov@gmail.com>
From: Alexander Smirnov <alex.bluesman.smirnov@gmail.com>
Date: Mon, 2 Jul 2012 09:58:46 +0400
> This reverts the commit cdf49c283e2e105da86ca575ad35b453f5ff24ea which
> replaces lowpan '.ndo_set_mac_address' method by ethernet's one.
>
> According to the IEEE 802.15.4 standard, a device has an 8-byte address,
> so this hook loses the last 2 bytes, which may raise compatibility problems
> with other IEEE 802.15.4 standard implementations.
>
> Signed-off-by: Alexander Smirnov <alex.bluesman.smirnov@gmail.com>
Applied.
^ permalink raw reply
* Re: [PATCH net-next 2/2] drivers/ieee802154/at231rf230: remove unused return status
From: David Miller @ 2012-07-05 10:13 UTC (permalink / raw)
To: alex.bluesman.smirnov; +Cc: netdev, dbaryshkov
In-Reply-To: <1341209912-6030-3-git-send-email-alex.bluesman.smirnov@gmail.com>
From: Alexander Smirnov <alex.bluesman.smirnov@gmail.com>
Date: Mon, 2 Jul 2012 10:18:32 +0400
> Remove excessive variable used for the return status.
>
> Signed-off-by: Alexander Smirnov <alex.bluesman.smirnov@gmail.com>
Applied.
^ permalink raw reply
* Re: [PATCH 0/5] rtcache remove respin
From: David Miller @ 2012-07-05 10:15 UTC (permalink / raw)
To: eric.dumazet; +Cc: netdev
In-Reply-To: <1341225841.5269.69.camel@edumazet-glaptop>
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 02 Jul 2012 12:44:01 +0200
> If we still want __refcnt being on cache line boundary, we might find a
> better way to accomplish this.
Back to this issue again.
Eric, if you take a look at net-next right now, I left a dummy padding
in dst_entry where the neighbour pointer used to be.
Can you come up with some way to make use of that new space?
^ permalink raw reply
* Re: [PATCH net] net: qmi_wwan: add ZTE MF60
From: David Miller @ 2012-07-05 10:16 UTC (permalink / raw)
To: bjorn; +Cc: netdev
In-Reply-To: <1341251597-21124-1-git-send-email-bjorn@mork.no>
From: Bjørn Mork <bjorn@mork.no>
Date: Mon, 2 Jul 2012 19:53:17 +0200
> +static const struct driver_info qmi_wwan_force_int2 = {
Please get rid of that strange tab character and resubmit.
^ permalink raw reply
* [net-next RFC V5 0/5] Multiqueue virtio-net
From: Jason Wang @ 2012-07-05 10:29 UTC (permalink / raw)
To: mst, mashirle, krkumar2, habanero, rusty, netdev, linux-kernel,
virtualization, edumazet, tahm, jwhan, davem
Cc: kvm, sri
Hello All:
This series is an updated version of the multiqueue virtio-net driver, based on
Krishna Kumar's work, to let virtio-net use multiple rx/tx queues for
packet reception and transmission. Please review and comment.
Test Environment:
- Intel(R) Xeon(R) CPU E5620 @ 2.40GHz, 8 cores 2 numa nodes
- Two directly connected 82599
Test Summary:
- Highlights: huge improvements on TCP_RR test
- Lowlights: regression on small packet transmission, higher cpu utilization
than single queue, need further optimization
Analysis of the performance result:
- I counted the number of packets sent/received during the test, and
multiqueue shows much higher capacity in terms of packets per second.
- For the tx regression, multiqueue sends about 1-2 times more packets
compared to single queue, and the packet sizes were much smaller than with
single queue. I suspect TCP does less batching with multiqueue; when I hacked
tcp_write_xmit() to force more batching, multiqueue worked as well as
single queue for both small-packet transmission and throughput.
- I didn't include accelerated RFS for virtio-net in this series as it still
needs further shaping; for those interested in this, please see:
http://www.mail-archive.com/kvm@vger.kernel.org/msg64111.html
Changes from V4:
- Add ability to negotiate the number of queues through control virtqueue
- Ethtool -{L|l} support and default the tx/rx queue number to 1
- Expose the API to set irq affinity instead of irq itself
Changes from V3:
- Rebase to the net-next
- Let queue 2 be the control virtqueue to obey the spec
- Provide irq affinity
- Choose txq based on processor id
References:
- V4: https://lkml.org/lkml/2012/6/25/120
- V3: http://lwn.net/Articles/467283/
Test result:
1) 1 vm 2 vcpu 1q vs 2q, 1 - 1q, 2 - 2q, no pinning
- Guest to External Host TCP STREAM
sessions size throughput1 throughput2 norm1 norm2
1 64 650.55 655.61 100% 24.88 24.86 99%
2 64 1446.81 1309.44 90% 30.49 27.16 89%
4 64 1430.52 1305.59 91% 30.78 26.80 87%
8 64 1450.89 1270.82 87% 30.83 25.95 84%
1 256 1699.45 1779.58 104% 56.75 59.08 104%
2 256 4902.71 3446.59 70% 98.53 62.78 63%
4 256 4803.76 2980.76 62% 97.44 54.68 56%
8 256 5128.88 3158.74 61% 104.68 58.61 55%
1 512 2837.98 2838.42 100% 89.76 90.41 100%
2 512 6742.59 5495.83 81% 155.03 99.07 63%
4 512 9193.70 5900.17 64% 202.84 106.44 52%
8 512 9287.51 7107.79 76% 202.18 129.08 63%
1 1024 4166.42 4224.98 101% 128.55 129.86 101%
2 1024 6196.94 7823.08 126% 181.80 168.81 92%
4 1024 9113.62 9219.49 101% 235.15 190.93 81%
8 1024 9324.25 9402.66 100% 239.10 179.99 75%
1 2048 7441.63 6534.04 87% 248.01 215.63 86%
2 2048 7024.61 7414.90 105% 225.79 219.62 97%
4 2048 8971.49 9269.00 103% 278.94 220.84 79%
8 2048 9314.20 9359.96 100% 268.36 192.23 71%
1 4096 8282.60 8990.08 108% 277.45 320.05 115%
2 4096 9194.80 9293.78 101% 317.02 248.76 78%
4 4096 9340.73 9313.19 99% 300.34 230.35 76%
8 4096 9148.23 9347.95 102% 279.49 199.43 71%
1 16384 8787.89 8766.31 99% 312.38 316.53 101%
2 16384 9306.35 9156.14 98% 319.53 279.83 87%
4 16384 9177.81 9307.50 101% 312.69 230.07 73%
8 16384 9035.82 9188.00 101% 298.32 199.17 66%
- TCP RR
sessions size throughput1 throughput2 norm1 norm2
50 1 54695.41 84164.98 153% 1957.33 1901.31 97%
100 1 60141.88 88598.94 147% 2157.90 2000.45 92%
250 1 74763.56 135584.22 181% 2541.94 2628.59 103%
50 64 51628.38 82867.50 160% 1872.55 1812.16 96%
100 64 60367.73 84080.60 139% 2215.69 1867.69 84%
250 64 68502.70 124910.59 182% 2321.43 2495.76 107%
50 128 53477.08 77625.07 145% 1905.10 1870.99 98%
100 128 59697.56 74902.37 125% 2230.66 1751.03 78%
250 128 71248.74 133963.55 188% 2453.12 2711.72 110%
50 256 47663.86 67742.63 142% 1880.45 1735.30 92%
100 256 54051.84 68738.57 127% 2123.03 1778.59 83%
250 256 68250.06 124487.90 182% 2321.89 2598.60 111%
- External Host to Guest TCP STREAM
sessions size throughput1 throughput2 norm1 norm2
1 64 847.71 864.83 102% 57.99 57.93 99%
2 64 1690.82 1544.94 91% 80.13 55.09 68%
4 64 3434.98 3455.53 100% 127.17 89.00 69%
8 64 5890.19 6557.35 111% 194.70 146.52 75%
1 256 2094.04 2109.14 100% 130.73 127.14 97%
2 256 5218.13 3731.97 71% 219.15 114.02 52%
4 256 6734.51 9213.47 136% 227.87 208.31 91%
8 256 6452.86 9402.78 145% 224.83 207.77 92%
1 512 3945.07 4203.68 106% 279.72 273.30 97%
2 512 7878.96 8122.55 103% 278.25 231.71 83%
4 512 7645.89 9402.13 122% 252.10 217.42 86%
8 512 6657.06 9403.71 141% 239.81 214.89 89%
1 1024 5729.06 5111.21 89% 289.38 303.09 104%
2 1024 8097.27 8159.67 100% 269.29 242.97 90%
4 1024 7778.93 8919.02 114% 261.28 205.50 78%
8 1024 6458.02 9360.02 144% 221.26 208.09 94%
1 2048 6426.94 5195.59 80% 292.52 307.47 105%
2 2048 8221.90 9025.66 109% 283.80 242.25 85%
4 2048 7364.72 8527.79 115% 248.10 198.36 79%
8 2048 6760.63 9161.07 135% 230.53 205.12 88%
1 4096 7247.02 6874.21 94% 276.23 287.68 104%
2 4096 8346.04 8818.65 105% 281.49 254.81 90%
4 4096 6710.00 9354.59 139% 216.41 210.13 97%
8 4096 6265.69 9406.87 150% 206.69 210.92 102%
1 16384 8159.50 8048.79 98% 266.94 283.11 106%
2 16384 8525.66 8552.41 100% 294.36 239.27 81%
4 16384 6042.24 8447.86 139% 200.21 196.40 98%
8 16384 6432.63 9403.49 146% 211.48 206.13 97%
2) 1 vm 4 vcpu 1q vs 4q, 1 - 1q, 2 - 4q, no pinning
- Guest to External Host TCP STREAM
sessions size throughput1 throughput2 norm1 norm2
1 64 636.93 657.69 103% 23.55 24.42 103%
2 64 1457.46 1268.78 87% 30.97 26.02 84%
4 64 3062.86 2302.43 75% 41.00 29.64 72%
8 64 3107.68 2308.32 74% 41.62 29.07 69%
1 256 1743.50 1750.11 100% 59.00 56.63 95%
2 256 4582.61 2870.31 62% 92.47 51.97 56%
4 256 8440.96 4795.37 56% 135.10 56.39 41%
8 256 9240.31 6654.82 72% 144.76 74.89 51%
1 512 2918.25 2735.26 93% 91.08 86.47 94%
2 512 8978.32 5107.95 56% 200.00 94.97 47%
4 512 8850.39 6864.37 77% 190.32 101.09 53%
8 512 9270.30 8483.01 91% 193.44 118.73 61%
1 1024 4416.10 3679.70 83% 135.54 110.63 81%
2 1024 9085.20 8770.48 96% 242.23 175.59 72%
4 1024 9158.57 9011.56 98% 234.39 159.17 67%
8 1024 9345.89 9067.43 97% 233.35 138.73 59%
1 2048 8455.19 6077.94 71% 338.52 190.16 56%
2 2048 9223.32 8237.73 89% 270.00 198.27 73%
4 2048 9080.75 9257.63 101% 261.30 172.80 66%
8 2048 9177.39 8977.10 97% 256.89 147.50 57%
1 4096 8665.35 8394.78 96% 289.63 289.85 100%
2 4096 7850.73 8857.86 112% 253.33 252.62 99%
4 4096 9332.55 8508.37 91% 289.19 151.29 52%
8 4096 8482.30 9146.80 107% 255.41 156.02 61%
1 16384 8825.72 8778.26 99% 314.60 308.89 98%
2 16384 9283.85 8927.40 96% 316.48 246.98 78%
4 16384 7766.95 8708.06 112% 265.25 155.59 58%
8 16384 8945.55 8940.23 99% 298.45 151.32 50%
- TCP_RR
sessions size throughput1 throughput2 norm1 norm2
50 1 60848.70 81719.39 134% 2196.86 1551.05 70%
100 1 61886.19 81425.02 131% 2215.76 1517.52 68%
250 1 72058.41 162597.84 225% 2441.84 2278.14 93%
50 64 51646.93 74160.10 143% 1861.07 1322.22 71%
100 64 57574.86 83488.26 145% 2076.54 1479.79 71%
250 64 67583.35 138482.15 204% 2314.46 2022.83 87%
50 128 59931.51 71633.03 119% 2244.60 1309.18 58%
100 128 58329.80 73104.90 125% 2202.98 1329.52 60%
250 128 71021.55 161067.73 226% 2469.11 2205.28 89%
50 256 47509.24 64330.24 135% 1915.75 1269.90 66%
100 256 49293.03 68507.94 138% 1939.75 1263.64 65%
250 256 63169.07 138390.68 219% 2255.47 2098.13 93%
- External Host to Guest TCP STREAM
sessions size throughput1 throughput2 norm1 norm2
1 64 850.18 854.96 100% 56.94 58.25 102%
2 64 1659.12 1730.25 104% 81.65 67.57 82%
4 64 3254.70 3397.17 104% 118.57 76.21 64%
8 64 6251.97 6389.29 102% 207.68 104.21 50%
1 256 2029.14 2105.18 103% 116.45 119.69 102%
2 256 5412.02 4260.32 78% 240.87 139.73 58%
4 256 7777.28 8743.12 112% 263.20 174.65 66%
8 256 6459.51 9388.93 145% 218.94 158.37 72%
1 512 4566.31 4269.30 93% 274.74 289.83 105%
2 512 7444.52 8240.64 110% 286.24 243.74 85%
4 512 7722.29 9391.16 121% 261.96 180.36 68%
8 512 6228.50 9134.52 146% 209.17 161.00 76%
1 1024 4965.50 4953.68 99% 307.64 280.48 91%
2 1024 8270.08 7733.71 93% 288.32 197.04 68%
4 1024 7551.04 9394.58 124% 268.41 206.62 76%
8 1024 6307.78 9179.03 145% 216.67 159.63 73%
1 2048 5741.12 5948.80 103% 290.34 268.66 92%
2 2048 7932.79 8766.05 110% 262.96 215.90 82%
4 2048 6907.55 9255.97 133% 233.56 203.96 87%
8 2048 6037.22 9399.41 155% 197.14 164.09 83%
1 4096 7131.70 7535.10 105% 279.43 275.12 98%
2 4096 8109.17 9348.04 115% 274.29 211.49 77%
4 4096 6878.92 9319.13 135% 244.21 192.06 78%
8 4096 6265.92 9408.35 150% 211.85 159.26 75%
1 16384 8288.01 8596.39 103% 272.85 290.22 106%
2 16384 8166.29 9280.12 113% 277.04 236.61 85%
4 16384 6446.97 9382.22 145% 222.91 187.24 83%
8 16384 6066.98 9405.51 155% 198.98 157.09 78%
3) 2 vms each with 2 vcpus, 1q vs 2q - pin vhost/vcpu in the same node
- 2 Guests to External Hosts TCP STREAM
sessions size throughput1 throughput2 norm1 norm2
1 64 1442.07 1475.11 102% 30.82 31.21 101%
2 64 3124.87 2900.93 92% 40.29 35.95 89%
4 64 3166.52 2864.04 90% 40.70 35.47 87%
8 64 3141.45 2848.94 90% 40.38 35.34 87%
1 256 3628.54 3711.73 102% 68.47 70.22 102%
2 256 7806.95 7586.69 97% 111.23 84.38 75%
4 256 8823.65 7612.74 86% 132.92 85.04 63%
8 256 9194.89 9373.41 101% 135.98 119.62 87%
1 512 7106.67 7128.00 100% 124.79 124.30 99%
2 512 9190.22 9397.33 102% 180.84 149.34 82%
4 512 9401.01 9376.67 99% 173.00 140.15 81%
8 512 8572.84 9032.90 105% 150.49 127.58 84%
1 1024 9361.93 9379.24 100% 205.81 202.94 98%
2 1024 9386.69 9389.04 100% 201.78 165.75 82%
4 1024 9403.43 9378.54 99% 195.33 152.06 77%
8 1024 9213.63 9180.64 99% 178.99 141.51 79%
1 2048 9338.95 9384.67 100% 223.22 227.86 102%
2 2048 9389.28 9389.45 100% 202.37 170.08 84%
4 2048 9405.86 9388.71 99% 193.76 161.54 83%
8 2048 9352.40 9384.06 100% 189.16 157.06 83%
1 4096 9380.74 9384.90 100% 239.37 241.56 100%
2 4096 9393.47 9376.74 99% 213.84 195.61 91%
4 4096 9393.85 9381.50 99% 198.06 170.18 85%
8 4096 9400.41 9232.31 98% 192.87 163.56 84%
1 16384 9348.18 9335.55 99% 253.02 254.86 100%
2 16384 9384.97 9359.53 99% 218.56 208.59 95%
4 16384 9326.60 9382.15 100% 206.24 179.72 87%
8 16384 9355.82 9392.85 100% 198.22 172.89 87%
- TCP RR
sessions size throughput1 throughput2 norm1 norm2
50 1 200340.33 261750.19 130% 2935.27 3018.59 102%
100 1 236141.58 266304.49 112% 3452.16 3071.74 88%
250 1 361574.59 320825.08 88% 4972.98 3705.70 74%
50 64 225748.53 242671.12 107% 3011.48 2869.07 95%
100 64 249885.37 260453.72 104% 3240.21 3063.67 94%
250 64 360341.12 310775.60 86% 4682.42 3657.91 78%
50 128 227995.27 289320.38 126% 2950.92 3479.37 117%
100 128 239491.11 291135.77 121% 3099.55 3508.75 113%
250 128 390390.68 362484.35 92% 5042.30 4368.52 86%
50 256 222604.51 317140.97 142% 3058.08 3839.39 125%
100 256 254770.92 335606.03 131% 3326.16 4046.65 121%
250 256 400584.52 436749.22 109% 5220.79 5278.86 101%
- External Host to 2 Guests
sessions size throughput1 throughput2 norm1 norm2
1 64 1667.99 1684.50 100% 59.66 60.77 101%
2 64 3338.83 3379.97 101% 83.61 64.82 77%
4 64 6613.65 6619.11 100% 131.00 97.19 74%
8 64 6553.07 6418.31 97% 141.35 98.27 69%
1 256 3938.40 4068.52 103% 125.21 123.76 98%
2 256 9215.57 9210.88 99% 185.31 154.27 83%
4 256 9407.29 9008.13 95% 186.72 150.01 80%
8 256 9377.17 9385.57 100% 190.28 137.59 72%
1 512 7360.19 6984.80 94% 214.09 211.66 98%
2 512 9392.91 9401.88 100% 193.92 173.11 89%
4 512 9382.64 9394.34 100% 189.27 145.80 77%
8 512 9308.60 9094.08 97% 189.70 141.26 74%
1 1024 9153.26 9066.06 99% 223.07 219.95 98%
2 1024 9393.38 9398.43 100% 194.02 173.82 89%
4 1024 9395.92 8960.73 95% 192.61 145.82 75%
8 1024 9388.92 9399.08 100% 191.18 143.87 75%
1 2048 9355.32 9240.63 98% 221.50 223.03 100%
2 2048 9395.68 9399.62 100% 193.31 177.21 91%
4 2048 9397.67 9399.56 100% 195.25 157.53 80%
8 2048 9397.89 9401.70 100% 197.57 146.96 74%
1 4096 9375.84 9381.72 100% 223.06 225.06 100%
2 4096 9389.47 9396.00 100% 193.91 197.13 101%
4 4096 9397.45 9400.11 100% 192.33 163.60 85%
8 4096 9105.40 9415.76 103% 192.71 140.41 72%
1 16384 9381.53 9381.40 99% 223.53 225.66 100%
2 16384 9387.90 9395.44 100% 193.34 177.03 91%
4 16384 9397.92 9410.98 100% 195.04 151.14 77%
8 16384 9259.00 9419.48 101% 194.91 153.48 78%
4) Local vm to vm 2 vcpu 1q vs 2q - pin vcpu/thread in the same numa node
- VM to VM TCP STREAM
sessions size throughput1 throughput2 norm1 norm2
1 64 576.05 576.14 100% 12.25 12.32 100%
2 64 1266.75 1160.04 91% 19.10 16.05 84%
4 64 1267.34 1123.70 88% 19.08 15.51 81%
8 64 1230.88 1174.70 95% 18.53 15.58 84%
1 256 1311.00 1303.02 99% 25.34 25.35 100%
2 256 5400.26 2794.00 51% 75.92 36.43 47%
4 256 5200.67 2818.88 54% 72.81 33.92 46%
8 256 5234.55 2893.74 55% 73.10 34.97 47%
1 512 3244.09 3263.72 100% 56.48 56.65 100%
2 512 8172.16 4661.15 57% 119.05 67.89 57%
4 512 10567.44 7063.25 66% 147.76 77.27 52%
8 512 10477.87 8471.33 80% 145.94 102.91 70%
1 1024 5432.54 5333.99 98% 93.69 92.38 98%
2 1024 12590.24 9259.97 73% 185.37 135.28 72%
4 1024 15600.53 10731.93 68% 222.20 123.60 55%
8 1024 16222.87 10704.85 65% 227.05 113.81 50%
1 2048 6667.61 7484.37 112% 116.75 129.72 111%
2 2048 8180.43 11500.88 140% 137.84 156.64 113%
4 2048 15127.93 14416.16 95% 227.60 154.59 67%
8 2048 16381.79 14794.10 90% 244.29 158.45 64%
1 4096 7375.63 8948.90 121% 131.97 156.57 118%
2 4096 9321.16 14443.21 154% 161.24 163.74 101%
4 4096 13028.45 15984.94 122% 212.78 171.26 80%
8 4096 15611.28 18810.54 120% 245.15 198.65 81%
1 16384 15304.38 14202.08 92% 259.94 244.04 93%
2 16384 15508.97 15913.09 102% 261.30 244.26 93%
4 16384 14859.98 20164.34 135% 248.29 214.26 86%
8 16384 15594.59 19960.99 127% 253.79 211.27 83%
- TCP RR
sessions size throughput1 throughput2 norm1 norm2
50 1 54972.51 69820.99 127% 1133.58 1063.58 93%
100 1 55847.16 72407.93 129% 1155.73 1024.35 88%
250 1 60066.23 108266.50 180% 1114.30 1323.55 118%
50 64 48727.63 62378.32 128% 1014.29 888.78 87%
100 64 51804.65 69250.51 133% 1077.78 986.97 91%
250 64 61278.68 100015.78 163% 1076.93 1243.18 115%
50 256 51593.29 62046.22 120% 1069.14 871.08 81%
100 256 51647.00 68197.43 132% 1071.66 958.51 89%
250 256 60433.88 99072.59 163% 1072.41 1199.10 111%
50 512 52177.79 66483.77 127% 1082.65 960.82 88%
100 512 50351.67 62537.63 124% 1041.61 876.41 84%
250 512 60510.14 103856.79 171% 1055.21 1245.17 118%
Jason Wang (4):
virtio_ring: move queue_index to vring_virtqueue
virtio: introduce an API to set affinity for a virtqueue
virtio_net: multiqueue support
virtio_net: support negotiating the number of queues through ctrl vq
Krishna Kumar (1):
virtio_net: Introduce VIRTIO_NET_F_MULTIQUEUE
drivers/net/virtio_net.c | 792 +++++++++++++++++++++++++++++------------
drivers/virtio/virtio_mmio.c | 5 +-
drivers/virtio/virtio_pci.c | 58 +++-
drivers/virtio/virtio_ring.c | 17 +
include/linux/virtio.h | 4 +
include/linux/virtio_config.h | 21 ++
include/linux/virtio_net.h | 10 +
7 files changed, 677 insertions(+), 230 deletions(-)
^ permalink raw reply
* [net-next RFC V5 1/5] virtio_net: Introduce VIRTIO_NET_F_MULTIQUEUE
From: Jason Wang @ 2012-07-05 10:29 UTC (permalink / raw)
To: mst, mashirle, krkumar2, habanero, rusty, netdev, linux-kernel,
virtualization, edumazet, tahm, jwhan, davem
Cc: kvm, sri
In-Reply-To: <1341484194-8108-1-git-send-email-jasowang@redhat.com>
From: Krishna Kumar <krkumar2@in.ibm.com>
Introduce VIRTIO_NET_F_MULTIQUEUE.
Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
include/linux/virtio_net.h | 1 +
1 files changed, 1 insertions(+), 0 deletions(-)
diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index 2470f54..1bc7e30 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -51,6 +51,7 @@
#define VIRTIO_NET_F_CTRL_RX_EXTRA 20 /* Extra RX mode control support */
#define VIRTIO_NET_F_GUEST_ANNOUNCE 21 /* Guest can announce device on the
* network */
+#define VIRTIO_NET_F_MULTIQUEUE 22 /* Device supports multiple TXQ/RXQ */
#define VIRTIO_NET_S_LINK_UP 1 /* Link is up */
#define VIRTIO_NET_S_ANNOUNCE 2 /* Announcement is needed */
--
1.7.1
^ permalink raw reply related
* [net-next RFC V5 2/5] virtio_ring: move queue_index to vring_virtqueue
From: Jason Wang @ 2012-07-05 10:29 UTC (permalink / raw)
To: mst, mashirle, krkumar2, habanero, rusty, netdev, linux-kernel,
virtualization, edumazet, tahm, jwhan, davem
Cc: kvm, sri
In-Reply-To: <1341484194-8108-1-git-send-email-jasowang@redhat.com>
Instead of storing the queue index in the per-transport virtio info structures,
this patch moves it to vring_virtqueue and introduces helpers to set and get the
value. This simplifies management and tracing.
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/virtio/virtio_mmio.c | 5 +----
drivers/virtio/virtio_pci.c | 12 +++++-------
drivers/virtio/virtio_ring.c | 17 +++++++++++++++++
include/linux/virtio.h | 4 ++++
4 files changed, 27 insertions(+), 11 deletions(-)
diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c
index 453db0c..f5432b6 100644
--- a/drivers/virtio/virtio_mmio.c
+++ b/drivers/virtio/virtio_mmio.c
@@ -131,9 +131,6 @@ struct virtio_mmio_vq_info {
/* the number of entries in the queue */
unsigned int num;
- /* the index of the queue */
- int queue_index;
-
/* the virtual address of the ring queue */
void *queue;
@@ -324,7 +321,6 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index,
err = -ENOMEM;
goto error_kmalloc;
}
- info->queue_index = index;
/* Allocate pages for the queue - start with a queue as big as
* possible (limited by maximum size allowed by device), drop down
@@ -363,6 +359,7 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index,
goto error_new_virtqueue;
}
+ virtqueue_set_queue_index(vq, index);
vq->priv = info;
info->vq = vq;
diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
index 2e03d41..adb24f2 100644
--- a/drivers/virtio/virtio_pci.c
+++ b/drivers/virtio/virtio_pci.c
@@ -79,9 +79,6 @@ struct virtio_pci_vq_info
/* the number of entries in the queue */
int num;
- /* the index of the queue */
- int queue_index;
-
/* the virtual address of the ring queue */
void *queue;
@@ -202,11 +199,11 @@ static void vp_reset(struct virtio_device *vdev)
static void vp_notify(struct virtqueue *vq)
{
struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
- struct virtio_pci_vq_info *info = vq->priv;
/* we write the queue's selector into the notification register to
* signal the other end */
- iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY);
+ iowrite16(virtqueue_get_queue_index(vq),
+ vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY);
}
/* Handle a configuration change: Tell driver if it wants to know. */
@@ -402,7 +399,6 @@ static struct virtqueue *setup_vq(struct virtio_device *vdev, unsigned index,
if (!info)
return ERR_PTR(-ENOMEM);
- info->queue_index = index;
info->num = num;
info->msix_vector = msix_vec;
@@ -425,6 +421,7 @@ static struct virtqueue *setup_vq(struct virtio_device *vdev, unsigned index,
goto out_activate_queue;
}
+ virtqueue_set_queue_index(vq, index);
vq->priv = info;
info->vq = vq;
@@ -467,7 +464,8 @@ static void vp_del_vq(struct virtqueue *vq)
list_del(&info->node);
spin_unlock_irqrestore(&vp_dev->lock, flags);
- iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
+ iowrite16(virtqueue_get_queue_index(vq),
+ vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
if (vp_dev->msix_enabled) {
iowrite16(VIRTIO_MSI_NO_VECTOR,
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 5aa43c3..9c5aeea 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -106,6 +106,9 @@ struct vring_virtqueue
/* How to notify other side. FIXME: commonalize hcalls! */
void (*notify)(struct virtqueue *vq);
+ /* Index of the queue */
+ int queue_index;
+
#ifdef DEBUG
/* They're supposed to lock for us. */
unsigned int in_use;
@@ -171,6 +174,20 @@ static int vring_add_indirect(struct vring_virtqueue *vq,
return head;
}
+void virtqueue_set_queue_index(struct virtqueue *_vq, int queue_index)
+{
+ struct vring_virtqueue *vq = to_vvq(_vq);
+ vq->queue_index = queue_index;
+}
+EXPORT_SYMBOL_GPL(virtqueue_set_queue_index);
+
+int virtqueue_get_queue_index(struct virtqueue *_vq)
+{
+ struct vring_virtqueue *vq = to_vvq(_vq);
+ return vq->queue_index;
+}
+EXPORT_SYMBOL_GPL(virtqueue_get_queue_index);
+
/**
* virtqueue_add_buf - expose buffer to other end
* @vq: the struct virtqueue we're talking about.
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 8efd28a..0d8ed46 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -50,6 +50,10 @@ void *virtqueue_detach_unused_buf(struct virtqueue *vq);
unsigned int virtqueue_get_vring_size(struct virtqueue *vq);
+void virtqueue_set_queue_index(struct virtqueue *vq, int queue_index);
+
+int virtqueue_get_queue_index(struct virtqueue *vq);
+
/**
* virtio_device - representation of a device using virtio
* @index: unique position on the virtio bus
--
1.7.1
^ permalink raw reply related
* [net-next RFC V5 3/5] virtio: introduce an API to set affinity for a virtqueue
From: Jason Wang @ 2012-07-05 10:29 UTC (permalink / raw)
To: mst, mashirle, krkumar2, habanero, rusty, netdev, linux-kernel,
virtualization, edumazet, tahm, jwhan, davem
Cc: kvm, sri
In-Reply-To: <1341484194-8108-1-git-send-email-jasowang@redhat.com>
Sometimes a virtio device needs to configure an irq affinity hint to maximize
performance. Instead of just exposing the irq of a virtqueue, this patch
introduces an API to set the affinity for a virtqueue.
The API is best-effort: the affinity hint may not be set as expected due to
platform support, irq sharing or irq type. Currently only the PCI method is
implemented, and we set the affinity according to:
- if device uses INTX, we just ignore the request
- if device has per vq vector, we force the affinity hint
- if the virtqueues share MSI, make the affinity OR over all affinities
requested
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/virtio/virtio_pci.c | 46 +++++++++++++++++++++++++++++++++++++++++
include/linux/virtio_config.h | 21 ++++++++++++++++++
2 files changed, 67 insertions(+), 0 deletions(-)
diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
index adb24f2..2ff0451 100644
--- a/drivers/virtio/virtio_pci.c
+++ b/drivers/virtio/virtio_pci.c
@@ -48,6 +48,7 @@ struct virtio_pci_device
int msix_enabled;
int intx_enabled;
struct msix_entry *msix_entries;
+ cpumask_var_t *msix_affinity_masks;
/* Name strings for interrupts. This size should be enough,
* and I'm too lazy to allocate each name separately. */
char (*msix_names)[256];
@@ -276,6 +277,10 @@ static void vp_free_vectors(struct virtio_device *vdev)
for (i = 0; i < vp_dev->msix_used_vectors; ++i)
free_irq(vp_dev->msix_entries[i].vector, vp_dev);
+ for (i = 0; i < vp_dev->msix_vectors; i++)
+ if (vp_dev->msix_affinity_masks[i])
+ free_cpumask_var(vp_dev->msix_affinity_masks[i]);
+
if (vp_dev->msix_enabled) {
/* Disable the vector used for configuration */
iowrite16(VIRTIO_MSI_NO_VECTOR,
@@ -293,6 +298,8 @@ static void vp_free_vectors(struct virtio_device *vdev)
vp_dev->msix_names = NULL;
kfree(vp_dev->msix_entries);
vp_dev->msix_entries = NULL;
+ kfree(vp_dev->msix_affinity_masks);
+ vp_dev->msix_affinity_masks = NULL;
}
static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors,
@@ -311,6 +318,15 @@ static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors,
GFP_KERNEL);
if (!vp_dev->msix_names)
goto error;
+ vp_dev->msix_affinity_masks
+ = kzalloc(nvectors * sizeof *vp_dev->msix_affinity_masks,
+ GFP_KERNEL);
+ if (!vp_dev->msix_affinity_masks)
+ goto error;
+ for (i = 0; i < nvectors; ++i)
+ if (!alloc_cpumask_var(&vp_dev->msix_affinity_masks[i],
+ GFP_KERNEL))
+ goto error;
for (i = 0; i < nvectors; ++i)
vp_dev->msix_entries[i].entry = i;
@@ -607,6 +623,35 @@ static const char *vp_bus_name(struct virtio_device *vdev)
return pci_name(vp_dev->pci_dev);
}
+/* Setup the affinity for a virtqueue:
+ * - force the affinity for per vq vector
+ * - OR over all affinities for shared MSI
+ * - ignore the affinity request if we're using INTX
+ */
+static int vp_set_vq_affinity(struct virtqueue *vq, int cpu)
+{
+ struct virtio_device *vdev = vq->vdev;
+ struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+ struct virtio_pci_vq_info *info = vq->priv;
+ struct cpumask *mask;
+ unsigned int irq;
+
+ if (!vq->callback)
+ return -EINVAL;
+
+ if (vp_dev->msix_enabled) {
+ mask = vp_dev->msix_affinity_masks[info->msix_vector];
+ irq = vp_dev->msix_entries[info->msix_vector].vector;
+ if (cpu == -1)
+ irq_set_affinity_hint(irq, NULL);
+ else {
+ cpumask_set_cpu(cpu, mask);
+ irq_set_affinity_hint(irq, mask);
+ }
+ }
+ return 0;
+}
+
static struct virtio_config_ops virtio_pci_config_ops = {
.get = vp_get,
.set = vp_set,
@@ -618,6 +663,7 @@ static struct virtio_config_ops virtio_pci_config_ops = {
.get_features = vp_get_features,
.finalize_features = vp_finalize_features,
.bus_name = vp_bus_name,
+ .set_vq_affinity = vp_set_vq_affinity,
};
static void virtio_pci_release_dev(struct device *_d)
diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index fc457f4..2c4a989 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -98,6 +98,7 @@
* vdev: the virtio_device
* This returns a pointer to the bus name a la pci_name from which
* the caller can then copy.
+ * @set_vq_affinity: set the affinity for a virtqueue.
*/
typedef void vq_callback_t(struct virtqueue *);
struct virtio_config_ops {
@@ -116,6 +117,7 @@ struct virtio_config_ops {
u32 (*get_features)(struct virtio_device *vdev);
void (*finalize_features)(struct virtio_device *vdev);
const char *(*bus_name)(struct virtio_device *vdev);
+ int (*set_vq_affinity)(struct virtqueue *vq, int cpu);
};
/* If driver didn't advertise the feature, it will never appear. */
@@ -190,5 +192,24 @@ const char *virtio_bus_name(struct virtio_device *vdev)
return vdev->config->bus_name(vdev);
}
+/**
+ * virtqueue_set_affinity - setting affinity for a virtqueue
+ * @vq: the virtqueue
+ * @cpu: the cpu no.
+ *
+ * Pay attention the function are best-effort: the affinity hint may not be set
+ * due to config support, irq type and sharing.
+ *
+ */
+static inline
+int virtqueue_set_affinity(struct virtqueue *vq, int cpu)
+{
+ struct virtio_device *vdev = vq->vdev;
+ if (vdev->config->set_vq_affinity)
+ return vdev->config->set_vq_affinity(vq, cpu);
+ return 0;
+}
+
+
#endif /* __KERNEL__ */
#endif /* _LINUX_VIRTIO_CONFIG_H */
--
1.7.1
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox