Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [PATCH 2/4] net: remove obsolete simple_strto<foo>
From: Neil Horman @ 2012-12-07 15:22 UTC (permalink / raw)
  To: Abhijit Pawar
  Cc: David S. Miller, Pablo Neira Ayuso, Patrick McHardy,
	Alexey Kuznetsov, James Morris, Hideaki YOSHIFUJI,
	John W. Linville, Johannes Berg, Cong Wang, Eric Dumazet,
	Joe Perches, netdev, linux-kernel, netfilter-devel, netfilter,
	coreteam, linux-wireless
In-Reply-To: <1354880998-23417-1-git-send-email-abhi.c.pawar@gmail.com>

On Fri, Dec 07, 2012 at 05:19:58PM +0530, Abhijit Pawar wrote:
> This patch replaces the obsolete simple_strto<foo> with kstrto<foo>
> 
> Signed-off-by: Abhijit Pawar <abhi.c.pawar@gmail.com>
> ---
>  net/core/netpoll.c                 |    9 +++++++--
>  net/ipv4/netfilter/ipt_CLUSTERIP.c |    9 +++++++--
>  net/mac80211/debugfs_sta.c         |    4 +++-
>  net/netfilter/nf_conntrack_core.c  |    6 ++++--
>  4 files changed, 21 insertions(+), 7 deletions(-)
> 
> diff --git a/net/core/netpoll.c b/net/core/netpoll.c
> index 77a0388..596b127 100644
> --- a/net/core/netpoll.c
> +++ b/net/core/netpoll.c
> @@ -668,13 +668,16 @@ EXPORT_SYMBOL(netpoll_print_options);
>  
>  int netpoll_parse_options(struct netpoll *np, char *opt)
>  {
> +	int rc;
>  	char *cur=opt, *delim;
>  
>  	if (*cur != '@') {
>  		if ((delim = strchr(cur, '@')) == NULL)
>  			goto parse_failed;
>  		*delim = 0;
> -		np->local_port = simple_strtol(cur, NULL, 10);
> +		rc = kstrtol(cur, 10, &np->local_port);
> +		if (rc)
> +			goto parse_failed;
Perhaps consolidate this to:
if (kstrtol(cur, 10, &np->local_port))
	goto parse_failed;

Then you don't have to declare the new stack variable

>  		cur = delim;
>  	}
>  	cur++;
> @@ -705,7 +708,9 @@ int netpoll_parse_options(struct netpoll *np, char *opt)
>  		*delim = 0;
>  		if (*cur == ' ' || *cur == '\t')
>  			np_info(np, "warning: whitespace is not allowed\n");
> -		np->remote_port = simple_strtol(cur, NULL, 10);
> +		rc = kstrtol(cur, 10, &np->remote_port);
> +		if (rc)
> +			goto parse_failed;
>  		cur = delim;
Ditto

>  	}
>  	cur++;
> diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
> index fe5daea..55e7b73 100644
> --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
> +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
> @@ -661,6 +661,7 @@ static ssize_t clusterip_proc_write(struct file *file, const char __user *input,
>  #define PROC_WRITELEN	10
>  	char buffer[PROC_WRITELEN+1];
>  	unsigned long nodenum;
> +	int rc;
>  
>  	if (size > PROC_WRITELEN)
>  		return -EIO;
> @@ -669,11 +670,15 @@ static ssize_t clusterip_proc_write(struct file *file, const char __user *input,
>  	buffer[size] = 0;
>  
>  	if (*buffer == '+') {
> -		nodenum = simple_strtoul(buffer+1, NULL, 10);
> +		rc = kstrtoul(buffer+1, 10, &nodenum);
> +		if (rc)
> +			return -EINVAL;
>  		if (clusterip_add_node(c, nodenum))
>  			return -ENOMEM;
>  	} else if (*buffer == '-') {
> -		nodenum = simple_strtoul(buffer+1, NULL,10);
> +		rc = kstrtoul(buffer+1, 10, &nodenum);
> +		if (rc)
> +			return -EINVAL;
>  		if (clusterip_del_node(c, nodenum))
>  			return -ENOENT;
Same deal with the rc variable, although in this case it might make sense to
return rc if kstrtoul fails, instead of just filtering it all down to -EINVAL.

>  	} else
> diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
> index 89281d2..18754fd 100644
> --- a/net/mac80211/debugfs_sta.c
> +++ b/net/mac80211/debugfs_sta.c
> @@ -219,7 +219,9 @@ static ssize_t sta_agg_status_write(struct file *file, const char __user *userbu
>  	} else
>  		return -EINVAL;
>  
> -	tid = simple_strtoul(buf, NULL, 0);
> +	ret = kstrtoul(buf, 0, &tid);
> +	if (ret)
> +		return -EINVAL;
>  
>  	if (tid >= IEEE80211_NUM_TIDS)
>  		return -EINVAL;
> diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> index af17516..18ce24b 100644
> --- a/net/netfilter/nf_conntrack_core.c
> +++ b/net/netfilter/nf_conntrack_core.c
> @@ -1409,7 +1409,7 @@ EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);
>  
>  int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
>  {
> -	int i, bucket;
> +	int i, bucket, rc;
>  	unsigned int hashsize, old_size;
>  	struct hlist_nulls_head *hash, *old_hash;
>  	struct nf_conntrack_tuple_hash *h;
> @@ -1422,7 +1422,9 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
>  	if (!nf_conntrack_htable_size)
>  		return param_set_uint(val, kp);
>  
> -	hashsize = simple_strtoul(val, NULL, 0);
> +	rc = kstrtouint(val, 0, &hashsize);
> +	if (rc)
> +		return -EINVAL;
>  	if (!hashsize)
>  		return -EINVAL;
>  
As above, these call points might benefit from returning rc rather than just
EINVAL.

Neil

> -- 
> 1.7.7.6
> 
> 

^ permalink raw reply

* Tx timestamp for packet mmap.
From: Paul Chavent @ 2012-12-07 15:28 UTC (permalink / raw)
  To: netdev

[-- Attachment #1: Type: text/plain, Size: 1814 bytes --]

Hi.

I would like to be able to get tx timestamps of packets sent by the 
packet mmap interface...

Actually, I try to get them with the sample code below.

The problem is that it doesn't work without the attached patch.

I wonder if my current implementation is good. And if not, how should I 
get the timestamps?

Wouldn't it be a good idea to put timestamps in the ring buffer frame 
before giving the frame back to the user?

Thanks for your help and advice.

Paul.

8<-------------------------------


   struct timespec ts = {0,0};
   struct sockaddr from_addr;
   static uint8_t tmp_data[256];
   struct iovec msg_iov = {tmp_data, sizeof(tmp_data)};
   static uint8_t cmsg_buff[256];
   struct msghdr msghdr = {&from_addr, sizeof(from_addr),
                           &msg_iov, 1,
                           cmsg_buff, sizeof(cmsg_buff),
                           0};
   ssize_t err = recvmsg(itf->sock_fd, &msghdr, MSG_ERRQUEUE);
   if(err < 0)
     {
       perror("recvmsg failed");
       return -1;
     }

   struct cmsghdr *cmsg;
   for(cmsg = CMSG_FIRSTHDR(&msghdr); cmsg != NULL; cmsg = 
CMSG_NXTHDR(&msghdr, cmsg))
     {
       if(cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == 
SCM_TIMESTAMPING)
         {
           ts = *(struct timespec *)CMSG_DATA(cmsg);
#if !defined(NDEBUG)
           if(itf->debug)
             {
               fprintf(stderr, "SCM_TIMESTAMPING available\n");
             }
#else
           break;
#endif
         }
       else if (cmsg->cmsg_level == SOL_PACKET && cmsg->cmsg_type == 
PACKET_TX_TIMESTAMP)
         {
           ts = *(struct timespec *)CMSG_DATA(cmsg);
#if !defined(NDEBUG)
           if(itf->debug)
             {
               fprintf(stderr, "PACKET_TX_TIMESTAMP available\n");
             }
#else
           break;
#endif
         }
     }

[-- Attachment #2: 0002-net-add-tx-timestamp-to-packet-mmap.patch --]
[-- Type: text/x-patch, Size: 763 bytes --]

>From 762a8e89d1453e629bbe9c255c0ba4ec207cca25 Mon Sep 17 00:00:00 2001
From: Paul Chavent <paul.chavent@onera.fr>
Date: Fri, 7 Dec 2012 16:15:44 +0100
Subject: [PATCH 2/2] net : add tx timestamp to packet mmap.

Signed-off-by: Paul Chavent <paul.chavent@onera.fr>
---
 net/packet/af_packet.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index e639645..948748b 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1857,6 +1857,10 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 	void *data;
 	int err;
 
+	err = sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
+	if (err < 0)
+		return err;
+
 	ph.raw = frame;
 
 	skb->protocol = proto;
-- 
1.7.12.1


^ permalink raw reply related

* [PATCH stable] ipv4: avoid passing NULL to inet_putpeer() in icmpv4_xrlim_allow()
From: CAI Qian @ 2012-12-07 15:46 UTC (permalink / raw)
  To: netdev; +Cc: Neal Cardwell, David S. Miller
In-Reply-To: <1411074275.5265862.1354894974886.JavaMail.root@redhat.com>

David, this patch looks applicable for the stable releases.

>From Neal Cardwell <ncardwell@google.com>

inet_getpeer_v4() can return NULL under OOM conditions, and while
inet_peer_xrlim_allow() is OK with a NULL peer, inet_putpeer() will
crash.

This code path now uses the same idiom as the others from:
1d861aa4b3fb08822055345f480850205ffe6170 ("inet: Minimize use of
cached route inetpeer.").

Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

Upstream-ID: e1a676424c290b1c8d757e3860170ac7ecd89af4
Stable-trees: 3.6.x
Signed-off-by: CAI Qian <caiqian@redhat.com>

diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index f2eccd5..17ff9fd 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -257,7 +257,8 @@ static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
 		struct inet_peer *peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1);
 		rc = inet_peer_xrlim_allow(peer,
 					   net->ipv4.sysctl_icmp_ratelimit);
-		inet_putpeer(peer);
+		if (peer)
+			inet_putpeer(peer);
 	}
 out:
 	return rc;

^ permalink raw reply related

* Re: [RFC PATCH] dynamic_queue_limit.h: Make the struct ___cacheline_aligned_on_smp
From: Eric Dumazet @ 2012-12-07 15:55 UTC (permalink / raw)
  To: Joe Perches; +Cc: Tom Herbert, David Miller, netdev
In-Reply-To: <1354892334.29937.14.camel@joe-AO722>

2012/12/7 Joe Perches <joe@perches.com>:
> Given that the struct will always have limit at the start of
> a cacheline, why not make  struct ___cacheline_aligned_on_smp
> and make limit the first member?
>
> It could make other structs that use struct dql a bit more
> predictable or efficient to pack.
>
> (netdev_queue is size reduced from 256 to 192 on x86-32)
>

No, please.

Have you tested this on a range of hardware and checked how it can hurt
performance?

^ permalink raw reply

* [PATCH stable] ipv4: do not cache looped multicasts
From: CAI Qian @ 2012-12-07 15:58 UTC (permalink / raw)
  To: netdev; +Cc: Maxime Bizon, Julian Anastasov, David S. Miller
In-Reply-To: <2138496357.5269916.1354895709722.JavaMail.root@redhat.com>

David, this looks applicable to the stable releases.

>From Maxime Bizon <mbizon@freebox.fr>,

	Starting from 3.6 we cache output routes for
multicasts only when using route to 224/4. For local receivers
we can set RTCF_LOCAL flag depending on the membership but
in such case we use maddr and saddr which are not caching
keys as before. Additionally, we can not use same place to
cache routes that differ in RTCF_LOCAL flag value.

	Fix it by caching only RTCF_MULTICAST entries
without RTCF_LOCAL (send-only, no loopback). As a side effect,
we avoid unneeded lookup for fnhe when not caching because
multicasts are not redirected and they do not learn PMTU.

	Thanks to Maxime Bizon for showing the caching
problems in __mkroute_output for 3.6 kernels: different
RTCF_LOCAL flag in cache can lead to wrong ip_mc_output or
ip_output call and the visible problem is that traffic can
not reach local receivers via loopback.

Reported-by: Maxime Bizon <mbizon@freebox.fr>
Tested-by: Maxime Bizon <mbizon@freebox.fr>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>

Upstream-ID: 636174219b52b5a8bc51bc23bbcba97cd30a65e3
Stable-trees: 3.6.x
Signed-off-by: CAI Qian <caiqian@redhat.com>

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 200d287..df25142 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1785,6 +1785,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	if (dev_out->flags & IFF_LOOPBACK)
 		flags |= RTCF_LOCAL;
 
+	do_cache = true;
 	if (type == RTN_BROADCAST) {
 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
 		fi = NULL;
@@ -1793,6 +1794,8 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
 				     fl4->flowi4_proto))
 			flags &= ~RTCF_LOCAL;
+		else
+			do_cache = false;
 		/* If multicast route do not exist use
 		 * default one, but do not gateway in this case.
 		 * Yes, it is hack.
@@ -1802,8 +1805,8 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	}
 
 	fnhe = NULL;
-	do_cache = fi != NULL;
-	if (fi) {
+	do_cache &= fi != NULL;
+	if (do_cache) {
 		struct rtable __rcu **prth;
 		struct fib_nh *nh = &FIB_RES_NH(*res);

^ permalink raw reply related

* Re: [PATCH net-next] rps: overflow prevention for saturated cpus
From: Willem de Bruijn @ 2012-12-07 16:04 UTC (permalink / raw)
  To: Rick Jones; +Cc: netdev, David Miller, Eric Dumazet, Tom Herbert
In-Reply-To: <50C12E22.3030206@hp.com>

On Thu, Dec 6, 2012 at 6:45 PM, Rick Jones <rick.jones2@hp.com> wrote:
> On 12/06/2012 03:04 PM, Willem de Bruijn wrote:
>>
>> On Thu, Dec 6, 2012 at 5:25 PM, Rick Jones <rick.jones2@hp.com> wrote:
>>>
>>> I thought (one of) the ideas behind RFS at least was to give the CPU
>>> scheduler control over where network processing took place instead of it
>>> being dictated solely by the addressing. I would have expected the CPU
>>> scheduler to migrate some work off the saturated CPU.  Or will this only
>>> affect RPS and not RFS?
>>
>>
>> I wrote it with RPS in mind, indeed. With RFS, for sufficiently
>> multithreaded applications that are unpinned, the scheduler will
>> likely spread the threads across as many cpus as possible. In that
>> case, the mechanism will not kick in, or as quickly. Even with RFS,
>> pinned threads and single-threaded applications will likely also
>> benefit during high load from redirecting kernel receive
>> processing away from the cpu that runs the application thread. I
>> haven't tested that case independently.
>
>
> Unless that single-threaded application (or single receiving thread) is
> pinned to a CPU, isn't there a non-trivial chance that incoming traffic
> flowing up different CPUs will cause it to be bounced from one CPU to
> another, taking its cache lines with it and not just the "intra-stack" cache
> lines?

Yes. The patch restricts the offload cpus to rps_cpus, with the assumption
that this is a small subset of all cpus. In that case, other workloads will
eventually migrate to the remainder. I previously tested spreading across
all cpus, which indeed did interfere with the userspace threads.

> Long (?) ago and far away it was possible to say that a given IRQ should be
> potentially serviced by more than one CPU (if I recall though not phrase
> correctly).  Didn't that get taken away because it did such nasty things
> like reordering and such?  (Admittedly, I'm really stretching the limits of
> my dimm memory there)

Sounds familiar. Wasn't there a mechanism to periodically switch the
destination cpu? If at HZ granularity, that is very coarse grain compared to
Mpps, but out of order does seem likely. I assume that this patch will lead
to a steady state where userspace and kernel receive run on disjoint cpusets,
due to the rps_cpus set being hot with kernel receive processing. That said,
I can run a test with RFS enabled to see whether that actually holds.

>>> What kind of workload is this targeting that calls for
>>> such intra-flow parallelism?
>>
>>
>> Packet processing middleboxes that would rather operate in degraded mode
>> (reordering) than drop packets. Intrusion detection systems and proxies,
>> for instance. These boxes are actually likely to have RPS enabled and
>> RFS disabled.
>>
>>> With respect to the examples given, what happens when it is TCP traffic
>>> rather than UDP?
>>
>>
>> That should be identical. RFS is supported for both protocols. In the
>> test, it is turned off to demonstrate the effect solely with RPS.
>
>
> Will it be identical with TCP?  If anything, I would think causing
> reordering of the TCP segments within flows would only further increase the
> workload of the middlebox because it will increase the ACK rates. Perhaps
> quite significantly if GRO was effective at the receivers before the
> reordering started.
>
> At least unless/until the reordering is bad enough to cause the sending TCPs
> to fast retransmit and so throttle back.  And unless we are talking about
> being overloaded by massive herds of "mice" I'd think that the TCP flows
> would be throttling back to what the single CPU in the middlebox could
> handle.

Agreed, I will try to get some data on the interaction with TCP flows. My
hunch is that they throttle down due to the reordering, but data is more useful.
The initial increase in ACKs, if any, will likely not increase rate beyond a
small factor.

The situations that this patch is meant to address are more straightforward
DoS attacks, where a box can handle normal load with a big safety margin,
but falls over at a 10x or 100x flood of TCP SYN or similar packets.

> rick

^ permalink raw reply

* Re: [RFC PATCH] dynamic_queue_limit.h: Make the struct ___cacheline_aligned_on_smp
From: Joe Perches @ 2012-12-07 16:05 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Tom Herbert, David Miller, netdev
In-Reply-To: <CAL4Wiip12_Frb64WGk419_YApOuckTa3Jtd-nVROodLFTqvacg@mail.gmail.com>

On Fri, 2012-12-07 at 07:55 -0800, Eric Dumazet wrote:
> 2012/12/7 Joe Perches <joe@perches.com>:
> > Given that the struct will always have limit at the start of
> > a cacheline, why not make  struct ___cacheline_aligned_on_smp
> > and make limit the first member?
> >
> > It could make other structs that use struct dql a bit more
> > predictable or efficient to pack.
> >
> > (netdev_queue is size reduced from 256 to 192 on x86-32)
> >
> 
> No, please.
> 
> Have you tested this on a range of hardware and check how it can hurt
> performance ?

No.  Hence the RFC subject title and
unsigned patch.

I was wondering though about cacheline ping-pong 
effects.

I noted Tom's comment back in
http://patchwork.ozlabs.org/patch/108856/
"Also, the cache line containing the struct dql can ping-pong between
CPUs doing initiation and completion.  (I know we're aiming for these to
be the same, but we can't yet assume they will be.)"

So it seemed somewhat sensible to make the
entire struct in a single cacheline.

^ permalink raw reply

* Re: [PATCH V2 wireless-next] iwlwifi: iwlagn_request_scan: Fix check for priv->scan_request
From: Johannes Berg @ 2012-12-07 16:05 UTC (permalink / raw)
  To: Tim Gardner
  Cc: linux-kernel, Wey-Yi Guy, Intel Linux Wireless, John W. Linville,
	Emmanuel Grumbach, Don Fry, linux-wireless, netdev
In-Reply-To: <1354886914-7822-1-git-send-email-tim.gardner@canonical.com>

On Fri, 2012-12-07 at 06:28 -0700, Tim Gardner wrote:
> The WARN_ON_ONCE() check for scan_request will not correctly detect
> a NULL pointer for scan_type == IWL_SCAN_NORMAL. Make it explicit
> that the check only applies to normal scans.
> 
> Convert WARN_ON_ONCE to WARN_ON since priv->scan_request really _can't_
> be NULL for normal scans. If it is then we should emit frequent warnings.
> 
> This smatch warning led to scrutiny of iwlagn_request_scan():
> 
> drivers/net/wireless/iwlwifi/dvm/scan.c:894 iwlagn_request_scan() error: we previously assumed 'priv->scan_request' could be null (see line 792)
> 
> Cc: Johannes Berg <johannes.berg@intel.com>
> Cc: Wey-Yi Guy <wey-yi.w.guy@intel.com>
> Cc: Intel Linux Wireless <ilw@linux.intel.com>
> Cc: "John W. Linville" <linville@tuxdriver.com>
> Cc: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
> Cc: Don Fry <donald.h.fry@intel.com>
> Cc: linux-wireless@vger.kernel.org
> Cc: netdev@vger.kernel.org
> Signed-off-by: Tim Gardner <tim.gardner@canonical.com>
> ---
> 
> This patch does apply to 3.6.y, but it doesn't fix an existing
> bug so I don't think it qualifies. This patch simply makes
> the driver more robust for future development.
> 
> V2 - corrected indentation more like the rest of the source
> in this file.

Thanks, I've picked it up now, adding one space in the condition
still :)
It's in my internal tree for now, so it'll be a few days until it
trickles out to iwlwifi-next.

johannes

^ permalink raw reply

* Re: [patch v2] bridge: make buffer larger in br_setlink()
From: walter harms @ 2012-12-07 16:07 UTC (permalink / raw)
  To: Dan Carpenter
  Cc: Stephen Hemminger, David S. Miller, bridge, netdev,
	kernel-janitors, Thomas Graf
In-Reply-To: <20121207111045.GA9676@elgon.mountain>



Am 07.12.2012 12:10, schrieb Dan Carpenter:
> We pass IFLA_BRPORT_MAX to nla_parse_nested() so we need
> IFLA_BRPORT_MAX + 1 elements.  Also Smatch complains that we read past
> the end of the array in br_set_port_flag() when it's called with
> IFLA_BRPORT_FAST_LEAVE.
> 



I have no clue why nla_parse_nested() needs IFLA_BRPORT_MAX elements.
But the majority of loops look like
for(i=0;i<max;i++)
and most programmers will think this way.
So it seems the place to fix is nla_parse_nested().
Not doing so is asking for trouble (in the long run).
At least this function needs a big warning label that (max-1)
is actually needed.


just my two cents,
 wh


> Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
> ---
> v2: Style tweak.
> 
> Only needed in linux-next.
> 
> diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
> index 850b7d1..cfc5cfe 100644
> --- a/net/bridge/br_netlink.c
> +++ b/net/bridge/br_netlink.c
> @@ -239,7 +239,7 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh)
>  	struct ifinfomsg *ifm;
>  	struct nlattr *protinfo;
>  	struct net_bridge_port *p;
> -	struct nlattr *tb[IFLA_BRPORT_MAX];
> +	struct nlattr *tb[IFLA_BRPORT_MAX + 1];
>  	int err;
>  
>  	ifm = nlmsg_data(nlh);
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kernel-janitors" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 

^ permalink raw reply

* Re: [PATCH net-next 02/10] tipc: eliminate aggregate sk_receive_queue limit
From: Neil Horman @ 2012-12-07 16:07 UTC (permalink / raw)
  To: Paul Gortmaker; +Cc: David Miller, netdev, Jon Maloy, Ying Xue
In-Reply-To: <1354890498-6448-3-git-send-email-paul.gortmaker@windriver.com>

On Fri, Dec 07, 2012 at 09:28:10AM -0500, Paul Gortmaker wrote:
> From: Ying Xue <ying.xue@windriver.com>
> 
> As a complement to the per-socket sk_recv_queue limit, TIPC keeps a
> global atomic counter for the sum of sk_recv_queue sizes across all
> tipc sockets. When incremented, the counter is compared to an upper
> threshold value, and if this is reached, the message is rejected
> with error code TIPC_OVERLOAD.
> 
> This check was originally meant to protect the node against
> buffer exhaustion and general CPU overload. However, all experience
> indicates that the feature not only is redundant on Linux, but even
> harmful. Users run into the limit very often, causing disturbances
> for their applications, while removing it seems to have no negative
> effects at all. We have also seen that overall performance is
> boosted significantly when this bottleneck is removed.
> 
> Furthermore, we don't see any other network protocols maintaining
> such a mechanism, something strengthening our conviction that this
> control can be eliminated.
> 
> Signed-off-by: Ying Xue <ying.xue@windriver.com>
> Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
> Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
> ---
>  net/tipc/socket.c | 7 +------
>  1 file changed, 1 insertion(+), 6 deletions(-)
> 
> diff --git a/net/tipc/socket.c b/net/tipc/socket.c
> index 1a720c8..a059ed0 100644
> --- a/net/tipc/socket.c
> +++ b/net/tipc/socket.c
> @@ -2,7 +2,7 @@
>   * net/tipc/socket.c: TIPC socket API
>   *
>   * Copyright (c) 2001-2007, Ericsson AB
> - * Copyright (c) 2004-2008, 2010-2011, Wind River Systems
> + * Copyright (c) 2004-2008, 2010-2012, Wind River Systems
>   * All rights reserved.
>   *
>   * Redistribution and use in source and binary forms, with or without
> @@ -1241,11 +1241,6 @@ static u32 filter_rcv(struct sock *sk, struct sk_buff *buf)
>  	}
>  
>  	/* Reject message if there isn't room to queue it */
> -	recv_q_len = (u32)atomic_read(&tipc_queue_size);
> -	if (unlikely(recv_q_len >= OVERLOAD_LIMIT_BASE)) {
> -		if (rx_queue_full(msg, recv_q_len, OVERLOAD_LIMIT_BASE))
> -			return TIPC_ERR_OVERLOAD;
> -	}
If you're going to remove the one place that you read this variable, don't you
also want to remove the points where you increment/decrement the atomic as well,
and for that matter eliminate the definition itself?

Neil

^ permalink raw reply

* [PATCH net-next] bonding: Fix check for ethtool get_link operation support
From: Ben Hutchings @ 2012-12-07 16:15 UTC (permalink / raw)
  To: Jay Vosburgh, Andy Gospodarek; +Cc: netdev

Since commit 2c60db037034 ('net: provide a default dev->ethtool_ops')
all devices have a non-null ethtool_ops.  Test only
dev->ethtool_ops->get_link in both places where we care.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
Compile-tested only.

Ben.

 drivers/net/bonding/bond_main.c |   17 ++++++-----------
 1 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index c8bff3e..800a897 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -615,15 +615,9 @@ static int bond_check_dev_link(struct bonding *bond,
 		return netif_carrier_ok(slave_dev) ? BMSR_LSTATUS : 0;
 
 	/* Try to get link status using Ethtool first. */
-	if (slave_dev->ethtool_ops) {
-		if (slave_dev->ethtool_ops->get_link) {
-			u32 link;
-
-			link = slave_dev->ethtool_ops->get_link(slave_dev);
-
-			return link ? BMSR_LSTATUS : 0;
-		}
-	}
+	if (slave_dev->ethtool_ops->get_link)
+		return slave_dev->ethtool_ops->get_link(slave_dev) ?
+			BMSR_LSTATUS : 0;
 
 	/* Ethtool can't be used, fallback to MII ioctls. */
 	ioctl = slave_ops->ndo_do_ioctl;
@@ -1510,8 +1504,9 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 	int link_reporting;
 	int res = 0;
 
-	if (!bond->params.use_carrier && slave_dev->ethtool_ops == NULL &&
-		slave_ops->ndo_do_ioctl == NULL) {
+	if (!bond->params.use_carrier &&
+	    slave_dev->ethtool_ops->get_link == NULL &&
+	    slave_ops->ndo_do_ioctl == NULL) {
 		pr_warning("%s: Warning: no link monitoring support for %s\n",
 			   bond_dev->name, slave_dev->name);
 	}
-- 
1.7.7.6



-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply related

* [PATCH net-next 1/2] caif_usb: Check driver name before reading driver state in netdev notifier
From: Ben Hutchings @ 2012-12-07 16:17 UTC (permalink / raw)
  To: Sjur Braendeland; +Cc: netdev

In cfusbl_device_notify(), the usbnet and usbdev variables are
initialised before the driver name has been checked.  In case the
device's driver is not cdc_ncm, this may result in reading beyond the
end of the netdev private area.  Move the initialisation below the
driver name check.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
Compile-tested only.

Ben.

 net/caif/caif_usb.c |    7 +++++--
 1 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/caif/caif_usb.c b/net/caif/caif_usb.c
index fd7cbf5..582f80c 100644
--- a/net/caif/caif_usb.c
+++ b/net/caif/caif_usb.c
@@ -126,8 +126,8 @@ static int cfusbl_device_notify(struct notifier_block *me, unsigned long what,
 	struct net_device *dev = arg;
 	struct caif_dev_common common;
 	struct cflayer *layer, *link_support;
-	struct usbnet	*usbnet = netdev_priv(dev);
-	struct usb_device	*usbdev = usbnet->udev;
+	struct usbnet *usbnet;
+	struct usb_device *usbdev;
 	struct ethtool_drvinfo drvinfo;
 
 	/*
@@ -141,6 +141,9 @@ static int cfusbl_device_notify(struct notifier_block *me, unsigned long what,
 	if (strncmp(drvinfo.driver, "cdc_ncm", 7) != 0)
 		return 0;
 
+	usbnet = netdev_priv(dev);
+	usbdev = usbnet->udev;
+
 	pr_debug("USB CDC NCM device VID:0x%4x PID:0x%4x\n",
 		le16_to_cpu(usbdev->descriptor.idVendor),
 		le16_to_cpu(usbdev->descriptor.idProduct));
-- 
1.7.7.6



-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply related

* Re: [RFC PATCH] dynamic_queue_limit.h: Make the struct ___cacheline_aligned_on_smp
From: Eric Dumazet @ 2012-12-07 16:19 UTC (permalink / raw)
  To: Joe Perches; +Cc: Tom Herbert, David Miller, netdev
In-Reply-To: <1354896346.29937.43.camel@joe-AO722>

On Fri, 2012-12-07 at 08:05 -0800, Joe Perches wrote:

> So it seemed somewhat sensible to make the
> entire struct in a single cacheline.

Any layout change in an object used in the network fast path needs a complete
performance study.

Even if you provide such a study, we'll need to reproduce your numbers
here.

BQL/DQL is not on our radars, spending two cache lines on a critical
object is fine.

Thanks

^ permalink raw reply

* [PATCH net-next 2/2] caif_usb: Make the driver name check more efficient
From: Ben Hutchings @ 2012-12-07 16:20 UTC (permalink / raw)
  To: Sjur Braendeland; +Cc: netdev

Use the device model to get just the name, rather than using the
ethtool API to get all driver information.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
Compile-tested only.  I'm assuming that the strncmp() is not really
necessary, but perhaps there is some OOT variant of cdc_ncm that is also
supposed to be supported?

Ben.

 net/caif/caif_usb.c |   13 +++----------
 1 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/net/caif/caif_usb.c b/net/caif/caif_usb.c
index 582f80c..3ebc8cb 100644
--- a/net/caif/caif_usb.c
+++ b/net/caif/caif_usb.c
@@ -128,17 +128,10 @@ static int cfusbl_device_notify(struct notifier_block *me, unsigned long what,
 	struct cflayer *layer, *link_support;
 	struct usbnet *usbnet;
 	struct usb_device *usbdev;
-	struct ethtool_drvinfo drvinfo;
 
-	/*
-	 * Quirks: High-jack ethtool to find if we have a NCM device,
-	 * and find it's VID/PID.
-	 */
-	if (dev->ethtool_ops == NULL || dev->ethtool_ops->get_drvinfo == NULL)
-		return 0;
-
-	dev->ethtool_ops->get_drvinfo(dev, &drvinfo);
-	if (strncmp(drvinfo.driver, "cdc_ncm", 7) != 0)
+	/* Check whether we have a NCM device, and find its VID/PID. */
+	if (!(dev->dev.parent && dev->dev.parent->driver &&
+	      strcmp(dev->dev.parent->driver->name, "cdc_ncm") == 0))
 		return 0;
 
 	usbnet = netdev_priv(dev);
-- 
1.7.7.6


-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply related

* Re: [RFC] wireless: check against default_ethtool_ops
From: Ben Hutchings @ 2012-12-07 16:27 UTC (permalink / raw)
  To: Stanislaw Gruszka
  Cc: Eric Dumazet, netdev, Ben Greear, linux-wireless, Bjørn Mork
In-Reply-To: <20121207121617.GA4356@redhat.com>

On Fri, 2012-12-07 at 13:16 +0100, Stanislaw Gruszka wrote:
> Since:
> 
> commit 2c60db037034d27f8c636403355d52872da92f81
> Author: Eric Dumazet <edumazet@google.com>
> Date:   Sun Sep 16 09:17:26 2012 +0000
> 
>     net: provide a default dev->ethtool_ops
> 
> wireless core does not correctly assign ethtool_ops. In order to fix
> the problem, and avoid assigning ethtool_ops on each individual cfg80211
> driver, we check against default_ethtool_ops pointer instead of NULL in
> wireless core.
> 
> Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
[...]
Acked-by: Ben Hutchings <bhutchings@solarflare.com>

Ideally you could do this assignment unconditionally in the setup
function for the device, but it doesn't seem like there's a common
allocation path that you could do that in.

Ben.

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply

* Re: [RFC PATCH] dynamic_queue_limit.h: Make the struct ___cacheline_aligned_on_smp
From: Joe Perches @ 2012-12-07 16:29 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Tom Herbert, David Miller, netdev
In-Reply-To: <1354897144.26405.4.camel@edumazet-glaptop>

On Fri, 2012-12-07 at 08:19 -0800, Eric Dumazet wrote:
> On Fri, 2012-12-07 at 08:05 -0800, Joe Perches wrote:
> 
> > So it seemed somewhat sensible to make the
> > entire struct in a single cacheline.
> 
> Any layout change in an object used in network fast path need a complete
> performance study.
> 
> Even if you provide such a study, we'll need to reproduce your numbers
> here.
> 
> BQL/DQL is not on our radars, spending two cache lines on a critical
> object is fine.

Well Maybe Tom can provide some information as to why
the limit variable was cacheline_aligned_in_smp and not
the struct.

I didn't find any discussion about it.

^ permalink raw reply

* Re: [PATCH net-next] rps: overflow prevention for saturated cpus
From: Willem de Bruijn @ 2012-12-07 16:41 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: netdev, David Miller, Eric Dumazet, Tom Herbert
In-Reply-To: <1354891889.2707.2.camel@bwh-desktop.uk.solarflarecom.com>

On Fri, Dec 7, 2012 at 9:51 AM, Ben Hutchings <bhutchings@solarflare.com> wrote:
> On Thu, 2012-12-06 at 15:36 -0500, Willem de Bruijn wrote:
>> RPS and RFS balance load across cpus with flow affinity. This can
>> cause local bottlenecks, where a small number or single large flow
>> (DoS) can saturate one CPU while others are idle.
>>
>> This patch maintains flow affinity in normal conditions, but
>> trades it for throughput when a cpu becomes saturated. Then, packets
>> destined to that cpu (only) are redirected to the lightest loaded cpu
>> in the rxqueue's rps_map. This breaks flow affinity under high load
>> for some flows, in favor of processing packets up to the capacity
>> of the complete rps_map cpuset in all circumstances.
> [...]
>> --- a/Documentation/networking/scaling.txt
>> +++ b/Documentation/networking/scaling.txt
>> @@ -135,6 +135,18 @@ packets have been queued to their backlog queue. The IPI wakes backlog
>>  processing on the remote CPU, and any queued packets are then processed
>>  up the networking stack.
>>
>> +==== RPS Overflow Protection
>> +
>> +By selecting the same cpu from the cpuset for each packet in the same
>> +flow, RPS will cause load imbalance when input flows are not uniformly
>> +random. In the extreme case, a single flow, all packets are handled on a
>> +single CPU, which limits the throughput of the machine to the throughput
>> +of that CPU. RPS has optional overflow protection, which disables flow
>> +affinity when an RPS CPU becomes saturated: during overload, its packets
>> +will be sent to the least loaded other CPU in the RPS cpuset. To enable
>> +this option, set sysctl net.core.netdev_max_rps_backlog to be smaller than
>> +net.core.netdev_max_backlog. Setting it to half is a reasonable heuristic.
> [...]
>
> This only seems to be suitable for specialised applications where a high
> degree of reordering is tolerable.  This documentation should make that
> very clear.

Good point. I'll revise that when I respin the patch.

I wasn't too concerned with this earlier, but there may be a way
to reduce the amount of reordering imposed, in
particular in the case where normal load has many small flows and the
exception is the normal case plus a small number of very high rate flows
(think synflood).

It is possible for a single high rate flow to exceed a single cpu
capacity, so those flows will always either drop packets or span
cpus and thus witness reordering (they are unlikely to be tcp
connections). It would be an improvement if the smaller flows
would at least not see reordering.

If the algorithm only redistributes packets from high rate flows, or an
approximation thereof, this will be the case. Keeping a hashtable,
counting arrivals per bucket and redirecting the highest fraction of buckets,
will do this (not my idea: a variation on a drop strategy that Eric mentioned
to me earlier). I can implement this, instead, if that sounds like a better
idea.

Because of the constraint that a single flow may exceed a single cpu
capacity, redistributed packets will always have to be redistributed
without flow affinity, I think.

> Ben.
>
> --
> Ben Hutchings, Staff Engineer, Solarflare
> Not speaking for my employer; that's the marketing department's job.
> They asked us to note that Solarflare product names are trademarked.
>

^ permalink raw reply

* Re: [RFC PATCH] dynamic_queue_limit.h: Make the struct ___cacheline_aligned_on_smp
From: Ben Hutchings @ 2012-12-07 16:42 UTC (permalink / raw)
  To: Joe Perches; +Cc: Eric Dumazet, Tom Herbert, David Miller, netdev
In-Reply-To: <1354897761.29937.45.camel@joe-AO722>

On Fri, 2012-12-07 at 08:29 -0800, Joe Perches wrote:
> On Fri, 2012-12-07 at 08:19 -0800, Eric Dumazet wrote:
> > On Fri, 2012-12-07 at 08:05 -0800, Joe Perches wrote:
> > 
> > > So it seemed somewhat sensible to make the
> > > entire struct in a single cacheline.
> > 
> > Any layout change in an object used in network fast path need a complete
> > performance study.
> > 
> > Even if you provide such a study, we'll need to reproduce your numbers
> > here.
> > 
> > BQL/DQL is not on our radars, spending two cache lines on a critical
> > object is fine.
> 
> Well Maybe Tom can provide some information as to why
> the limit variable was cacheline_aligned_in_smp and not
> the struct.
> 
> I didn't find any discussion about it.

Structure alignment has to be at least the maximum of each member's
alignment, so the struct *is* effectively cacheline_aligned_in_smp.

Ben.

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply

* Re: [PATCH v3 1/4] net: Add support for hardware-offloaded encapsulation
From: Alexander Duyck @ 2012-12-07 16:45 UTC (permalink / raw)
  To: Ben Hutchings
  Cc: Joseph Gasparakis, davem, shemminger, chrisw, gospo, netdev,
	linux-kernel, dmitry, saeed.bishara, Peter P Waskiewicz Jr
In-Reply-To: <1354874847.20838.49.camel@deadeye.wl.decadent.org.uk>

On 12/07/2012 02:07 AM, Ben Hutchings wrote:
> On Thu, 2012-12-06 at 17:56 -0800, Joseph Gasparakis wrote:
>> This patch adds support in the kernel for offloading in the NIC Tx and Rx
>> checksumming for encapsulated packets (such as VXLAN and IP GRE).
> [...]
>> --- a/include/linux/netdevice.h
>> +++ b/include/linux/netdevice.h
>> @@ -1063,6 +1063,8 @@ struct net_device {
>>  	netdev_features_t	wanted_features;
>>  	/* mask of features inheritable by VLAN devices */
>>  	netdev_features_t	vlan_features;
>> +	/* mask of features inherited by encapsulating devices */
>> +	netdev_features_t	hw_enc_features;
> [...]
> 
> How will the networking core know *which* encapsulations this applies
> to?  I notice that your implementation in ixgbe does not set
> NETIF_F_HW_CSUM here, so presumably the hardware will parse headers to
> find which ranges should be checksummed and it won't cover the next
> encapsulation protocol that comes along.
> 
> Ben.
> 

Actually the offload is generic to any encapsulation that does not
compute a checksum on the inner headers.  So as long as you can treat
the outer headers as one giant L2 header you can pretty much ignore what
is in there as long as the inner network and transport header values are
set.  There are a number of tunnels that fall into that category since
most just use IP as the L2 and the L3 usually doesn't contain any checksum.

Thanks,

Alex

^ permalink raw reply

* Re: [RFC PATCH] dynamic_queue_limit.h: Make the struct ___cacheline_aligned_on_smp
From: Eric Dumazet @ 2012-12-07 16:54 UTC (permalink / raw)
  To: Joe Perches; +Cc: Tom Herbert, David Miller, netdev
In-Reply-To: <1354897761.29937.45.camel@joe-AO722>

On Fri, 2012-12-07 at 08:29 -0800, Joe Perches wrote:

> Well Maybe Tom can provide some information as to why
> the limit variable was cacheline_aligned_in_smp and not
> the struct.
> 
> I didn't find any discussion about it.
> 

The struct _is_ cache line aligned, since it contains one field needing
cache line alignment. It's pretty clear to us.

There are two cache lines in it.

We don't want a single cache line, but two, for performance reasons.

If you believe it's wrong, you have to provide the performance study, not
me, as I don't have time to spend cycles on this. We did this a long
time ago.

If you want to save few bytes on your kernel, redefine
__cacheline_aligned_on_smp to empty, and you'll be ok.

^ permalink raw reply

* Re: [PATCH 2/2] netfilter: add xt_bpf xtables match
From: Willem de Bruijn @ 2012-12-07 16:56 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: netfilter-devel, netdev, Eric Dumazet, David Miller, kaber
In-Reply-To: <20121207131638.GA3019@1984>

On Fri, Dec 7, 2012 at 8:16 AM, Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> On Wed, Dec 05, 2012 at 03:10:13PM -0500, Willem de Bruijn wrote:
>> On Wed, Dec 5, 2012 at 2:48 PM, Pablo Neira Ayuso <pablo@netfilter.org> wrote:
>> > Hi Willem,
>> >
>> > On Wed, Dec 05, 2012 at 02:22:19PM -0500, Willem de Bruijn wrote:
>> >> A new match that executes sk_run_filter on every packet. BPF filters
>> >> can access skbuff fields that are out of scope for existing iptables
>> >> rules, allow more expressive logic, and on platforms with JIT support
>> >> can even be faster.
>> >>
>> >> I have a corresponding iptables patch that takes `tcpdump -ddd`
>> >> output, as used in the examples below. The two parts communicate
>> >> using a variable length structure. This is similar to ebt_among,
>> >> but new for iptables.
>> >>
>> >> Verified functionality by inserting an ip source filter on chain
>> >> INPUT and an ip dest filter on chain OUTPUT and noting that ping
>> >> failed while a rule was active:
>> >>
>> >> iptables -v -A INPUT -m bpf --bytecode '4,32 0 0 12,21 0 1 $SADDR,6 0 0 96,6 0 0 0,' -j DROP
>> >> iptables -v -A OUTPUT -m bpf --bytecode '4,32 0 0 16,21 0 1 $DADDR,6 0 0 96,6 0 0 0,' -j DROP
>> >
>> > I like this BPF idea for iptables.
>> >
>> > I made a similar extension time ago, but it was taking a file as
>> > parameter. That file contained in BPF code. I made a simple bison
>> > parser that takes BPF code and put it into the bpf array of
>> > instructions. It would be a bit more intuitive to define a filter and
>> > we can distribute it with iptables.
>>
>> That's cleaner, indeed. I actually like how tcpdump operates as a
>> code generator if you pass -ddd. Unfortunately, it generates code only
>> for link layer types of its supported devices, such as DLT_EN10MB and
>> DLT_LINUX_SLL. The network layer interface of basic iptables
>> (forgetting device dependent mechanisms as used in xt_mac) is DLT_RAW,
>> but that is rarely supported.
>
> Indeed, you'll have to hack on tcpdump to select the offset. In
> iptables the base is the layer 3 header. With that change you could
> use tcpdump for generate code automagically from their syntax.
>
>> > Let me check on my internal trees, I can put that user-space code
>> > somewhere in case you're interested.
>>
>> Absolutely. I'll be happy to revise to get it in. I'm also considering
>> sending a patch to tcpdump to make it generate code independent of the
>> installed hardware when specifying -y.
>
> I found a version of the old parser code I made:
>
> http://1984.lsi.us.es/git/nfbpf/
>
> It interprets a filter expressed in a similar way to tcpdump -dd but
> it's using the BPF constants. It's quite preliminary and simple if you
> look at the code.
>
> Extending it to interpret some syntax similar to tcpdump -d would even
> make more readable the BPF filter.
>
> Time ago I also thought about taking the kernel code that checks that
> the filter is correct. Currently you get -EINVAL if you pass a
> handcrafted filter which is incorrect, so it's hard task to debug what
> you made wrong.
>
> It could be added to the iptables tree. Or if generic enough for BPF
> and the effort is worth, just provide some small library that iptables
> can link with and a small compiler/checker to help people develop BPF
> filters.

Or use pcap_compile? I went with the tcpdump output to avoid
introducing a direct dependency on pcap to iptables. One possible
downside I see to pcap_compile vs. developing from scratch is that it
might lag in supporting the LSF ancillary data fields.

> Back to your xt_bpf thing, we can use the file containing the code
> instead:
>
> iptables -v -A INPUT -m bpf --bytecode-file filter1.bpf -j DROP
> iptables -v -A OUTPUT -m bpf --bytecode-file filter2.bpf -j DROP
>
> We can still allow the inlined filter via --bytecode if you want.

I'll add that. I'd like to keep --bytecode to be able to generate the
code inline using backticks.

^ permalink raw reply

* [PATCH net-next v3 0/3] Multiqueue support in virtio-net
From: Jason Wang @ 2012-12-07 17:04 UTC (permalink / raw)
  To: mst, rusty, davem, virtualization, netdev, linux-kernel
  Cc: krkumar2, kvm, bhutchings, jwhan, shiyer

Hi all:

This series is an updated version (hopefully the final one) of multiqueue
(VIRTIO_NET_F_MQ) support in the virtio-net driver. All previous comments were
addressed. The work is based on Krishna Kumar's work to let virtio-net use
multiple rx/tx queues for packet reception and transmission. Performance
tests show that aggregate latency is greatly improved, but there may be some
regression in small packet transmission. Because of this, multiqueue is disabled
by default. Users who want to benefit from multiqueue can enable the feature
with ethtool -L.

Please review and comment.

A prototype implementation of qemu-kvm support can be found in
git://github.com/jasowang/qemu-kvm-mq.git. To start a guest with two queues, you
could specify the queues parameter to both tap and virtio-net like:

./qemu-kvm -netdev tap,queues=2,... -device virtio-net-pci,queues=2,...

then enable the multiqueue through ethtool by:

ethtool -L eth0 combined 2

Changes from V2:
Align the implementation to V6 virtio-spec
- Change the name of feature and name from _{RFS|rfs} to _{MQ|mq}

Changes from V1:
Addressing Michael's comments:
- fix typos in commit log
- don't move virtnet_open()
- don't set to NULL in virtnet_free_queues()
- style & comment fixes
- conditionally set the irq affinity hint based on online cpus and queue pairs
- move the virnet_del_vqs to patch 1
- change the meaningless kzalloc() to kmalloc()
- open code the err handling
- store the name of virtqueue in send/receive queue
- avoid type cast in virtnet_find_vqs()
- fix the mem leak and freeing issue of names in virtnet_find_vqs()
- check cvq before setting the max_queue_pairs in virtnet_probe()
- check the cvq and VIRTIO_NET_F_RFS in virtnet_set_queues()
- set the curr_queue_pairs in virtnet_set_queue()
- use the err report by virtnet_set_queue() as the return value of
  ethtool_set_channels()

Changes from RFC v7:
Addressing Rusty's comments:
- align the implementation (location of cvq) to v5.
- fix the style issue.
- use a global refill instead of per-vq one.
- check the VIRTIO_NET_F_RFS before calling virtnet_set_queues()

Addressing Michael's comments:
- rename the curr_queue_pairs in virtnet_probe() to max_queue_pairs
- validate the number of queue pairs supported by the device against
  VIRTIO_NET_CTRL_RFS_VQ_PAIRS_MIN and VIRTIO_NET_CTRL_RFS_VQ_PAIRS_MAX.
- don't crash when failing to change the number of virtqueues
- don't set the affinity hint when only a single queue is used or there are too
  many virtqueues
- add a TODO of handling cpu hotplug
- allow user to set the number of queue pairs between 1 and max_queue_pairs

Changes from RFC v6:
- Align the implementation with the RFC spec update v5
- Addressing Rusty's comments:
  * split the patches
  * rename to max_queue_pairs and curr_queue_pairs
  * remove the useless status
  * fix the hibernation bug
- Addressing Ben's comments:
  * check other parameters in ethtool_set_queues

Changes from RFC v5:
- Align the implementation with the RFC spec update v4
- Switch the mode between single mode and multiqueue mode without reset
- Remove the 256 limitation of queues
- Use helpers to do the mapping between virtqueues and tx/rx queues
- Use combined channels instead of separate rx/tx queues when doing the queue
  number configuration
- Other coding style comments from Michael

Changes from RFC v4:
- Add ability to negotiate the number of queues through control virtqueue
- Ethtool -{L|l} support and default the tx/rx queue number to 1
- Expose the API to set irq affinity instead of irq itself

Changes from RFC v3:
- Rebase to the net-next
- Let queue 2 to be the control virtqueue to obey the spec
- Provides irq affinity
- Choose txq based on processor id

Reference:
- V6 virtio-spec: http://marc.info/?l=linux-netdev&m=135488976031512&w=2
- V2: https://lkml.org/lkml/2012/12/5/90
- V1: https://lkml.org/lkml/2012/11/27/177
- RFC V7: https://lkml.org/lkml/2012/11/27/177a
- RFC V6: https://lkml.org/lkml/2012/10/30/127
- RFC V5: http://lwn.net/Articles/505388/
- RFC V4: https://lkml.org/lkml/2012/6/25/120
- RFC V2: http://lwn.net/Articles/467283/

Perf Numbers:
- pktgen shows multiqueue is able to send/receive many more packets
  compared to single queue.
- netperf request-response test shows multiqueue improves aggregate latency
  a lot.
- netperf stream test shows some regression, especially for small packets, since
  TCP batches less when latency is improved.

1 Pktgen test:

1.0 Test Environment:

One 2.0G AMD Opteron(tm) Processor 6168. Pktgen to stress the virtio-net in
guest to test Guest TX. Pktgen to stress tap in host to test Guest RX.

1.1 Guest TX: Unfortunately current pktgen does not support virtio-net well
since virtio-net may not free the skb during tx completion. So I tested with a
patch (https://lkml.org/lkml/2012/11/26/31) that doesn't wait for this freeing,
with a guest of 4 vcpus:

#q | kpps  | +improvement%
 1 | 589K  | 0%
 2 | 952K  | 62%
 3 | 1290K | 120%
 4 | 1578K | 168%

1.2 Guest RX: After commit 5d097109257c03a71845729f8db6b5770c4bbedc (tun: only
queue packets on device), pktgen started to report an unbelievably huge
kpps (>2099kpps even for one queue). The problem is that tun reports NETDEV_TX_OK
even when it drops packets, which confuses pktgen. After changing it to
NET_XMIT_DROP, the value makes more sense but is not very stable, even when doing
some pinning manually. Even so, multiqueue gets a good speedup in the test. Will
continue to investigate.

2 Netperf test:

2.0 Test Environment:

Two Intel(R) Xeon(R) CPU E5620  @ 2.40GHz with two directly connected Intel
82599EB 10 Gigabit Ethernet controllers. A script to launch multiple parallelized
netperf sessions in demo mode, and a post-process script to compare the
timestamps and calculate the aggregate performance.

available: 2 nodes (0-1)
node 0 cpus: 0 1 2 3
node 0 size: 8175 MB
node 0 free: 7359 MB
node 1 cpus: 4 5 6 7
node 1 size: 8192 MB
node 1 free: 7731 MB
node distances
node   0   1
  0:  10  20
  1:  20  10

Host/Guest kernel: net-next with mq patches

2.1 2vcpu 2q vs 1q: pin guest vcpu and vhost thread in the same numa node

TCP_RR test:
size|session|+thu%|+normalize%
    1|     1|    0%|   -2%
    1|    20|  +23%|   +2%
    1|    50|   +9%|   -1%
    1|   100|   +2%|   -7%
   64|     1|    0%|   +1%
   64|    20|  +17%|   -1%
   64|    50|   +6%|   -4%
   64|   100|   +5%|   -5%
  256|     1|    0%|  +24%
  256|    20|  +52%|  +19%
  256|    50|  +46%|  +32%
  256|   100|  +44%|  +31%

- TCP_RR shows improvement of transaction rate. The reason why 1/64 byte does not
  show much gain is that the test could not fully utilize the two vhost
  threads: each vhost thread consumes only about 50% of cpu.

TCP_CRR test:
size|session|+thu%|+normalize%
    1|     1|   -8%|  -13%
    1|    20|  +34%|   +1%
    1|    50|  +27%|    0%
    1|   100|  +29%|   +1%
   64|     1|   -9%|  -13%
   64|    20|  +31%|    0%
   64|    50|  +26%|   -1%
   64|   100|  +30%|   +1%
  256|     1|   -8%|  -11%
  256|    20|  +33%|   +1%
  256|    50|  +23%|   -3%
  256|   100|  +29%|   +1%

- TCP_CRR shows improvement with multiple sessions. There is a regression in the
  single-session TCP_CRR test; it looks like TCP_CRR misses the flow
  director of both ixgbe and tun, which causes almost all physical queues to be
  used in the host.

Guest TX:
size|session|+thu%|+normalize%
    1|     1|   -6%|    0%
    1|     2|   +3%|    0%
    1|     4|    0%|    0%
   64|     1|    0%|    0%
   64|     2|   -5%|   -8%
   64|     4|   -5%|   -7%
  256|     1|  +25%|   +7%
  256|     2|  -10%|  -34%
  256|     4|  -29%|  -31%
  512|     1|   -1%|  -63%
  512|     2|  -42%|  -43%
  512|     4|  -51%|  -60%
 1024|     1|   -5%|  -13%
 1024|     2|   +2%|  -39%
 1024|     4|    0%|  -27%
 4096|     1|  +73%|  +51%
 4096|     2|   +5%|   -9%
 4096|     4|   +3%|  -18%
16384|     1|  +48%|  +29%
16384|     2|  +73%|  +16%
16384|     4|  +21%|  -22%

- Parallel sending of small packets shows regression. Statistics show that when
  multiqueue is enabled, TCP tends to send many more but smaller packets because
  the latency is improved, so TCP tends to batch less. More packets also means
  more exits/irqs, which is bad for both throughput and cpu utilization.

Guest RX:
size|session|+thu%|+normalize%
    1|     1|    0%|  +26%
    1|     2|   -3%|  -51%
    1|     4|   -2%|  -44%
   64|     1|    0%|   -2%
   64|     2|    0%|  -29%
   64|     4|    0%|  -21%
  256|     1|    0%|   -2%
  256|     2|    0%|  -18%
  256|     4|  +11%|  -13%
  512|     1|   -1%|   -2%
  512|     2|   -9%|  -21%
  512|     4|   +7%|  -15%
 1024|     1|    0%|   -2%
 1024|     2|   +1%|  -11%
 1024|     4|   +5%|  -16%
 4096|     1|    0%|    0%
 4096|     2|    0%|  -10%
 4096|     4|  +10%|  -11%
16384|     1|    0%|   +1%
16384|     2|   +1%|  -15%
16384|     4|  +18%|   -7%

- RX performance is equal to or better than single queue, but with a drop in per
  cpu throughput. Statistics show more packets were sent and received by the guest,
  which results in more exits/irqs.

2.2 4vcpu 4q vs 1q, pin vcpu in node 0, vhost thread in node 1

TCP_RR:
size|session|+thu%|+normalize%
    1|     1|   -1%|   +2%
    1|    20| +160%|   +5%
    1|    50| +169%|  +30%
    1|   100| +161%|  +30%
   64|     1|    0%|   +4%
   64|    20| +157%|  +11%
   64|    50| +112%|  +47%
   64|   100| +110%|  +48%
  256|     1|    0%|   +6%
  256|    20| +104%|   -3%
  256|    50| +131%|  +69%
  256|   100| +174%|  +96%

- Multiqueue shows much improvement in both transaction rate and cpu
  utilization.

TCP_CRR:
size|session|+thu%|+normalize%
    1|     1|  -30%|  -36%
    1|    20| +108%|   -4%
    1|    50| +132%|   +3%
    1|   100| +130%|   +9%
   64|     1|  -31%|  -36%
   64|    20| +111%|   -2%
   64|    50| +128%|   +2%
   64|   100| +136%|  +10%
  256|     1|  -30%|  -37%
  256|    20| +112%|   -1%
  256|    50| +136%|   +7%
  256|   100| +138%|  +11%

- Multiqueue shows much more improvement in aggregate transaction rate with
  equal or better cpu utilization.
- Like what we saw in the 2q test, a single TCP_CRR process shows regression.

Guest TX:
size|session|+thu%|+normalize%
    1|     1|   -4%|    0%
    1|     2|  -15%|    0%
    1|     4|  -14%|    0%
   64|     1|   +1%|   -1%
   64|     2|  -10%|  -16%
   64|     4|  -19%|  -26%
  256|     1|   -3%|   -1%
  256|     2|  -34%|  -38%
  256|     4|  -27%|  -45%
  512|     1|   -7%|   -6%
  512|     2|  -42%|  -55%
  512|     4|   +1%|  -15%
 1024|     1|  +12%|  -25%
 1024|     2|    0%|  -23%
 1024|     4|   +2%|  -21%
 4096|     1|    0%|   -5%
 4096|     2|    0%|  -16%
 4096|     4|   -1%|  -31%
16384|     1|   -4%|   -3%
16384|     2|   +4%|  -17%
16384|     4|   +7%|  -28%

- Here we met the same issue as 2q: statistics show the guest tends to send many
  more but smaller packets in 4q since the latency is improved.

Guest RX:
size|session|+thu%|+normalize%
    1|     1|   +1%|    0%
    1|     2|   -2%|  -30%
    1|     4|   -2%|  -58%
   64|     1|    0%|   -1%
   64|     2|    0%|  -25%
   64|     4|   -1%|  -45%
  256|     1|    0%|    0%
  256|     2|   -2%|  -25%
  256|     4|  +61%|  -19%
  512|     1|   -1%|    0%
  512|     2|  +22%|  -11%
  512|     4|  +58%|  -22%
 1024|     1|   -3%|   -2%
 1024|     2|  +35%|   -6%
 1024|     4|  +53%|  -26%
 4096|     1|   -1%|    0%
 4096|     2|  +43%|   -3%
 4096|     4|  +66%|  -19%
16384|     1|    0%|    0%
16384|     2|  +45%|   -2%
16384|     4|  +79%|  -12%

- We get some performance improvement. The reason is that there is not much
  cpu in host node 0, so we must pin all vhost threads in node 1 to get stable
  results.
- Statistics show many more packets were sent/received by the guest, which leads
  to higher cpu utilization.

Jason Wang (3):
  virtio-net: separate fields of sending/receiving queue from
    virtnet_info
  virtio_net: multiqueue support
  virtio-net: support changing the number of queue pairs through
    ethtool

 drivers/net/virtio_net.c        |  726 +++++++++++++++++++++++++++++----------
 include/uapi/linux/virtio_net.h |   27 ++
 2 files changed, 567 insertions(+), 186 deletions(-)

^ permalink raw reply

* [PATCH net-next v3 1/3] virtio-net: separate fields of sending/receiving queue from virtnet_info
From: Jason Wang @ 2012-12-07 17:04 UTC (permalink / raw)
  To: mst, rusty, davem, virtualization, netdev, linux-kernel
  Cc: krkumar2, kvm, bhutchings, jwhan, shiyer
In-Reply-To: <1354899897-10423-1-git-send-email-jasowang@redhat.com>

To support multiqueue transmitq/receiveq, the first step is to separate queue
related structures from virtnet_info. This patch introduces send_queue and
receive_queue structures and uses pointers to them as the parameters in
functions handling sending/receiving.

Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/virtio_net.c |  282 ++++++++++++++++++++++++++--------------------
 1 files changed, 158 insertions(+), 124 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 90ac97d..02a7102 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -51,16 +51,40 @@ struct virtnet_stats {
 	u64 rx_packets;
 };
 
-struct virtnet_info {
-	struct virtio_device *vdev;
-	struct virtqueue *rvq, *svq, *cvq;
-	struct net_device *dev;
+/* Internal representation of a send virtqueue */
+struct send_queue {
+	/* Virtqueue associated with this send _queue */
+	struct virtqueue *vq;
+
+	/* TX: fragments + linear part + virtio header */
+	struct scatterlist sg[MAX_SKB_FRAGS + 2];
+};
+
+/* Internal representation of a receive virtqueue */
+struct receive_queue {
+	/* Virtqueue associated with this receive_queue */
+	struct virtqueue *vq;
+
 	struct napi_struct napi;
-	unsigned int status;
 
 	/* Number of input buffers, and max we've ever had. */
 	unsigned int num, max;
 
+	/* Chain pages by the private ptr. */
+	struct page *pages;
+
+	/* RX: fragments + linear part + virtio header */
+	struct scatterlist sg[MAX_SKB_FRAGS + 2];
+};
+
+struct virtnet_info {
+	struct virtio_device *vdev;
+	struct virtqueue *cvq;
+	struct net_device *dev;
+	struct send_queue sq;
+	struct receive_queue rq;
+	unsigned int status;
+
 	/* I like... big packets and I cannot lie! */
 	bool big_packets;
 
@@ -81,13 +105,6 @@ struct virtnet_info {
 
 	/* Lock for config space updates */
 	struct mutex config_lock;
-
-	/* Chain pages by the private ptr. */
-	struct page *pages;
-
-	/* fragments + linear part + virtio header */
-	struct scatterlist rx_sg[MAX_SKB_FRAGS + 2];
-	struct scatterlist tx_sg[MAX_SKB_FRAGS + 2];
 };
 
 struct skb_vnet_hdr {
@@ -117,22 +134,22 @@ static inline struct skb_vnet_hdr *skb_vnet_hdr(struct sk_buff *skb)
  * private is used to chain pages for big packets, put the whole
  * most recent used list in the beginning for reuse
  */
-static void give_pages(struct virtnet_info *vi, struct page *page)
+static void give_pages(struct receive_queue *rq, struct page *page)
 {
 	struct page *end;
 
-	/* Find end of list, sew whole thing into vi->pages. */
+	/* Find end of list, sew whole thing into vi->rq.pages. */
 	for (end = page; end->private; end = (struct page *)end->private);
-	end->private = (unsigned long)vi->pages;
-	vi->pages = page;
+	end->private = (unsigned long)rq->pages;
+	rq->pages = page;
 }
 
-static struct page *get_a_page(struct virtnet_info *vi, gfp_t gfp_mask)
+static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
 {
-	struct page *p = vi->pages;
+	struct page *p = rq->pages;
 
 	if (p) {
-		vi->pages = (struct page *)p->private;
+		rq->pages = (struct page *)p->private;
 		/* clear private here, it is used to chain pages */
 		p->private = 0;
 	} else
@@ -140,12 +157,12 @@ static struct page *get_a_page(struct virtnet_info *vi, gfp_t gfp_mask)
 	return p;
 }
 
-static void skb_xmit_done(struct virtqueue *svq)
+static void skb_xmit_done(struct virtqueue *vq)
 {
-	struct virtnet_info *vi = svq->vdev->priv;
+	struct virtnet_info *vi = vq->vdev->priv;
 
 	/* Suppress further interrupts. */
-	virtqueue_disable_cb(svq);
+	virtqueue_disable_cb(vq);
 
 	/* We were probably waiting for more output buffers. */
 	netif_wake_queue(vi->dev);
@@ -167,9 +184,10 @@ static void set_skb_frag(struct sk_buff *skb, struct page *page,
 }
 
 /* Called from bottom half context */
-static struct sk_buff *page_to_skb(struct virtnet_info *vi,
+static struct sk_buff *page_to_skb(struct receive_queue *rq,
 				   struct page *page, unsigned int len)
 {
+	struct virtnet_info *vi = rq->vq->vdev->priv;
 	struct sk_buff *skb;
 	struct skb_vnet_hdr *hdr;
 	unsigned int copy, hdr_len, offset;
@@ -224,12 +242,12 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 	}
 
 	if (page)
-		give_pages(vi, page);
+		give_pages(rq, page);
 
 	return skb;
 }
 
-static int receive_mergeable(struct virtnet_info *vi, struct sk_buff *skb)
+static int receive_mergeable(struct receive_queue *rq, struct sk_buff *skb)
 {
 	struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb);
 	struct page *page;
@@ -243,7 +261,7 @@ static int receive_mergeable(struct virtnet_info *vi, struct sk_buff *skb)
 			skb->dev->stats.rx_length_errors++;
 			return -EINVAL;
 		}
-		page = virtqueue_get_buf(vi->rvq, &len);
+		page = virtqueue_get_buf(rq->vq, &len);
 		if (!page) {
 			pr_debug("%s: rx error: %d buffers missing\n",
 				 skb->dev->name, hdr->mhdr.num_buffers);
@@ -256,14 +274,15 @@ static int receive_mergeable(struct virtnet_info *vi, struct sk_buff *skb)
 
 		set_skb_frag(skb, page, 0, &len);
 
-		--vi->num;
+		--rq->num;
 	}
 	return 0;
 }
 
-static void receive_buf(struct net_device *dev, void *buf, unsigned int len)
+static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
 {
-	struct virtnet_info *vi = netdev_priv(dev);
+	struct virtnet_info *vi = rq->vq->vdev->priv;
+	struct net_device *dev = vi->dev;
 	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
 	struct sk_buff *skb;
 	struct page *page;
@@ -273,7 +292,7 @@ static void receive_buf(struct net_device *dev, void *buf, unsigned int len)
 		pr_debug("%s: short packet %i\n", dev->name, len);
 		dev->stats.rx_length_errors++;
 		if (vi->mergeable_rx_bufs || vi->big_packets)
-			give_pages(vi, buf);
+			give_pages(rq, buf);
 		else
 			dev_kfree_skb(buf);
 		return;
@@ -285,14 +304,14 @@ static void receive_buf(struct net_device *dev, void *buf, unsigned int len)
 		skb_trim(skb, len);
 	} else {
 		page = buf;
-		skb = page_to_skb(vi, page, len);
+		skb = page_to_skb(rq, page, len);
 		if (unlikely(!skb)) {
 			dev->stats.rx_dropped++;
-			give_pages(vi, page);
+			give_pages(rq, page);
 			return;
 		}
 		if (vi->mergeable_rx_bufs)
-			if (receive_mergeable(vi, skb)) {
+			if (receive_mergeable(rq, skb)) {
 				dev_kfree_skb(skb);
 				return;
 			}
@@ -359,8 +378,9 @@ frame_err:
 	dev_kfree_skb(skb);
 }
 
-static int add_recvbuf_small(struct virtnet_info *vi, gfp_t gfp)
+static int add_recvbuf_small(struct receive_queue *rq, gfp_t gfp)
 {
+	struct virtnet_info *vi = rq->vq->vdev->priv;
 	struct sk_buff *skb;
 	struct skb_vnet_hdr *hdr;
 	int err;
@@ -372,77 +392,77 @@ static int add_recvbuf_small(struct virtnet_info *vi, gfp_t gfp)
 	skb_put(skb, MAX_PACKET_LEN);
 
 	hdr = skb_vnet_hdr(skb);
-	sg_set_buf(vi->rx_sg, &hdr->hdr, sizeof hdr->hdr);
+	sg_set_buf(rq->sg, &hdr->hdr, sizeof hdr->hdr);
 
-	skb_to_sgvec(skb, vi->rx_sg + 1, 0, skb->len);
+	skb_to_sgvec(skb, rq->sg + 1, 0, skb->len);
 
-	err = virtqueue_add_buf(vi->rvq, vi->rx_sg, 0, 2, skb, gfp);
+	err = virtqueue_add_buf(rq->vq, rq->sg, 0, 2, skb, gfp);
 	if (err < 0)
 		dev_kfree_skb(skb);
 
 	return err;
 }
 
-static int add_recvbuf_big(struct virtnet_info *vi, gfp_t gfp)
+static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp)
 {
 	struct page *first, *list = NULL;
 	char *p;
 	int i, err, offset;
 
-	/* page in vi->rx_sg[MAX_SKB_FRAGS + 1] is list tail */
+	/* page in rq->sg[MAX_SKB_FRAGS + 1] is list tail */
 	for (i = MAX_SKB_FRAGS + 1; i > 1; --i) {
-		first = get_a_page(vi, gfp);
+		first = get_a_page(rq, gfp);
 		if (!first) {
 			if (list)
-				give_pages(vi, list);
+				give_pages(rq, list);
 			return -ENOMEM;
 		}
-		sg_set_buf(&vi->rx_sg[i], page_address(first), PAGE_SIZE);
+		sg_set_buf(&rq->sg[i], page_address(first), PAGE_SIZE);
 
 		/* chain new page in list head to match sg */
 		first->private = (unsigned long)list;
 		list = first;
 	}
 
-	first = get_a_page(vi, gfp);
+	first = get_a_page(rq, gfp);
 	if (!first) {
-		give_pages(vi, list);
+		give_pages(rq, list);
 		return -ENOMEM;
 	}
 	p = page_address(first);
 
-	/* vi->rx_sg[0], vi->rx_sg[1] share the same page */
-	/* a separated vi->rx_sg[0] for virtio_net_hdr only due to QEMU bug */
-	sg_set_buf(&vi->rx_sg[0], p, sizeof(struct virtio_net_hdr));
+	/* rq->sg[0], rq->sg[1] share the same page */
+	/* a separated rq->sg[0] for virtio_net_hdr only due to QEMU bug */
+	sg_set_buf(&rq->sg[0], p, sizeof(struct virtio_net_hdr));
 
-	/* vi->rx_sg[1] for data packet, from offset */
+	/* rq->sg[1] for data packet, from offset */
 	offset = sizeof(struct padded_vnet_hdr);
-	sg_set_buf(&vi->rx_sg[1], p + offset, PAGE_SIZE - offset);
+	sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset);
 
 	/* chain first in list head */
 	first->private = (unsigned long)list;
-	err = virtqueue_add_buf(vi->rvq, vi->rx_sg, 0, MAX_SKB_FRAGS + 2,
+	err = virtqueue_add_buf(rq->vq, rq->sg, 0, MAX_SKB_FRAGS + 2,
 				first, gfp);
 	if (err < 0)
-		give_pages(vi, first);
+		give_pages(rq, first);
 
 	return err;
 }
 
-static int add_recvbuf_mergeable(struct virtnet_info *vi, gfp_t gfp)
+static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
 {
 	struct page *page;
 	int err;
 
-	page = get_a_page(vi, gfp);
+	page = get_a_page(rq, gfp);
 	if (!page)
 		return -ENOMEM;
 
-	sg_init_one(vi->rx_sg, page_address(page), PAGE_SIZE);
+	sg_init_one(rq->sg, page_address(page), PAGE_SIZE);
 
-	err = virtqueue_add_buf(vi->rvq, vi->rx_sg, 0, 1, page, gfp);
+	err = virtqueue_add_buf(rq->vq, rq->sg, 0, 1, page, gfp);
 	if (err < 0)
-		give_pages(vi, page);
+		give_pages(rq, page);
 
 	return err;
 }
@@ -454,65 +474,68 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, gfp_t gfp)
  * before we're receiving packets, or from refill_work which is
  * careful to disable receiving (using napi_disable).
  */
-static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp)
+static bool try_fill_recv(struct receive_queue *rq, gfp_t gfp)
 {
+	struct virtnet_info *vi = rq->vq->vdev->priv;
 	int err;
 	bool oom;
 
 	do {
 		if (vi->mergeable_rx_bufs)
-			err = add_recvbuf_mergeable(vi, gfp);
+			err = add_recvbuf_mergeable(rq, gfp);
 		else if (vi->big_packets)
-			err = add_recvbuf_big(vi, gfp);
+			err = add_recvbuf_big(rq, gfp);
 		else
-			err = add_recvbuf_small(vi, gfp);
+			err = add_recvbuf_small(rq, gfp);
 
 		oom = err == -ENOMEM;
 		if (err < 0)
 			break;
-		++vi->num;
+		++rq->num;
 	} while (err > 0);
-	if (unlikely(vi->num > vi->max))
-		vi->max = vi->num;
-	virtqueue_kick(vi->rvq);
+	if (unlikely(rq->num > rq->max))
+		rq->max = rq->num;
+	virtqueue_kick(rq->vq);
 	return !oom;
 }
 
 static void skb_recv_done(struct virtqueue *rvq)
 {
 	struct virtnet_info *vi = rvq->vdev->priv;
+	struct receive_queue *rq = &vi->rq;
+
 	/* Schedule NAPI, Suppress further interrupts if successful. */
-	if (napi_schedule_prep(&vi->napi)) {
+	if (napi_schedule_prep(&rq->napi)) {
 		virtqueue_disable_cb(rvq);
-		__napi_schedule(&vi->napi);
+		__napi_schedule(&rq->napi);
 	}
 }
 
-static void virtnet_napi_enable(struct virtnet_info *vi)
+static void virtnet_napi_enable(struct receive_queue *rq)
 {
-	napi_enable(&vi->napi);
+	napi_enable(&rq->napi);
 
 	/* If all buffers were filled by other side before we napi_enabled, we
 	 * won't get another interrupt, so process any outstanding packets
 	 * now.  virtnet_poll wants re-enable the queue, so we disable here.
 	 * We synchronize against interrupts via NAPI_STATE_SCHED */
-	if (napi_schedule_prep(&vi->napi)) {
-		virtqueue_disable_cb(vi->rvq);
+	if (napi_schedule_prep(&rq->napi)) {
+		virtqueue_disable_cb(rq->vq);
 		local_bh_disable();
-		__napi_schedule(&vi->napi);
+		__napi_schedule(&rq->napi);
 		local_bh_enable();
 	}
 }
 
 static void refill_work(struct work_struct *work)
 {
-	struct virtnet_info *vi;
+	struct virtnet_info *vi =
+		container_of(work, struct virtnet_info, refill.work);
 	bool still_empty;
 
-	vi = container_of(work, struct virtnet_info, refill.work);
-	napi_disable(&vi->napi);
-	still_empty = !try_fill_recv(vi, GFP_KERNEL);
-	virtnet_napi_enable(vi);
+	napi_disable(&vi->rq.napi);
+	still_empty = !try_fill_recv(&vi->rq, GFP_KERNEL);
+	virtnet_napi_enable(&vi->rq);
 
 	/* In theory, this can happen: if we don't get any buffers in
 	 * we will *never* try to fill again. */
@@ -522,29 +545,31 @@ static void refill_work(struct work_struct *work)
 
 static int virtnet_poll(struct napi_struct *napi, int budget)
 {
-	struct virtnet_info *vi = container_of(napi, struct virtnet_info, napi);
+	struct receive_queue *rq =
+		container_of(napi, struct receive_queue, napi);
+	struct virtnet_info *vi = rq->vq->vdev->priv;
 	void *buf;
 	unsigned int len, received = 0;
 
 again:
 	while (received < budget &&
-	       (buf = virtqueue_get_buf(vi->rvq, &len)) != NULL) {
-		receive_buf(vi->dev, buf, len);
-		--vi->num;
+	       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
+		receive_buf(rq, buf, len);
+		--rq->num;
 		received++;
 	}
 
-	if (vi->num < vi->max / 2) {
-		if (!try_fill_recv(vi, GFP_ATOMIC))
+	if (rq->num < rq->max / 2) {
+		if (!try_fill_recv(rq, GFP_ATOMIC))
 			schedule_delayed_work(&vi->refill, 0);
 	}
 
 	/* Out of packets? */
 	if (received < budget) {
 		napi_complete(napi);
-		if (unlikely(!virtqueue_enable_cb(vi->rvq)) &&
+		if (unlikely(!virtqueue_enable_cb(rq->vq)) &&
 		    napi_schedule_prep(napi)) {
-			virtqueue_disable_cb(vi->rvq);
+			virtqueue_disable_cb(rq->vq);
 			__napi_schedule(napi);
 			goto again;
 		}
@@ -553,13 +578,14 @@ again:
 	return received;
 }
 
-static unsigned int free_old_xmit_skbs(struct virtnet_info *vi)
+static unsigned int free_old_xmit_skbs(struct send_queue *sq)
 {
 	struct sk_buff *skb;
 	unsigned int len, tot_sgs = 0;
+	struct virtnet_info *vi = sq->vq->vdev->priv;
 	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
 
-	while ((skb = virtqueue_get_buf(vi->svq, &len)) != NULL) {
+	while ((skb = virtqueue_get_buf(sq->vq, &len)) != NULL) {
 		pr_debug("Sent skb %p\n", skb);
 
 		u64_stats_update_begin(&stats->tx_syncp);
@@ -573,10 +599,11 @@ static unsigned int free_old_xmit_skbs(struct virtnet_info *vi)
 	return tot_sgs;
 }
 
-static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb)
+static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
 {
 	struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb);
 	const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
+	struct virtnet_info *vi = sq->vq->vdev->priv;
 
 	pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);
 
@@ -611,25 +638,26 @@ static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb)
 
 	/* Encode metadata header at front. */
 	if (vi->mergeable_rx_bufs)
-		sg_set_buf(vi->tx_sg, &hdr->mhdr, sizeof hdr->mhdr);
+		sg_set_buf(sq->sg, &hdr->mhdr, sizeof hdr->mhdr);
 	else
-		sg_set_buf(vi->tx_sg, &hdr->hdr, sizeof hdr->hdr);
+		sg_set_buf(sq->sg, &hdr->hdr, sizeof hdr->hdr);
 
-	hdr->num_sg = skb_to_sgvec(skb, vi->tx_sg + 1, 0, skb->len) + 1;
-	return virtqueue_add_buf(vi->svq, vi->tx_sg, hdr->num_sg,
+	hdr->num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1;
+	return virtqueue_add_buf(sq->vq, sq->sg, hdr->num_sg,
 				 0, skb, GFP_ATOMIC);
 }
 
 static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
+	struct send_queue *sq = &vi->sq;
 	int capacity;
 
 	/* Free up any pending old buffers before queueing new ones. */
-	free_old_xmit_skbs(vi);
+	free_old_xmit_skbs(sq);
 
 	/* Try to transmit */
-	capacity = xmit_skb(vi, skb);
+	capacity = xmit_skb(sq, skb);
 
 	/* This can happen with OOM and indirect buffers. */
 	if (unlikely(capacity < 0)) {
@@ -648,7 +676,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 		kfree_skb(skb);
 		return NETDEV_TX_OK;
 	}
-	virtqueue_kick(vi->svq);
+	virtqueue_kick(sq->vq);
 
 	/* Don't wait up for transmitted skbs to be freed. */
 	skb_orphan(skb);
@@ -658,12 +686,12 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 	 * before it gets out of hand.  Naturally, this wastes entries. */
 	if (capacity < 2+MAX_SKB_FRAGS) {
 		netif_stop_queue(dev);
-		if (unlikely(!virtqueue_enable_cb_delayed(vi->svq))) {
+		if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
 			/* More just got used, free them then recheck. */
-			capacity += free_old_xmit_skbs(vi);
+			capacity += free_old_xmit_skbs(sq);
 			if (capacity >= 2+MAX_SKB_FRAGS) {
 				netif_start_queue(dev);
-				virtqueue_disable_cb(vi->svq);
+				virtqueue_disable_cb(sq->vq);
 			}
 		}
 	}
@@ -731,7 +759,7 @@ static void virtnet_netpoll(struct net_device *dev)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
 
-	napi_schedule(&vi->napi);
+	napi_schedule(&vi->rq.napi);
 }
 #endif
 
@@ -740,10 +768,10 @@ static int virtnet_open(struct net_device *dev)
 	struct virtnet_info *vi = netdev_priv(dev);
 
 	/* Make sure we have some buffers: if oom use wq. */
-	if (!try_fill_recv(vi, GFP_KERNEL))
+	if (!try_fill_recv(&vi->rq, GFP_KERNEL))
 		schedule_delayed_work(&vi->refill, 0);
 
-	virtnet_napi_enable(vi);
+	virtnet_napi_enable(&vi->rq);
 	return 0;
 }
 
@@ -808,7 +836,7 @@ static int virtnet_close(struct net_device *dev)
 
 	/* Make sure refill_work doesn't re-enable napi! */
 	cancel_delayed_work_sync(&vi->refill);
-	napi_disable(&vi->napi);
+	napi_disable(&vi->rq.napi);
 
 	return 0;
 }
@@ -920,11 +948,10 @@ static void virtnet_get_ringparam(struct net_device *dev,
 {
 	struct virtnet_info *vi = netdev_priv(dev);
 
-	ring->rx_max_pending = virtqueue_get_vring_size(vi->rvq);
-	ring->tx_max_pending = virtqueue_get_vring_size(vi->svq);
+	ring->rx_max_pending = virtqueue_get_vring_size(vi->rq.vq);
+	ring->tx_max_pending = virtqueue_get_vring_size(vi->sq.vq);
 	ring->rx_pending = ring->rx_max_pending;
 	ring->tx_pending = ring->tx_max_pending;
-
 }
 
 
@@ -1019,6 +1046,13 @@ static void virtnet_config_changed(struct virtio_device *vdev)
 	schedule_work(&vi->config_work);
 }
 
+static void virtnet_del_vqs(struct virtnet_info *vi)
+{
+	struct virtio_device *vdev = vi->vdev;
+
+	vdev->config->del_vqs(vdev);
+}
+
 static int init_vqs(struct virtnet_info *vi)
 {
 	struct virtqueue *vqs[3];
@@ -1034,8 +1068,8 @@ static int init_vqs(struct virtnet_info *vi)
 	if (err)
 		return err;
 
-	vi->rvq = vqs[0];
-	vi->svq = vqs[1];
+	vi->rq.vq = vqs[0];
+	vi->sq.vq = vqs[1];
 
 	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)) {
 		vi->cvq = vqs[2];
@@ -1099,11 +1133,11 @@ static int virtnet_probe(struct virtio_device *vdev)
 
 	/* Set up our device-specific information */
 	vi = netdev_priv(dev);
-	netif_napi_add(dev, &vi->napi, virtnet_poll, napi_weight);
+	netif_napi_add(dev, &vi->rq.napi, virtnet_poll, napi_weight);
 	vi->dev = dev;
 	vi->vdev = vdev;
 	vdev->priv = vi;
-	vi->pages = NULL;
+	vi->rq.pages = NULL;
 	vi->stats = alloc_percpu(struct virtnet_stats);
 	err = -ENOMEM;
 	if (vi->stats == NULL)
@@ -1113,8 +1147,8 @@ static int virtnet_probe(struct virtio_device *vdev)
 	mutex_init(&vi->config_lock);
 	vi->config_enable = true;
 	INIT_WORK(&vi->config_work, virtnet_config_changed_work);
-	sg_init_table(vi->rx_sg, ARRAY_SIZE(vi->rx_sg));
-	sg_init_table(vi->tx_sg, ARRAY_SIZE(vi->tx_sg));
+	sg_init_table(vi->rq.sg, ARRAY_SIZE(vi->rq.sg));
+	sg_init_table(vi->sq.sg, ARRAY_SIZE(vi->sq.sg));
 
 	/* If we can receive ANY GSO packets, we must allocate large ones. */
 	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
@@ -1136,10 +1170,10 @@ static int virtnet_probe(struct virtio_device *vdev)
 	}
 
 	/* Last of all, set up some receive buffers. */
-	try_fill_recv(vi, GFP_KERNEL);
+	try_fill_recv(&vi->rq, GFP_KERNEL);
 
 	/* If we didn't even get one input buffer, we're useless. */
-	if (vi->num == 0) {
+	if (vi->rq.num == 0) {
 		err = -ENOMEM;
 		goto unregister;
 	}
@@ -1160,7 +1194,7 @@ static int virtnet_probe(struct virtio_device *vdev)
 unregister:
 	unregister_netdev(dev);
 free_vqs:
-	vdev->config->del_vqs(vdev);
+	virtnet_del_vqs(vi);
 free_stats:
 	free_percpu(vi->stats);
 free:
@@ -1172,22 +1206,22 @@ static void free_unused_bufs(struct virtnet_info *vi)
 {
 	void *buf;
 	while (1) {
-		buf = virtqueue_detach_unused_buf(vi->svq);
+		buf = virtqueue_detach_unused_buf(vi->sq.vq);
 		if (!buf)
 			break;
 		dev_kfree_skb(buf);
 	}
 	while (1) {
-		buf = virtqueue_detach_unused_buf(vi->rvq);
+		buf = virtqueue_detach_unused_buf(vi->rq.vq);
 		if (!buf)
 			break;
 		if (vi->mergeable_rx_bufs || vi->big_packets)
-			give_pages(vi, buf);
+			give_pages(&vi->rq, buf);
 		else
 			dev_kfree_skb(buf);
-		--vi->num;
+		--vi->rq.num;
 	}
-	BUG_ON(vi->num != 0);
+	BUG_ON(vi->rq.num != 0);
 }
 
 static void remove_vq_common(struct virtnet_info *vi)
@@ -1197,10 +1231,10 @@ static void remove_vq_common(struct virtnet_info *vi)
 	/* Free unused buffers in both send and recv, if any. */
 	free_unused_bufs(vi);
 
-	vi->vdev->config->del_vqs(vi->vdev);
+	virtnet_del_vqs(vi);
 
-	while (vi->pages)
-		__free_pages(get_a_page(vi, GFP_KERNEL), 0);
+	while (vi->rq.pages)
+		__free_pages(get_a_page(&vi->rq, GFP_KERNEL), 0);
 }
 
 static void virtnet_remove(struct virtio_device *vdev)
@@ -1236,7 +1270,7 @@ static int virtnet_freeze(struct virtio_device *vdev)
 	cancel_delayed_work_sync(&vi->refill);
 
 	if (netif_running(vi->dev))
-		napi_disable(&vi->napi);
+		napi_disable(&vi->rq.napi);
 
 	remove_vq_common(vi);
 
@@ -1255,11 +1289,11 @@ static int virtnet_restore(struct virtio_device *vdev)
 		return err;
 
 	if (netif_running(vi->dev))
-		virtnet_napi_enable(vi);
+		virtnet_napi_enable(&vi->rq);
 
 	netif_device_attach(vi->dev);
 
-	if (!try_fill_recv(vi, GFP_KERNEL))
+	if (!try_fill_recv(&vi->rq, GFP_KERNEL))
 		schedule_delayed_work(&vi->refill, 0);
 
 	mutex_lock(&vi->config_lock);
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next v3 2/3] virtio_net: multiqueue support
From: Jason Wang @ 2012-12-07 17:04 UTC (permalink / raw)
  To: mst, rusty, davem, virtualization, netdev, linux-kernel
  Cc: krkumar2, kvm, bhutchings, jwhan, shiyer
In-Reply-To: <1354899897-10423-1-git-send-email-jasowang@redhat.com>

This patch adds the multiqueue (VIRTIO_NET_F_MQ) support to virtio_net
driver. VIRTIO_NET_F_MQ capable device could allow the driver to do packet
transmission and reception through multiple queue pairs and does the packet
steering to get better performance. By default, only one queue pair is used; the
user can change the number of queue pairs via ethtool in the next patch.

When multiple queue pairs are used and the number of queue pairs is equal to the
number of vcpus, the driver does the following optimizations to implement
per-cpu virt queue pairs:

- select the txq based on the smp processor id.
- smp affinity hint to the cpu that owns the queue pairs.

This could be used with the flow steering support of the device to guarantee
that the packets of a single flow are handled by the same cpu.

Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/virtio_net.c        |  473 +++++++++++++++++++++++++++++++--------
 include/uapi/linux/virtio_net.h |   27 +++
 2 files changed, 402 insertions(+), 98 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 02a7102..c083048 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -58,6 +58,9 @@ struct send_queue {
 
 	/* TX: fragments + linear part + virtio header */
 	struct scatterlist sg[MAX_SKB_FRAGS + 2];
+
+	/* Name of the send queue: output.$index */
+	char name[40];
 };
 
 /* Internal representation of a receive virtqueue */
@@ -75,22 +78,34 @@ struct receive_queue {
 
 	/* RX: fragments + linear part + virtio header */
 	struct scatterlist sg[MAX_SKB_FRAGS + 2];
+
+	/* Name of this receive queue: input.$index */
+	char name[40];
 };
 
 struct virtnet_info {
 	struct virtio_device *vdev;
 	struct virtqueue *cvq;
 	struct net_device *dev;
-	struct send_queue sq;
-	struct receive_queue rq;
+	struct send_queue *sq;
+	struct receive_queue *rq;
 	unsigned int status;
 
+	/* Max # of queue pairs supported by the device */
+	u16 max_queue_pairs;
+
+	/* # of queue pairs currently used by the driver */
+	u16 curr_queue_pairs;
+
 	/* I like... big packets and I cannot lie! */
 	bool big_packets;
 
 	/* Host will merge rx buffers for big packets (shake it! shake it!) */
 	bool mergeable_rx_bufs;
 
+	/* Has control virtqueue */
+	bool has_cvq;
+
 	/* enable config space updates */
 	bool config_enable;
 
@@ -105,6 +120,9 @@ struct virtnet_info {
 
 	/* Lock for config space updates */
 	struct mutex config_lock;
+
+	/* Does the affinity hint is set for virtqueues? */
+	bool affinity_hint_set;
 };
 
 struct skb_vnet_hdr {
@@ -125,6 +143,29 @@ struct padded_vnet_hdr {
 	char padding[6];
 };
 
+/* Converting between virtqueue no. and kernel tx/rx queue no.
+ * 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq
+ */
+static int vq2txq(struct virtqueue *vq)
+{
+	return (virtqueue_get_queue_index(vq) - 1) / 2;
+}
+
+static int txq2vq(int txq)
+{
+	return txq * 2 + 1;
+}
+
+static int vq2rxq(struct virtqueue *vq)
+{
+	return virtqueue_get_queue_index(vq) / 2;
+}
+
+static int rxq2vq(int rxq)
+{
+	return rxq * 2;
+}
+
 static inline struct skb_vnet_hdr *skb_vnet_hdr(struct sk_buff *skb)
 {
 	return (struct skb_vnet_hdr *)skb->cb;
@@ -165,7 +206,7 @@ static void skb_xmit_done(struct virtqueue *vq)
 	virtqueue_disable_cb(vq);
 
 	/* We were probably waiting for more output buffers. */
-	netif_wake_queue(vi->dev);
+	netif_wake_subqueue(vi->dev, vq2txq(vq));
 }
 
 static void set_skb_frag(struct sk_buff *skb, struct page *page,
@@ -502,7 +543,7 @@ static bool try_fill_recv(struct receive_queue *rq, gfp_t gfp)
 static void skb_recv_done(struct virtqueue *rvq)
 {
 	struct virtnet_info *vi = rvq->vdev->priv;
-	struct receive_queue *rq = &vi->rq;
+	struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];
 
 	/* Schedule NAPI, Suppress further interrupts if successful. */
 	if (napi_schedule_prep(&rq->napi)) {
@@ -532,15 +573,21 @@ static void refill_work(struct work_struct *work)
 	struct virtnet_info *vi =
 		container_of(work, struct virtnet_info, refill.work);
 	bool still_empty;
+	int i;
+
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		struct receive_queue *rq = &vi->rq[i];
 
-	napi_disable(&vi->rq.napi);
-	still_empty = !try_fill_recv(&vi->rq, GFP_KERNEL);
-	virtnet_napi_enable(&vi->rq);
+		napi_disable(&rq->napi);
+		still_empty = !try_fill_recv(rq, GFP_KERNEL);
+		virtnet_napi_enable(rq);
 
-	/* In theory, this can happen: if we don't get any buffers in
-	 * we will *never* try to fill again. */
-	if (still_empty)
-		schedule_delayed_work(&vi->refill, HZ/2);
+		/* In theory, this can happen: if we don't get any buffers in
+		 * we will *never* try to fill again.
+		 */
+		if (still_empty)
+			schedule_delayed_work(&vi->refill, HZ/2);
+	}
 }
 
 static int virtnet_poll(struct napi_struct *napi, int budget)
@@ -578,6 +625,21 @@ again:
 	return received;
 }
 
+static int virtnet_open(struct net_device *dev)
+{
+	struct virtnet_info *vi = netdev_priv(dev);
+	int i;
+
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		/* Make sure we have some buffers: if oom use wq. */
+		if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
+			schedule_delayed_work(&vi->refill, 0);
+		virtnet_napi_enable(&vi->rq[i]);
+	}
+
+	return 0;
+}
+
 static unsigned int free_old_xmit_skbs(struct send_queue *sq)
 {
 	struct sk_buff *skb;
@@ -650,7 +712,8 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
 static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
-	struct send_queue *sq = &vi->sq;
+	int qnum = skb_get_queue_mapping(skb);
+	struct send_queue *sq = &vi->sq[qnum];
 	int capacity;
 
 	/* Free up any pending old buffers before queueing new ones. */
@@ -664,13 +727,14 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 		if (likely(capacity == -ENOMEM)) {
 			if (net_ratelimit())
 				dev_warn(&dev->dev,
-					 "TX queue failure: out of memory\n");
+					 "TXQ (%d) failure: out of memory\n",
+					 qnum);
 		} else {
 			dev->stats.tx_fifo_errors++;
 			if (net_ratelimit())
 				dev_warn(&dev->dev,
-					 "Unexpected TX queue failure: %d\n",
-					 capacity);
+					 "Unexpected TXQ (%d) failure: %d\n",
+					 qnum, capacity);
 		}
 		dev->stats.tx_dropped++;
 		kfree_skb(skb);
@@ -685,12 +749,12 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 	/* Apparently nice girls don't return TX_BUSY; stop the queue
 	 * before it gets out of hand.  Naturally, this wastes entries. */
 	if (capacity < 2+MAX_SKB_FRAGS) {
-		netif_stop_queue(dev);
+		netif_stop_subqueue(dev, qnum);
 		if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
 			/* More just got used, free them then recheck. */
 			capacity += free_old_xmit_skbs(sq);
 			if (capacity >= 2+MAX_SKB_FRAGS) {
-				netif_start_queue(dev);
+				netif_start_subqueue(dev, qnum);
 				virtqueue_disable_cb(sq->vq);
 			}
 		}
@@ -758,23 +822,13 @@ static struct rtnl_link_stats64 *virtnet_stats(struct net_device *dev,
 static void virtnet_netpoll(struct net_device *dev)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
+	int i;
 
-	napi_schedule(&vi->rq.napi);
+	for (i = 0; i < vi->curr_queue_pairs; i++)
+		napi_schedule(&vi->rq[i].napi);
 }
 #endif
 
-static int virtnet_open(struct net_device *dev)
-{
-	struct virtnet_info *vi = netdev_priv(dev);
-
-	/* Make sure we have some buffers: if oom use wq. */
-	if (!try_fill_recv(&vi->rq, GFP_KERNEL))
-		schedule_delayed_work(&vi->refill, 0);
-
-	virtnet_napi_enable(&vi->rq);
-	return 0;
-}
-
 /*
  * Send command via the control virtqueue and check status.  Commands
  * supported by the hypervisor, as indicated by feature bits, should
@@ -830,13 +884,39 @@ static void virtnet_ack_link_announce(struct virtnet_info *vi)
 	rtnl_unlock();
 }
 
+static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
+{
+	struct scatterlist sg;
+	struct virtio_net_ctrl_mq s;
+	struct net_device *dev = vi->dev;
+
+	if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
+		return 0;
+
+	s.virtqueue_pairs = queue_pairs;
+	sg_init_one(&sg, &s, sizeof(s));
+
+	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
+				  VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg, 1, 0)){
+		dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n",
+			 queue_pairs);
+		return -EINVAL;
+	} else
+		vi->curr_queue_pairs = queue_pairs;
+
+	return 0;
+}
+
 static int virtnet_close(struct net_device *dev)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
+	int i;
 
 	/* Make sure refill_work doesn't re-enable napi! */
 	cancel_delayed_work_sync(&vi->refill);
-	napi_disable(&vi->rq.napi);
+
+	for (i = 0; i < vi->max_queue_pairs; i++)
+		napi_disable(&vi->rq[i].napi);
 
 	return 0;
 }
@@ -943,13 +1023,41 @@ static int virtnet_vlan_rx_kill_vid(struct net_device *dev, u16 vid)
 	return 0;
 }
 
+static void virtnet_set_affinity(struct virtnet_info *vi, bool set)
+{
+	int i;
+
+	/* In multiqueue mode, when the number of cpu is equal to the number of
+	 * queue pairs, we let the queue pairs to be private to one cpu by
+	 * setting the affinity hint to eliminate the contention.
+	 */
+	if ((vi->curr_queue_pairs == 1 ||
+	     vi->max_queue_pairs != num_online_cpus()) && set) {
+		if (vi->affinity_hint_set)
+			set = false;
+		else
+			return;
+	}
+
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		int cpu = set ? i : -1;
+		virtqueue_set_affinity(vi->rq[i].vq, cpu);
+		virtqueue_set_affinity(vi->sq[i].vq, cpu);
+	}
+
+	if (set)
+		vi->affinity_hint_set = true;
+	else
+		vi->affinity_hint_set = false;
+}
+
 static void virtnet_get_ringparam(struct net_device *dev,
 				struct ethtool_ringparam *ring)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
 
-	ring->rx_max_pending = virtqueue_get_vring_size(vi->rq.vq);
-	ring->tx_max_pending = virtqueue_get_vring_size(vi->sq.vq);
+	ring->rx_max_pending = virtqueue_get_vring_size(vi->rq[0].vq);
+	ring->tx_max_pending = virtqueue_get_vring_size(vi->sq[0].vq);
 	ring->rx_pending = ring->rx_max_pending;
 	ring->tx_pending = ring->tx_max_pending;
 }
@@ -984,6 +1092,21 @@ static int virtnet_change_mtu(struct net_device *dev, int new_mtu)
 	return 0;
 }
 
+/* To avoid contending a lock hold by a vcpu who would exit to host, select the
+ * txq based on the processor id.
+ * TODO: handle cpu hotplug.
+ */
+static u16 virtnet_select_queue(struct net_device *dev, struct sk_buff *skb)
+{
+	int txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) :
+		  smp_processor_id();
+
+	while (unlikely(txq >= dev->real_num_tx_queues))
+		txq -= dev->real_num_tx_queues;
+
+	return txq;
+}
+
 static const struct net_device_ops virtnet_netdev = {
 	.ndo_open            = virtnet_open,
 	.ndo_stop   	     = virtnet_close,
@@ -995,6 +1118,7 @@ static const struct net_device_ops virtnet_netdev = {
 	.ndo_get_stats64     = virtnet_stats,
 	.ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
+	.ndo_select_queue     = virtnet_select_queue,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller = virtnet_netpoll,
 #endif
@@ -1030,10 +1154,10 @@ static void virtnet_config_changed_work(struct work_struct *work)
 
 	if (vi->status & VIRTIO_NET_S_LINK_UP) {
 		netif_carrier_on(vi->dev);
-		netif_wake_queue(vi->dev);
+		netif_tx_wake_all_queues(vi->dev);
 	} else {
 		netif_carrier_off(vi->dev);
-		netif_stop_queue(vi->dev);
+		netif_tx_stop_all_queues(vi->dev);
 	}
 done:
 	mutex_unlock(&vi->config_lock);
@@ -1046,48 +1170,203 @@ static void virtnet_config_changed(struct virtio_device *vdev)
 	schedule_work(&vi->config_work);
 }
 
+static void virtnet_free_queues(struct virtnet_info *vi)
+{
+	kfree(vi->rq);
+	kfree(vi->sq);
+}
+
+static void free_receive_bufs(struct virtnet_info *vi)
+{
+	int i;
+
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		while (vi->rq[i].pages)
+			__free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);
+	}
+}
+
+static void free_unused_bufs(struct virtnet_info *vi)
+{
+	void *buf;
+	int i;
+
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		struct virtqueue *vq = vi->sq[i].vq;
+		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
+			dev_kfree_skb(buf);
+	}
+
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		struct virtqueue *vq = vi->rq[i].vq;
+
+		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
+			if (vi->mergeable_rx_bufs || vi->big_packets)
+				give_pages(&vi->rq[i], buf);
+			else
+				dev_kfree_skb(buf);
+			--vi->rq[i].num;
+		}
+		BUG_ON(vi->rq[i].num != 0);
+	}
+}
+
 static void virtnet_del_vqs(struct virtnet_info *vi)
 {
 	struct virtio_device *vdev = vi->vdev;
 
+	virtnet_set_affinity(vi, false);
+
 	vdev->config->del_vqs(vdev);
+
+	virtnet_free_queues(vi);
 }
 
-static int init_vqs(struct virtnet_info *vi)
+static int virtnet_find_vqs(struct virtnet_info *vi)
 {
-	struct virtqueue *vqs[3];
-	vq_callback_t *callbacks[] = { skb_recv_done, skb_xmit_done, NULL};
-	const char *names[] = { "input", "output", "control" };
-	int nvqs, err;
-
-	/* We expect two virtqueues, receive then send,
-	 * and optionally control. */
-	nvqs = virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ) ? 3 : 2;
-
-	err = vi->vdev->config->find_vqs(vi->vdev, nvqs, vqs, callbacks, names);
-	if (err)
-		return err;
+	vq_callback_t **callbacks;
+	struct virtqueue **vqs;
+	int ret = -ENOMEM;
+	int i, total_vqs;
+	const char **names;
+
+	/* We expect 1 RX virtqueue followed by 1 TX virtqueue, followed by
+	 * possible N-1 RX/TX queue pairs used in multiqueue mode, followed by
+	 * possible control vq.
+	 */
+	total_vqs = vi->max_queue_pairs * 2 +
+		    virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ);
+
+	/* Allocate space for find_vqs parameters */
+	vqs = kzalloc(total_vqs * sizeof(*vqs), GFP_KERNEL);
+	if (!vqs)
+		goto err_vq;
+	callbacks = kmalloc(total_vqs * sizeof(*callbacks), GFP_KERNEL);
+	if (!callbacks)
+		goto err_callback;
+	names = kmalloc(total_vqs * sizeof(*names), GFP_KERNEL);
+	if (!names)
+		goto err_names;
+
+	/* Parameters for control virtqueue, if any */
+	if (vi->has_cvq) {
+		callbacks[total_vqs - 1] = NULL;
+		names[total_vqs - 1] = "control";
+	}
 
-	vi->rq.vq = vqs[0];
-	vi->sq.vq = vqs[1];
+	/* Allocate/initialize parameters for send/receive virtqueues */
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		callbacks[rxq2vq(i)] = skb_recv_done;
+		callbacks[txq2vq(i)] = skb_xmit_done;
+		sprintf(vi->rq[i].name, "input.%d", i);
+		sprintf(vi->sq[i].name, "output.%d", i);
+		names[rxq2vq(i)] = vi->rq[i].name;
+		names[txq2vq(i)] = vi->sq[i].name;
+	}
 
-	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)) {
-		vi->cvq = vqs[2];
+	ret = vi->vdev->config->find_vqs(vi->vdev, total_vqs, vqs, callbacks,
+					 names);
+	if (ret)
+		goto err_find;
 
+	if (vi->has_cvq) {
+		vi->cvq = vqs[total_vqs - 1];
 		if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
 			vi->dev->features |= NETIF_F_HW_VLAN_FILTER;
 	}
+
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		vi->rq[i].vq = vqs[rxq2vq(i)];
+		vi->sq[i].vq = vqs[txq2vq(i)];
+	}
+
+	kfree(names);
+	kfree(callbacks);
+	kfree(vqs);
+
 	return 0;
+
+err_find:
+	kfree(names);
+err_names:
+	kfree(callbacks);
+err_callback:
+	kfree(vqs);
+err_vq:
+	return ret;
+}
+
+static int virtnet_alloc_queues(struct virtnet_info *vi)
+{
+	int i;
+
+	vi->sq = kzalloc(sizeof(*vi->sq) * vi->max_queue_pairs, GFP_KERNEL);
+	if (!vi->sq)
+		goto err_sq;
+	vi->rq = kzalloc(sizeof(*vi->rq) * vi->max_queue_pairs, GFP_KERNEL);
+	if (!vi->rq)
+		goto err_rq;
+
+	INIT_DELAYED_WORK(&vi->refill, refill_work);
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		vi->rq[i].pages = NULL;
+		netif_napi_add(vi->dev, &vi->rq[i].napi, virtnet_poll,
+			       napi_weight);
+
+		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
+		sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
+	}
+
+	return 0;
+
+err_rq:
+	kfree(vi->sq);
+err_sq:
+	return -ENOMEM;
+}
+
+static int init_vqs(struct virtnet_info *vi)
+{
+	int ret;
+
+	/* Allocate send & receive queues */
+	ret = virtnet_alloc_queues(vi);
+	if (ret)
+		goto err;
+
+	ret = virtnet_find_vqs(vi);
+	if (ret)
+		goto err_free;
+
+	virtnet_set_affinity(vi, true);
+	return 0;
+
+err_free:
+	virtnet_free_queues(vi);
+err:
+	return ret;
 }
 
 static int virtnet_probe(struct virtio_device *vdev)
 {
-	int err;
+	int i, err;
 	struct net_device *dev;
 	struct virtnet_info *vi;
+	u16 max_queue_pairs;
+
+	/* Find if host supports multiqueue virtio_net device */
+	err = virtio_config_val(vdev, VIRTIO_NET_F_MQ,
+				offsetof(struct virtio_net_config,
+				max_virtqueue_pairs), &max_queue_pairs);
+
+	/* We need at least 2 queue's */
+	if (err || max_queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
+	    max_queue_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX ||
+	    !virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
+		max_queue_pairs = 1;
 
 	/* Allocate ourselves a network device with room for our info */
-	dev = alloc_etherdev(sizeof(struct virtnet_info));
+	dev = alloc_etherdev_mq(sizeof(struct virtnet_info), max_queue_pairs);
 	if (!dev)
 		return -ENOMEM;
 
@@ -1133,22 +1412,17 @@ static int virtnet_probe(struct virtio_device *vdev)
 
 	/* Set up our device-specific information */
 	vi = netdev_priv(dev);
-	netif_napi_add(dev, &vi->rq.napi, virtnet_poll, napi_weight);
 	vi->dev = dev;
 	vi->vdev = vdev;
 	vdev->priv = vi;
-	vi->rq.pages = NULL;
 	vi->stats = alloc_percpu(struct virtnet_stats);
 	err = -ENOMEM;
 	if (vi->stats == NULL)
 		goto free;
 
-	INIT_DELAYED_WORK(&vi->refill, refill_work);
 	mutex_init(&vi->config_lock);
 	vi->config_enable = true;
 	INIT_WORK(&vi->config_work, virtnet_config_changed_work);
-	sg_init_table(vi->rq.sg, ARRAY_SIZE(vi->rq.sg));
-	sg_init_table(vi->sq.sg, ARRAY_SIZE(vi->sq.sg));
 
 	/* If we can receive ANY GSO packets, we must allocate large ones. */
 	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
@@ -1159,10 +1433,21 @@ static int virtnet_probe(struct virtio_device *vdev)
 	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
 		vi->mergeable_rx_bufs = true;
 
+	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
+		vi->has_cvq = true;
+
+	/* Use single tx/rx queue pair as default */
+	vi->curr_queue_pairs = 1;
+	vi->max_queue_pairs = max_queue_pairs;
+
+	/* Allocate/initialize the rx/tx queues, and invoke find_vqs */
 	err = init_vqs(vi);
 	if (err)
 		goto free_stats;
 
+	netif_set_real_num_tx_queues(dev, 1);
+	netif_set_real_num_rx_queues(dev, 1);
+
 	err = register_netdev(dev);
 	if (err) {
 		pr_debug("virtio_net: registering device failed\n");
@@ -1170,12 +1455,15 @@ static int virtnet_probe(struct virtio_device *vdev)
 	}
 
 	/* Last of all, set up some receive buffers. */
-	try_fill_recv(&vi->rq, GFP_KERNEL);
-
-	/* If we didn't even get one input buffer, we're useless. */
-	if (vi->rq.num == 0) {
-		err = -ENOMEM;
-		goto unregister;
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		try_fill_recv(&vi->rq[i], GFP_KERNEL);
+
+		/* If we didn't even get one input buffer, we're useless. */
+		if (vi->rq[i].num == 0) {
+			free_unused_bufs(vi);
+			err = -ENOMEM;
+			goto free_recv_bufs;
+		}
 	}
 
 	/* Assume link up if device can't report link status,
@@ -1188,12 +1476,16 @@ static int virtnet_probe(struct virtio_device *vdev)
 		netif_carrier_on(dev);
 	}
 
-	pr_debug("virtnet: registered device %s\n", dev->name);
+	pr_debug("virtnet: registered device %s with %d RX and TX vq's\n",
+		 dev->name, max_queue_pairs);
+
 	return 0;
 
-unregister:
+free_recv_bufs:
+	free_receive_bufs(vi);
 	unregister_netdev(dev);
 free_vqs:
+	cancel_delayed_work_sync(&vi->refill);
 	virtnet_del_vqs(vi);
 free_stats:
 	free_percpu(vi->stats);
@@ -1202,28 +1494,6 @@ free:
 	return err;
 }
 
-static void free_unused_bufs(struct virtnet_info *vi)
-{
-	void *buf;
-	while (1) {
-		buf = virtqueue_detach_unused_buf(vi->sq.vq);
-		if (!buf)
-			break;
-		dev_kfree_skb(buf);
-	}
-	while (1) {
-		buf = virtqueue_detach_unused_buf(vi->rq.vq);
-		if (!buf)
-			break;
-		if (vi->mergeable_rx_bufs || vi->big_packets)
-			give_pages(&vi->rq, buf);
-		else
-			dev_kfree_skb(buf);
-		--vi->rq.num;
-	}
-	BUG_ON(vi->rq.num != 0);
-}
-
 static void remove_vq_common(struct virtnet_info *vi)
 {
 	vi->vdev->config->reset(vi->vdev);
@@ -1231,10 +1501,9 @@ static void remove_vq_common(struct virtnet_info *vi)
 	/* Free unused buffers in both send and recv, if any. */
 	free_unused_bufs(vi);
 
-	virtnet_del_vqs(vi);
+	free_receive_bufs(vi);
 
-	while (vi->rq.pages)
-		__free_pages(get_a_page(&vi->rq, GFP_KERNEL), 0);
+	virtnet_del_vqs(vi);
 }
 
 static void virtnet_remove(struct virtio_device *vdev)
@@ -1260,6 +1529,7 @@ static void virtnet_remove(struct virtio_device *vdev)
 static int virtnet_freeze(struct virtio_device *vdev)
 {
 	struct virtnet_info *vi = vdev->priv;
+	int i;
 
 	/* Prevent config work handler from accessing the device */
 	mutex_lock(&vi->config_lock);
@@ -1270,7 +1540,10 @@ static int virtnet_freeze(struct virtio_device *vdev)
 	cancel_delayed_work_sync(&vi->refill);
 
 	if (netif_running(vi->dev))
-		napi_disable(&vi->rq.napi);
+		for (i = 0; i < vi->max_queue_pairs; i++) {
+			napi_disable(&vi->rq[i].napi);
+			netif_napi_del(&vi->rq[i].napi);
+		}
 
 	remove_vq_common(vi);
 
@@ -1282,24 +1555,28 @@ static int virtnet_freeze(struct virtio_device *vdev)
 static int virtnet_restore(struct virtio_device *vdev)
 {
 	struct virtnet_info *vi = vdev->priv;
-	int err;
+	int err, i;
 
 	err = init_vqs(vi);
 	if (err)
 		return err;
 
 	if (netif_running(vi->dev))
-		virtnet_napi_enable(&vi->rq);
+		for (i = 0; i < vi->max_queue_pairs; i++)
+			virtnet_napi_enable(&vi->rq[i]);
 
 	netif_device_attach(vi->dev);
 
-	if (!try_fill_recv(&vi->rq, GFP_KERNEL))
-		schedule_delayed_work(&vi->refill, 0);
+	for (i = 0; i < vi->max_queue_pairs; i++)
+		if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
+			schedule_delayed_work(&vi->refill, 0);
 
 	mutex_lock(&vi->config_lock);
 	vi->config_enable = true;
 	mutex_unlock(&vi->config_lock);
 
+	virtnet_set_queues(vi, vi->curr_queue_pairs);
+
 	return 0;
 }
 #endif
@@ -1317,7 +1594,7 @@ static unsigned int features[] = {
 	VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO,
 	VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ,
 	VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN,
-	VIRTIO_NET_F_GUEST_ANNOUNCE,
+	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ,
 };
 
 static struct virtio_driver virtio_net_driver = {
diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h
index 2470f54..848e358 100644
--- a/include/uapi/linux/virtio_net.h
+++ b/include/uapi/linux/virtio_net.h
@@ -51,6 +51,8 @@
 #define VIRTIO_NET_F_CTRL_RX_EXTRA 20	/* Extra RX mode control support */
 #define VIRTIO_NET_F_GUEST_ANNOUNCE 21	/* Guest can announce device on the
 					 * network */
+#define VIRTIO_NET_F_MQ	22	/* Device supports Receive Flow
+					 * Steering */
 
 #define VIRTIO_NET_S_LINK_UP	1	/* Link is up */
 #define VIRTIO_NET_S_ANNOUNCE	2	/* Announcement is needed */
@@ -60,6 +62,11 @@ struct virtio_net_config {
 	__u8 mac[6];
 	/* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* above */
 	__u16 status;
+	/* Maximum number of each of transmit and receive queues;
+	 * see VIRTIO_NET_F_MQ and VIRTIO_NET_CTRL_MQ.
+	 * Legal values are between 1 and 0x8000
+	 */
+	__u16 max_virtqueue_pairs;
 } __attribute__((packed));
 
 /* This is the first element of the scatter-gather list.  If you don't
@@ -166,4 +173,24 @@ struct virtio_net_ctrl_mac {
 #define VIRTIO_NET_CTRL_ANNOUNCE       3
  #define VIRTIO_NET_CTRL_ANNOUNCE_ACK         0
 
+/*
+ * Control Receive Flow Steering
+ *
+ * The command VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET
+ * enables Receive Flow Steering, specifying the number of the transmit and
+ * receive queues that will be used. After the command is consumed and acked by
+ * the device, the device will not steer new packets on receive virtqueues
+ * other than specified nor read from transmit virtqueues other than specified.
+ * Accordingly, driver should not transmit new packets on virtqueues other than
+ * specified.
+ */
+struct virtio_net_ctrl_mq {
+	__u16 virtqueue_pairs;
+};
+
+#define VIRTIO_NET_CTRL_MQ   4
+ #define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET        0
+ #define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN        1
+ #define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX        0x8000
+
 #endif /* _LINUX_VIRTIO_NET_H */
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next v3 3/3] virtio-net: support changing the number of queue pairs through ethtool
From: Jason Wang @ 2012-12-07 17:04 UTC (permalink / raw)
  To: mst, rusty, davem, virtualization, netdev, linux-kernel
  Cc: krkumar2, kvm, bhutchings, jwhan, shiyer
In-Reply-To: <1354899897-10423-1-git-send-email-jasowang@redhat.com>

This patch implements the ethtool_{set|get}_channels methods of virtio-net to
allow the user to change the number of queues on demand while the device runs.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/virtio_net.c |   43 +++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 43 insertions(+), 0 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index c083048..a644eeb 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1075,10 +1075,53 @@ static void virtnet_get_drvinfo(struct net_device *dev,
 
 }
 
+/* TODO: Eliminate OOO packets during switching */
+static int virtnet_set_channels(struct net_device *dev,
+				struct ethtool_channels *channels)
+{
+	struct virtnet_info *vi = netdev_priv(dev);
+	u16 queue_pairs = channels->combined_count;
+	int err;
+
+	/* We don't support separate rx/tx channels.
+	 * We don't allow setting 'other' channels.
+	 */
+	if (channels->rx_count || channels->tx_count || channels->other_count)
+		return -EINVAL;
+
+	if (queue_pairs == 0 || queue_pairs > vi->max_queue_pairs)
+		return -EINVAL;
+
+	err = virtnet_set_queues(vi, queue_pairs);
+	if (!err) {
+		netif_set_real_num_tx_queues(dev, queue_pairs);
+		netif_set_real_num_rx_queues(dev, queue_pairs);
+
+		virtnet_set_affinity(vi, true);
+	}
+
+	return err;
+}
+
+static void virtnet_get_channels(struct net_device *dev,
+				 struct ethtool_channels *channels)
+{
+	struct virtnet_info *vi = netdev_priv(dev);
+
+	channels->combined_count = vi->curr_queue_pairs;
+	channels->max_combined = vi->max_queue_pairs;
+	channels->max_other = 0;
+	channels->rx_count = 0;
+	channels->tx_count = 0;
+	channels->other_count = 0;
+}
+
 static const struct ethtool_ops virtnet_ethtool_ops = {
 	.get_drvinfo = virtnet_get_drvinfo,
 	.get_link = ethtool_op_get_link,
 	.get_ringparam = virtnet_get_ringparam,
+	.set_channels = virtnet_set_channels,
+	.get_channels = virtnet_get_channels,
 };
 
 #define MIN_MTU 68
-- 
1.7.1

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox