Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next 5/5] tipc: standardize recvmsg routine
From: Ying Xue @ 2014-01-17  1:50 UTC (permalink / raw)
  To: davem; +Cc: jon.maloy, netdev, Paul.Gortmaker, tipc-discussion
In-Reply-To: <1389923407-26969-1-git-send-email-ying.xue@windriver.com>

Standardize the behaviour of waiting for events in TIPC recvmsg()
so that all variables of socket or port structures are protected
within socket lock, allowing the process of calling recvmsg() to
be woken up at appropriate time.

Signed-off-by: Ying Xue <ying.xue@windriver.com>
Reviewed-by: Jon Maloy <jon.maloy@ericsson.com>
---
 net/tipc/socket.c |   80 ++++++++++++++++++++++++++++-------------------------
 1 file changed, 42 insertions(+), 38 deletions(-)

diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index c480310..eab17eb 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -55,9 +55,6 @@ struct tipc_sock {
 #define tipc_sk(sk) ((struct tipc_sock *)(sk))
 #define tipc_sk_port(sk) (tipc_sk(sk)->p)
 
-#define tipc_rx_ready(sock) (!skb_queue_empty(&sock->sk->sk_receive_queue) || \
-			(sock->state == SS_DISCONNECTING))
-
 static int backlog_rcv(struct sock *sk, struct sk_buff *skb);
 static u32 dispatch(struct tipc_port *tport, struct sk_buff *buf);
 static void wakeupdispatch(struct tipc_port *tport);
@@ -994,6 +991,37 @@ static int anc_data_recv(struct msghdr *m, struct tipc_msg *msg,
 	return 0;
 }
 
+static int tipc_wait_for_rcvmsg(struct socket *sock, long timeo)
+{
+	struct sock *sk = sock->sk;
+	DEFINE_WAIT(wait);
+	int err;
+
+	for (;;) {
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+		if (skb_queue_empty(&sk->sk_receive_queue)) {
+			if (sock->state == SS_DISCONNECTING) {
+				err = -ENOTCONN;
+				break;
+			}
+			release_sock(sk);
+			timeo = schedule_timeout(timeo);
+			lock_sock(sk);
+		}
+		err = 0;
+		if (!skb_queue_empty(&sk->sk_receive_queue))
+			break;
+		err = sock_intr_errno(timeo);
+		if (signal_pending(current))
+			break;
+		err = -EAGAIN;
+		if (!timeo)
+			break;
+	}
+	finish_wait(sk_sleep(sk), &wait);
+	return err;
+}
+
 /**
  * recv_msg - receive packet-oriented message
  * @iocb: (unused)
@@ -1013,7 +1041,7 @@ static int recv_msg(struct kiocb *iocb, struct socket *sock,
 	struct tipc_port *tport = tipc_sk_port(sk);
 	struct sk_buff *buf;
 	struct tipc_msg *msg;
-	long timeout;
+	long timeo;
 	unsigned int sz;
 	u32 err;
 	int res;
@@ -1029,25 +1057,13 @@ static int recv_msg(struct kiocb *iocb, struct socket *sock,
 		goto exit;
 	}
 
-	timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 restart:
 
 	/* Look for a message in receive queue; wait if necessary */
-	while (skb_queue_empty(&sk->sk_receive_queue)) {
-		if (sock->state == SS_DISCONNECTING) {
-			res = -ENOTCONN;
-			goto exit;
-		}
-		if (timeout <= 0L) {
-			res = timeout ? timeout : -EWOULDBLOCK;
-			goto exit;
-		}
-		release_sock(sk);
-		timeout = wait_event_interruptible_timeout(*sk_sleep(sk),
-							   tipc_rx_ready(sock),
-							   timeout);
-		lock_sock(sk);
-	}
+	res = tipc_wait_for_rcvmsg(sock, timeo);
+	if (res)
+		goto exit;
 
 	/* Look at first message in receive queue */
 	buf = skb_peek(&sk->sk_receive_queue);
@@ -1119,7 +1135,7 @@ static int recv_stream(struct kiocb *iocb, struct socket *sock,
 	struct tipc_port *tport = tipc_sk_port(sk);
 	struct sk_buff *buf;
 	struct tipc_msg *msg;
-	long timeout;
+	long timeo;
 	unsigned int sz;
 	int sz_to_copy, target, needed;
 	int sz_copied = 0;
@@ -1132,31 +1148,19 @@ static int recv_stream(struct kiocb *iocb, struct socket *sock,
 
 	lock_sock(sk);
 
-	if (unlikely((sock->state == SS_UNCONNECTED))) {
+	if (unlikely(sock->state == SS_UNCONNECTED)) {
 		res = -ENOTCONN;
 		goto exit;
 	}
 
 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, buf_len);
-	timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 
 restart:
 	/* Look for a message in receive queue; wait if necessary */
-	while (skb_queue_empty(&sk->sk_receive_queue)) {
-		if (sock->state == SS_DISCONNECTING) {
-			res = -ENOTCONN;
-			goto exit;
-		}
-		if (timeout <= 0L) {
-			res = timeout ? timeout : -EWOULDBLOCK;
-			goto exit;
-		}
-		release_sock(sk);
-		timeout = wait_event_interruptible_timeout(*sk_sleep(sk),
-							   tipc_rx_ready(sock),
-							   timeout);
-		lock_sock(sk);
-	}
+	res = tipc_wait_for_rcvmsg(sock, timeo);
+	if (res)
+		goto exit;
 
 	/* Look at first message in receive queue */
 	buf = skb_peek(&sk->sk_receive_queue);
-- 
1.7.9.5


------------------------------------------------------------------------------
CenturyLink Cloud: The Leader in Enterprise Cloud Services.
Learn Why More Businesses Are Choosing CenturyLink Cloud For
Critical Workloads, Development Environments & Everything In Between.
Get a Quote or Start a Free Trial Today. 
http://pubads.g.doubleclick.net/gampad/clk?id=119420431&iu=/4140/ostg.clktrk

^ permalink raw reply related

* [PATCH -next] net_sched: fix error return code in fw_change_attrs()
From: Wei Yongjun @ 2014-01-17  1:53 UTC (permalink / raw)
  To: jhs, davem, xiyou.wangcong; +Cc: yongjun_wei, netdev

From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>

The error code was not set if change indev fail, so the error
condition wasn't reflected in the return value. Fix to return a
negative error code from this error handling case instead of 0.

Fixes: 2519a602c273 ('net_sched: optimize tcf_match_indev()')
Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
---
 net/sched/cls_fw.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index ed00e8c..a366537 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -209,8 +209,10 @@ fw_change_attrs(struct net *net, struct tcf_proto *tp, struct fw_filter *f,
 	if (tb[TCA_FW_INDEV]) {
 		int ret;
 		ret = tcf_change_indev(net, tb[TCA_FW_INDEV]);
-		if (ret < 0)
+		if (ret < 0) {
+			err = ret;
 			goto errout;
+		}
 		f->ifindex = ret;
 	}
 #endif /* CONFIG_NET_CLS_IND */

^ permalink raw reply related

* Re: [PATCH net-next v2 2/2] bonding: try to enable SG features when adding a new, slave
From: Ding Tianhong @ 2014-01-17  1:59 UTC (permalink / raw)
  To: David Miller; +Cc: fubar, vfalico, edumazet, netdev
In-Reply-To: <20140116.160721.427996343348396551.davem@davemloft.net>

On 2014/1/17 8:07, David Miller wrote:
> From: Ding Tianhong <dingtianhong@huawei.com>
> Date: Tue, 14 Jan 2014 17:00:35 +0800
> 
>> The commit b0ce3508(bonding: allow TSO being set on bonding master)
>> has make the TSO being set for bond dev, but in some situation, if
>> the slave did not set the NETIF_F_SG features yet, the bond master
>> will miss the TSO features in netdev_fix_features because the TSO is
>> depended on SG.
>>
>> If the slave hw support SG features, but not set yet, I will try to
>> open it when enslave the dev, better for performance.
>>
>> Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
> 
> I really don't think we should force enable device features in slaves
> that perhaps the user intentionally disabled, or perhaps the driver has
> a reason to disable by default (lower performance, etc.)
> 
> I'm not applying this series, sorry.
> 
> 
Yes, pls miss this one, after the discussion with Veaceslav, I think it is not
reasonable to do this. thanks.

Ding

^ permalink raw reply

* Re: [PATCH] ipv6: don't call addrconf_dst_alloc again when enable lo
From: chenweilong @ 2014-01-17  2:02 UTC (permalink / raw)
  To: Hannes Frederic Sowa; +Cc: Gao feng, David Miller, kumaran.4353, netdev
In-Reply-To: <20140108085554.GF9007@order.stressinduktion.org>

On 2014/1/8 16:55, Hannes Frederic Sowa wrote:
> On Wed, Jan 08, 2014 at 04:42:46PM +0800, Gao feng wrote:
>> On 01/08/2014 04:05 PM, Hannes Frederic Sowa wrote:
>>> On Wed, Jan 08, 2014 at 03:50:09PM +0800, Gao feng wrote:
>>>> On 01/03/2014 02:53 PM, Hannes Frederic Sowa wrote:
>>>>> On Thu, Jan 02, 2014 at 05:33:15PM +0800, chenweilong wrote:
>>>>>> diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
>>>>>> index 62d1799..d2f8c0a 100644
>>>>>> --- a/net/ipv6/addrconf.c
>>>>>> +++ b/net/ipv6/addrconf.c
>>>>>> @@ -2422,8 +2422,9 @@ static void init_loopback(struct net_device *dev)
>>>>>>  			if (sp_ifa->flags & (IFA_F_DADFAILED | IFA_F_TENTATIVE))
>>>>>>  				continue;
>>>>>>
>>>>>> -			if (sp_ifa->rt)
>>>>>> -				continue;
>>>>>> +			if (sp_ifa->rt && sp_ifa->rt->dst.dev == dev) {
>>>>>> +				ip6_del_rt(sp_ifa->rt);
>>>>>> +			}
>>>>>>
>>>>>>  			sp_rt = addrconf_dst_alloc(idev, &sp_ifa->addr, 0);
>>>>>>
>>>>>
>>>>> Maybe this change would not be that bad after all, as those ifa attached dsts
>>>>> are already dead and queued up for gc and should not get inserted back.
>>>>
>>>> I like this idea, maybe the below patch is better. we only need to delete this
>>>> route when it has been added to garbage list.
>>>>
>>>> diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
>>>> index 1a341f7..4dca886 100644
>>>> --- a/net/ipv6/addrconf.c
>>>> +++ b/net/ipv6/addrconf.c
>>>> @@ -2610,8 +2610,16 @@ static void init_loopback(struct net_device *dev)
>>>>                         if (sp_ifa->flags & (IFA_F_DADFAILED | IFA_F_TENTATIVE))
>>>>                                 continue;
>>>>
>>>> -                       if (sp_ifa->rt)
>>>> -                               continue;
>>>> +                       if (sp_ifa->rt) {
>>>> +                               /* This dst has been added to garbage list when
>>>> +                                * lo device down, delete this obsolete dst and
>>>> +                                * reallocate new router for ifa. */
>>>> +                               if (sp_ifa->rt->dst.obsolete > 0) {
>>>> +                                       ip6_del_rt(sp_ifa->rt);
>>>> +                                       sp_ifa->rt = NULL;
>>>> +                               } else
>>>> +                                       continue;
>>>> +                       }
>>>>
>>>>                         sp_rt = addrconf_dst_alloc(idev, &sp_ifa->addr, false);
>>>
>>> It looks like it can work but I don't know if we should just fix this the
>>> clean way (see below).
>>>
>>>>> I'll try to just disable routes without removing them at all when we set an
>>>>> interface to down at the weekend.
>>>>>
>>>>
>>>> How do you decide which route should be disabled?  use rt6_flags? I don't know
>>>> if your way will cause miscarriage.
>>>
>>> What I did so far is that I added a new function next to rt6_ifdown that
>>> only gets called if interface gets shutdown but not unregistered (from
>>> addrconf_ifdown).
>>>
>>
>> rt6_ifdown has alreay put this device related routes to the garbage list.
>>
>>> fib6_clean_all then iterates over the whole routing table with a new predicate
>>> function which checks in the same way like fib6_ifdown, if it is a matching route
>>> (the interfaces match up) and if so, toggles a new "DEAD" flag in rt6i_flags.
>>>
>>> When bringing up the interface I distinguish between up and register and just
>>> clear this death flag from the routes on bringing it up.
>>>
>>> fib lookup code then does not honour those routes.
>>>
>>> I had no time to test this thoroughly at the weekend and still have some code
>>> paths were I am unsure. Do you see any problems with this so far? We could
>>> then delete the special cases on loopback interface init.
>>
>> So you add a special case in the general network interface down logic. I think this
>> is more complex...
> 
> The problem is, that we only have a reference to ifp->rt, the loopback
> RTF_LOCAL route. Currently we silently remove all other routes this interface
> has. Sure, they come back soon with RAs and such, but this is not the way this
> should work.
> 
> Greetings,
> 
>   Hannes
> 
> 
> .
> 

Hi,

It's quite a long time, How's your patch going on?

^ permalink raw reply

* Re: PANIC in vxlan <debugging now>
From: Jesse Brandeburg @ 2014-01-17  2:03 UTC (permalink / raw)
  To: netdev; +Cc: Jesse Brandeburg, dborkman
In-Reply-To: <20140116171428.00004da1@unknown>

+dborkman@redhat.com and left the full text of the message for him to
see.  Bad commit below.

On Thu, 16 Jan 2014 17:14:28 -0800
Jesse Brandeburg <jesse.brandeburg@intel.com> wrote:

> I'm currently debugging this but given where the kernel release cycle
> is I wanted to let the list know.
> 
> It may well be a bug in our code, and if it is we'll find it, but here is 
> the panic, it doesn't occur when vxlan is not enabled.
> 
> Jan 16 13:46:44 jbrandeb-cp2 kernel: [   17.331010] cgroup: libvirtd (1387) created nested cgroup for controller "memory" which has incomplete hierarchy supp
> ort. Nested cgroups may change behavior in the future.
> Jan 16 13:46:44 jbrandeb-cp2 kernel: [   17.331014] cgroup: "memory" requires setting use_hierarchy to 1 on the root.
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.576568] ------------[ cut here ]------------
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.586411] kernel BUG at include/net/netns/generic.h:45!
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.596336] invalid opcode: 0000 [#1] SMP
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.606268] Modules linked in: lockd sunrpc i40e igb iTCO_wdt iTCO_vendor_support sb_edac ioatdma ptp microcode lpc_ich edac_core i2c_i801 mfd_core dca pps_core wmi kvm uinput isci firewire_ohci libsas firewire_core crc_itu_t scsi_transport_sas mgag200 drm_kms_helper ttm
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.637923] CPU: 0 PID: 1387 Comm: libvirtd Not tainted 3.13.0-rc7+ #30
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.648599] Hardware name: Intel Corporation S2600CO ........../S2600CO, BIOS SE5C600.86B.01.08.6003.062420131549 06/24/2013
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.659612] task: ffff88063b5c6000 ti: ffff8806333ca000 task.ti: ffff8806333ca000
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.670661] RIP: 0010:[<ffffffff816df92f>]  [<ffffffff816df92f>] net_generic.isra.34.part.35+0x4/0x6
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.681738] RSP: 0018:ffff8806333cbb80  EFLAGS: 00010246
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.692536] RAX: 0000000000000000 RBX: 00000000ffffffed RCX: 0000000000000010
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.703577] RDX: ffff88063d03d380 RSI: 0000000000000010 RDI: ffffffff81cfd9f0
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.714612] RBP: ffff8806333cbb80 R08: 0000000000000000 R09: ffffffff81cfd9f0
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.725531] R10: 00000000000002cc R11: 0000000000000004 R12: 0000000000000000
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.736448] R13: ffff880639118000 R14: ffff8806333cbc68 R15: 0000000000000000
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.747292] FS:  00007f6381830700(0000) GS:ffff880647600000(0000) knlGS:0000000000000000
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.758248] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.769263] CR2: 00007f637c04b000 CR3: 0000000c3aa1f000 CR4: 00000000000407f0
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.780402] Stack:
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.791386]  ffff8806333cbbc0 ffffffff814d0865 ffff8806333cbc40 00000000ffffffef
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.802702]  00000000ffffffed ffffffff81cc67d0 0000000000000010 ffff8806333cbc68
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.814021]  ffff8806333cbc00 ffffffff816e9e5d 0000000000000004 ffff8806333cbc68
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.825185] Call Trace:
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.836106]  [<ffffffff814d0865>] vxlan_lowerdev_event+0xf5/0x100
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.847254]  [<ffffffff816e9e5d>] notifier_call_chain+0x4d/0x70
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.858457]  [<ffffffff810912be>] __raw_notifier_call_chain+0xe/0x10
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.869696]  [<ffffffff810912d6>] raw_notifier_call_chain+0x16/0x20
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.880896]  [<ffffffff815d9610>] call_netdevice_notifiers_info+0x40/0x70
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.892063]  [<ffffffff815d9656>] call_netdevice_notifiers+0x16/0x20
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.903107]  [<ffffffff815e1bce>] register_netdevice+0x1be/0x3a0
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.914128]  [<ffffffff815e1dce>] register_netdev+0x1e/0x30
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.925072]  [<ffffffff814cb94a>] loopback_net_init+0x4a/0xb0
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.936048]  [<ffffffffa016ed6e>] ? lockd_init_net+0x6e/0xb0 [lockd]
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.947081]  [<ffffffff815d6bac>] ops_init+0x4c/0x150
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.958070]  [<ffffffff815d6d23>] setup_net+0x73/0x110
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.969006]  [<ffffffff815d725b>] copy_net_ns+0x7b/0x100
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.979897]  [<ffffffff81090e11>] create_new_namespaces+0x101/0x1b0
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.990855]  [<ffffffff81090f45>] copy_namespaces+0x85/0xb0
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.001656]  [<ffffffff810693d5>] copy_process.part.26+0x935/0x1500
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.012370]  [<ffffffff811d5186>] ? mntput+0x26/0x40
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.022924]  [<ffffffff8106a15c>] do_fork+0xbc/0x2e0
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.033331]  [<ffffffff811b7f2e>] ? ____fput+0xe/0x10
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.043622]  [<ffffffff81089c5c>] ? task_work_run+0xac/0xe0
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.053905]  [<ffffffff8106a406>] SyS_clone+0x16/0x20
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.064265]  [<ffffffff816ee689>] stub_clone+0x69/0x90
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.074600]  [<ffffffff816ee329>] ? system_call_fastpath+0x16/0x1b
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.084879] Code: 00 75 1d 55 be 2f 00 00 00 48 c7 c7 65 93 a2 81 48 89 e5 e8 f4 b5 98 ff 5d c6 05 30 aa 5f 00 01 c3 55 48 89 e5 0f 0b 55 48 89 e5 <0f> 0b 55 48 89 e5 0f 0b 66 66 66 66 90 55 48 c7 c7 c0 4c cb 81
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.105818] RIP  [<ffffffff816df92f>] net_generic.isra.34.part.35+0x4/0x6
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.116106]  RSP <ffff8806333cbb80>
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.172366] ---[ end trace 0bb84cf9aa76a384 ]---
> Jan 16 13:46:47 jbrandeb-cp2 systemd[1]: Startup finished in 4s 918ms 164us (kernel) + 3s 548ms 460us (initrd) + 11s 2ms 474us (userspace) = 19s 469ms 98us.
> Jan 16 13:46:47 jbrandeb-cp2 dbus-daemon[989]: dbus[989]: [system] Activating via systemd: service name='org.freedesktop.Accounts' unit='accounts-daemon.service'
> 
> code says:
> (gdb) l *(vxlan_lowerdev_event+0xf5)
> 0xffffffff814d0865 is at include/net/netns/generic.h:41.
> 34      static inline void *net_generic(const struct net *net, int id)
> 35      {
> 36              struct net_generic *ng;
> 37              void *ptr;
> 38
> 39              rcu_read_lock();
> 40              ng = rcu_dereference(net->gen);
> 41              BUG_ON(id == 0 || id > ng->len);
> 42              ptr = ng->ptr[id - 1];
> 43              rcu_read_unlock();
> 44
> >>>> 45              BUG_ON(!ptr);
> 46              return ptr;
> 47      }
> 48      #endif
> 


It appears that the bug is in acaf4e70997f (net: vxlan: when lower dev
unregisters remove vxlan dev as well).

reverting that patch avoids the panic.  I wasn't able to see
immediately what was wrong in the patch.
-- 
Jesse

^ permalink raw reply

* Linux kernel patch: elide fib_validate_source() completely when possible - bad side effect?
From: gregory hoggarth @ 2014-01-17  1:59 UTC (permalink / raw)
  To: netdev

Hi,

I work for Allied Telesis, a company that manufactures and sells routers and switches.

Earlier in the year we upgraded the linux kernel on our L3 switch firmware to v3.6.2 which includes this patch. Our testers have picked up a change in behaviour which I have traced back to this patch and I'd like to get comments on whether this is a bug or not.

We found that in a configuration with a device under test (DUT) that has an IP address 192.168.0.1 / 24, sending an ICMP echo request with a source IP address of 192.168.0.255 will now be accepted by the DUT, which will send an ICMP echo reply back to the 192.168.0.255 address with a broadcast MAC address of FFFF.FFFF.FFFF, meaning any other devices in the L2 domain will pick up and process the ICMP reply.

Prior to the upgrade, our device was running on linux kernel 2.6.38.2, and in that version the DUT would reject the packet with a martian source address error:
11:20:54 awplus kernel: RxPkt(00001) Q:1 dev:8 lport:00000a00 cpuC:154 encap:0 len:98 bufs:1
11:20:54 awplus kernel: RxPkt(00001) vid:1 port2.0.23
11:20:54 awplus kernel: <7>   0000cd29 980d0000 cd27c1eb 08004500 
11:20:54 awplus kernel: <7>   00540000 40004001 b858c0a8 00ffc0a8 
11:20:54 awplus kernel: <7>   00010800 a4cf588c 000552dd c9990000 
11:20:54 awplus kernel: <7>   f3240809 0a0b0c0d 0e0f1011 12131415 
11:20:54 awplus kernel: <7>   (...)
11:20:54 awplus kernel: martian source 192.168.0.1 from 192.168.0.255, on dev vlan1
11:20:54 awplus kernel: ll header: 00:00:cd:29:98:0d:00:00:cd:27:c1:eb:08:00:45:00


Initially we were thinking that a device configured with 192.168.0.255 / 23 sending a ping to 192.168.0.1 / 24 should expect to get a reply, even though this is somewhat of a misconfiguration in that the subnets aren't matching and that some things may not work properly. For example ping from a device with IP address 192.168.0.254 would behave correctly, so it seemed correct that a ping from 192.168.0.255 should also work properly. However it appears that some other part of the kernel is detecting that the echo reply packet sent by the DUT has a destination IP address of the subnet broadcast, so instead of sending it with a unicast MAC address it sends it with the broadcast MAC address, which means all other devices on the subnet would process the ICMP echo reply. This therefore makes it s
 eem like the ICMP echo request should have been dropped in the first place.

The patch added the "if (!r && !fib_num_tclassid_users)" check to fib_validate_source, which for our devices evaluates to true, hence returning early and the source address check inside __fib_validate_source is not carried out.


Does anyone have comments as to what the correct behaviour should be in this case, and whether this is an unexpected side-effect from the patch or not?

Thanks,
Greg

^ permalink raw reply

* Re: [PATCH net-next v2 3/3] reciprocal_divide: correction/update of the algorithm
From: Eric Dumazet @ 2014-01-17  2:33 UTC (permalink / raw)
  To: Hannes Frederic Sowa
  Cc: netdev, Austin S Hemmelgarn, linux-kernel, Jesse Gross,
	Jamal Hadi Salim, Stephen Hemminger, Matt Mackall, Pekka Enberg,
	Christoph Lameter, Andy Gospodarek, Veaceslav Falico,
	Jay Vosburgh, Jakub Zawadzki, Daniel Borkmann
In-Reply-To: <1389918519-23779-4-git-send-email-hannes@stressinduktion.org>

On Fri, 2014-01-17 at 01:28 +0100, Hannes Frederic Sowa wrote:
> Jakub Zawadzki noticed that some divisions by reciprocal_divide()
> were not correct [1][2], which he could also show with BPF code after
> divisions are transformed into reciprocal_value() for runtime invariant
> which can be passed to reciprocal_divide() later on; reverse in BPF dump
> ended up with a different, off-by-one K.
> 
> This has been fixed by Eric Dumazet in commit aee636c4809fa5 ("bpf: do not
> use reciprocal divide"). This follow-up patch improves reciprocal_value()
> and reciprocal_divide() to work in all cases, so future use is safe.
> 
> Known problems with the old implementation were that division by 1 always
> returned 0 and some off-by-ones when the dividend and divisor where
> very large.  This seemed to not be problematic with its current users
> in networking, mm/slab.c and lib/flex_array.c, but future users would
> need to check for this specifically and it might not be obvious at first.
> 
> In order to fix that, we propose an extension from the original
> implementation from commit 6a2d7a955d8d resp. [3][4], by using
> the algorithm proposed in "Division by Invariant Integers Using
> Multiplication" [5], Torbjörn Granlund and Peter L. Montgomery, that is,
> pseudocode for q = n/d where q,n,d is in u32 universe:
> 
> 1) Initialization:
> 
>   int l = ceil(log_2 d)
>   uword m' = floor((1<<32)*((1<<l)-d)/d)+1
>   int sh_1 = min(l,1)
>   int sh_2 = max(l-1,0)
> 
> 2) For q = n/d, all uword:
> 
>   uword t = (n*m')>>32
>   q = (t+((n-t)>>sh_1))>>sh_2
> 
> The assembler implementation from Agner Fog [6] also helped a lot
> while implementing. We have tested the implementation on x86_64,
> ppc64, i686, s390x; on x86_64/haswell we're still half the latency
> compared to normal divide.
> 
> Joint work with Daniel Borkmann.
> 
>   [1] http://www.wireshark.org/~darkjames/reciprocal-buggy.c
>   [2] http://www.wireshark.org/~darkjames/set-and-dump-filter-k-bug.c
>   [3] https://gmplib.org/~tege/division-paper.pdf
>   [4] http://homepage.cs.uiowa.edu/~jones/bcd/divide.html
>   [5] http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556
>   [6] http://www.agner.org/optimize/asmlib.zip
> 
> Fixes: 6a2d7a955d8d ("SLAB: use a multiply instead of a divide in obj_to_index()")


I already demonstrated this slab patch was fine.

The current algo works well (no off-by-one error) when the dividend is
a multiple of the divisor.

You are adding extra overhead, while we know its not necessary.

By using "Fixes: ... " you are asking a backport to stable branches,
which seems really silly in this case, especially with this monolithic
patch changing 12 files in different subsystems.

If you believe flex_array has a problem, please fix flex_array only,
by a small patch (Maybe a revert ?)

Then, introduce your new helpers if we really think they are needed.

^ permalink raw reply

* [PATCH net-next v5 0/6] virtio-net: mergeable rx buffer size auto-tuning
From: Michael Dalton @ 2014-01-17  2:36 UTC (permalink / raw)
  To: David S. Miller
  Cc: Michael Dalton, Michael S. Tsirkin, netdev, virtualization,
	Eric Dumazet, Ben Hutchings

The virtio-net device currently uses aligned MTU-sized mergeable receive
packet buffers. Network throughput for workloads with large average
packet size can be improved by posting larger receive packet buffers.
However, due to SKB truesize effects, posting large (e.g, PAGE_SIZE)
buffers reduces the throughput of workloads that do not benefit from GRO
and have no large inbound packets.

This patchset introduces virtio-net mergeable buffer size auto-tuning,
with buffer sizes ranging from aligned MTU-size to PAGE_SIZE. Packet
buffer size is chosen based on a per-receive queue EWMA of incoming
packet size.

To unify mergeable receive buffer memory allocation and improve
SKB frag coalescing, all mergeable buffer memory allocation is
migrated to per-receive queue page frag allocators.

The per-receive queue mergeable packet buffer size is exported via
sysfs, and the network device sysfs layer has been extended to add
support for device-specific per-receive queue sysfs attribute groups.

Michael Dalton (6):
  net: allow > 0 order atomic page alloc in skb_page_frag_refill
  virtio-net: use per-receive queue page frag alloc for mergeable bufs
  virtio-net: auto-tune mergeable rx buffer size for improved
    performance
  net-sysfs: add support for device-specific rx queue sysfs attributes
  lib: Ensure EWMA does not store wrong intermediate values
  virtio-net: initial rx sysfs support, export mergeable rx buffer size

 drivers/net/virtio_net.c  | 196 +++++++++++++++++++++++++++++++++-------------
 include/linux/netdevice.h |  35 ++++++++-
 lib/average.c             |   6 +-
 net/core/dev.c            |  12 +--
 net/core/net-sysfs.c      |  50 +++++++-----
 net/core/sock.c           |   4 +-
 6 files changed, 213 insertions(+), 90 deletions(-)

-- 
1.8.5.2

^ permalink raw reply

* [PATCH net-next v5 1/6] net: allow > 0 order atomic page alloc in skb_page_frag_refill
From: Michael Dalton @ 2014-01-17  2:36 UTC (permalink / raw)
  To: David S. Miller
  Cc: Michael Dalton, Michael S. Tsirkin, netdev, virtualization,
	Eric Dumazet, Ben Hutchings
In-Reply-To: <1389926176-6699-1-git-send-email-mwdalton@google.com>

skb_page_frag_refill currently permits only order-0 page allocs
unless GFP_WAIT is used. Change skb_page_frag_refill to attempt
higher-order page allocations whether or not GFP_WAIT is used. If
memory cannot be allocated, the allocator will fall back to
successively smaller page allocs (down to order-0 page allocs).

This change brings skb_page_frag_refill in line with the existing
page allocation strategy employed by netdev_alloc_frag, which attempts
higher-order page allocations whether or not GFP_WAIT is set, falling
back to successively lower-order page allocations on failure. Part
of migration of virtio-net to per-receive queue page frag allocators.

Acked-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Michael Dalton <mwdalton@google.com>
---
 net/core/sock.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index 85ad6f0..b3f7ee3 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1836,9 +1836,7 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio)
 		put_page(pfrag->page);
 	}
 
-	/* We restrict high order allocations to users that can afford to wait */
-	order = (prio & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
-
+	order = SKB_FRAG_PAGE_ORDER;
 	do {
 		gfp_t gfp = prio;
 
-- 
1.8.5.2

^ permalink raw reply related

* [PATCH net-next v5 6/6] virtio-net: initial rx sysfs support, export mergeable rx buffer size
From: Michael Dalton @ 2014-01-17  2:36 UTC (permalink / raw)
  To: David S. Miller
  Cc: Michael Dalton, Michael S. Tsirkin, netdev, virtualization,
	Eric Dumazet, Ben Hutchings
In-Reply-To: <1389926176-6699-1-git-send-email-mwdalton@google.com>

Add initial support for per-rx queue sysfs attributes to virtio-net. If
mergeable packet buffers are enabled, adds a read-only mergeable packet
buffer size sysfs attribute for each RX queue.

Suggested-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael Dalton <mwdalton@google.com>
---
v3->v4: Remove seqcount due to EWMA changes in patch 5.
        Add missing Suggested-By. 

 drivers/net/virtio_net.c | 46 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 42 insertions(+), 4 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 3e82311..968eacd 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -604,18 +604,25 @@ static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp)
 	return err;
 }
 
-static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
+static unsigned int get_mergeable_buf_len(struct ewma *avg_pkt_len)
 {
 	const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+	unsigned int len;
+
+	len = hdr_len + clamp_t(unsigned int, ewma_read(avg_pkt_len),
+			GOOD_PACKET_LEN, PAGE_SIZE - hdr_len);
+	return ALIGN(len, MERGEABLE_BUFFER_ALIGN);
+}
+
+static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
+{
 	struct page_frag *alloc_frag = &rq->alloc_frag;
 	char *buf;
 	unsigned long ctx;
 	int err;
 	unsigned int len, hole;
 
-	len = hdr_len + clamp_t(unsigned int, ewma_read(&rq->mrg_avg_pkt_len),
-				GOOD_PACKET_LEN, PAGE_SIZE - hdr_len);
-	len = ALIGN(len, MERGEABLE_BUFFER_ALIGN);
+	len = get_mergeable_buf_len(&rq->mrg_avg_pkt_len);
 	if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
 		return -ENOMEM;
 
@@ -1594,6 +1601,33 @@ err:
 	return ret;
 }
 
+#ifdef CONFIG_SYSFS
+static ssize_t mergeable_rx_buffer_size_show(struct netdev_rx_queue *queue,
+		struct rx_queue_attribute *attribute, char *buf)
+{
+	struct virtnet_info *vi = netdev_priv(queue->dev);
+	unsigned int queue_index = get_netdev_rx_queue_index(queue);
+	struct ewma *avg;
+
+	BUG_ON(queue_index >= vi->max_queue_pairs);
+	avg = &vi->rq[queue_index].mrg_avg_pkt_len;
+	return sprintf(buf, "%u\n", get_mergeable_buf_len(avg));
+}
+
+static struct rx_queue_attribute mergeable_rx_buffer_size_attribute =
+	__ATTR_RO(mergeable_rx_buffer_size);
+
+static struct attribute *virtio_net_mrg_rx_attrs[] = {
+	&mergeable_rx_buffer_size_attribute.attr,
+	NULL
+};
+
+static const struct attribute_group virtio_net_mrg_rx_group = {
+	.name = "virtio_net",
+	.attrs = virtio_net_mrg_rx_attrs
+};
+#endif
+
 static int virtnet_probe(struct virtio_device *vdev)
 {
 	int i, err;
@@ -1708,6 +1742,10 @@ static int virtnet_probe(struct virtio_device *vdev)
 	if (err)
 		goto free_stats;
 
+#ifdef CONFIG_SYSFS
+	if (vi->mergeable_rx_bufs)
+		dev->sysfs_rx_queue_group = &virtio_net_mrg_rx_group;
+#endif
 	netif_set_real_num_tx_queues(dev, vi->curr_queue_pairs);
 	netif_set_real_num_rx_queues(dev, vi->curr_queue_pairs);
 
-- 
1.8.5.2

^ permalink raw reply related

* [PATCH net-next v5 2/6] virtio-net: use per-receive queue page frag alloc for mergeable bufs
From: Michael Dalton @ 2014-01-17  2:36 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Eric Dumazet, Rusty Russell, Michael S. Tsirkin,
	Jason Wang, Ben Hutchings, virtualization, Michael Dalton
In-Reply-To: <1389926176-6699-1-git-send-email-mwdalton@google.com>

The virtio-net driver currently uses netdev_alloc_frag() for GFP_ATOMIC
mergeable rx buffer allocations. This commit migrates virtio-net to use
per-receive queue page frags for GFP_ATOMIC allocation. This change unifies
mergeable rx buffer memory allocation, which now will use skb_refill_frag()
for both atomic and GFP-WAIT buffer allocations.

To address fragmentation concerns, if after buffer allocation there
is too little space left in the page frag to allocate a subsequent
buffer, the remaining space is added to the current allocated buffer
so that the remaining space can be used to store packet data.

Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael Dalton <mwdalton@google.com>
---
v1->v2: Use GFP_COLD for RX buffer allocations (as in netdev_alloc_frag()).
        Remove per-netdev GFP_KERNEL page_frag allocator.

 drivers/net/virtio_net.c | 69 ++++++++++++++++++++++++------------------------
 1 file changed, 35 insertions(+), 34 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 7b17240..36cbf06 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -78,6 +78,9 @@ struct receive_queue {
 	/* Chain pages by the private ptr. */
 	struct page *pages;
 
+	/* Page frag for packet buffer allocation. */
+	struct page_frag alloc_frag;
+
 	/* RX: fragments + linear part + virtio header */
 	struct scatterlist sg[MAX_SKB_FRAGS + 2];
 
@@ -126,11 +129,6 @@ struct virtnet_info {
 	/* Lock for config space updates */
 	struct mutex config_lock;
 
-	/* Page_frag for GFP_KERNEL packet buffer allocation when we run
-	 * low on memory.
-	 */
-	struct page_frag alloc_frag;
-
 	/* Does the affinity hint is set for virtqueues? */
 	bool affinity_hint_set;
 
@@ -336,8 +334,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 	int num_buf = hdr->mhdr.num_buffers;
 	struct page *page = virt_to_head_page(buf);
 	int offset = buf - page_address(page);
-	struct sk_buff *head_skb = page_to_skb(rq, page, offset, len,
-					       MERGE_BUFFER_LEN);
+	unsigned int truesize = max_t(unsigned int, len, MERGE_BUFFER_LEN);
+	struct sk_buff *head_skb = page_to_skb(rq, page, offset, len, truesize);
 	struct sk_buff *curr_skb = head_skb;
 
 	if (unlikely(!curr_skb))
@@ -353,11 +351,6 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 			dev->stats.rx_length_errors++;
 			goto err_buf;
 		}
-		if (unlikely(len > MERGE_BUFFER_LEN)) {
-			pr_debug("%s: rx error: merge buffer too long\n",
-				 dev->name);
-			len = MERGE_BUFFER_LEN;
-		}
 
 		page = virt_to_head_page(buf);
 		--rq->num;
@@ -376,19 +369,20 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 			head_skb->truesize += nskb->truesize;
 			num_skb_frags = 0;
 		}
+		truesize = max_t(unsigned int, len, MERGE_BUFFER_LEN);
 		if (curr_skb != head_skb) {
 			head_skb->data_len += len;
 			head_skb->len += len;
-			head_skb->truesize += MERGE_BUFFER_LEN;
+			head_skb->truesize += truesize;
 		}
 		offset = buf - page_address(page);
 		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
 			put_page(page);
 			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
-					     len, MERGE_BUFFER_LEN);
+					     len, truesize);
 		} else {
 			skb_add_rx_frag(curr_skb, num_skb_frags, page,
-					offset, len, MERGE_BUFFER_LEN);
+					offset, len, truesize);
 		}
 	}
 
@@ -578,25 +572,24 @@ static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp)
 
 static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
 {
-	struct virtnet_info *vi = rq->vq->vdev->priv;
-	char *buf = NULL;
+	struct page_frag *alloc_frag = &rq->alloc_frag;
+	char *buf;
 	int err;
+	unsigned int len, hole;
 
-	if (gfp & __GFP_WAIT) {
-		if (skb_page_frag_refill(MERGE_BUFFER_LEN, &vi->alloc_frag,
-					 gfp)) {
-			buf = (char *)page_address(vi->alloc_frag.page) +
-			      vi->alloc_frag.offset;
-			get_page(vi->alloc_frag.page);
-			vi->alloc_frag.offset += MERGE_BUFFER_LEN;
-		}
-	} else {
-		buf = netdev_alloc_frag(MERGE_BUFFER_LEN);
-	}
-	if (!buf)
+	if (unlikely(!skb_page_frag_refill(MERGE_BUFFER_LEN, alloc_frag, gfp)))
 		return -ENOMEM;
+	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
+	get_page(alloc_frag->page);
+	len = MERGE_BUFFER_LEN;
+	alloc_frag->offset += len;
+	hole = alloc_frag->size - alloc_frag->offset;
+	if (hole < MERGE_BUFFER_LEN) {
+		len += hole;
+		alloc_frag->offset += hole;
+	}
 
-	sg_init_one(rq->sg, buf, MERGE_BUFFER_LEN);
+	sg_init_one(rq->sg, buf, len);
 	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, buf, gfp);
 	if (err < 0)
 		put_page(virt_to_head_page(buf));
@@ -617,6 +610,7 @@ static bool try_fill_recv(struct receive_queue *rq, gfp_t gfp)
 	int err;
 	bool oom;
 
+	gfp |= __GFP_COLD;
 	do {
 		if (vi->mergeable_rx_bufs)
 			err = add_recvbuf_mergeable(rq, gfp);
@@ -1377,6 +1371,14 @@ static void free_receive_bufs(struct virtnet_info *vi)
 	}
 }
 
+static void free_receive_page_frags(struct virtnet_info *vi)
+{
+	int i;
+	for (i = 0; i < vi->max_queue_pairs; i++)
+		if (vi->rq[i].alloc_frag.page)
+			put_page(vi->rq[i].alloc_frag.page);
+}
+
 static void free_unused_bufs(struct virtnet_info *vi)
 {
 	void *buf;
@@ -1705,9 +1707,8 @@ free_recv_bufs:
 	unregister_netdev(dev);
 free_vqs:
 	cancel_delayed_work_sync(&vi->refill);
+	free_receive_page_frags(vi);
 	virtnet_del_vqs(vi);
-	if (vi->alloc_frag.page)
-		put_page(vi->alloc_frag.page);
 free_stats:
 	free_percpu(vi->stats);
 free:
@@ -1724,6 +1725,8 @@ static void remove_vq_common(struct virtnet_info *vi)
 
 	free_receive_bufs(vi);
 
+	free_receive_page_frags(vi);
+
 	virtnet_del_vqs(vi);
 }
 
@@ -1741,8 +1744,6 @@ static void virtnet_remove(struct virtio_device *vdev)
 	unregister_netdev(vi->dev);
 
 	remove_vq_common(vi);
-	if (vi->alloc_frag.page)
-		put_page(vi->alloc_frag.page);
 
 	flush_work(&vi->config_work);
 
-- 
1.8.5.2

^ permalink raw reply related

* [PATCH net-next v5 3/6] virtio-net: auto-tune mergeable rx buffer size for improved performance
From: Michael Dalton @ 2014-01-17  2:36 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Eric Dumazet, Rusty Russell, Michael S. Tsirkin,
	Jason Wang, Ben Hutchings, virtualization, Michael Dalton
In-Reply-To: <1389926176-6699-1-git-send-email-mwdalton@google.com>

Commit 2613af0ed18a ("virtio_net: migrate mergeable rx buffers to page frag
allocators") changed the mergeable receive buffer size from PAGE_SIZE to
MTU-size, introducing a single-stream regression for benchmarks with large
average packet size. There is no single optimal buffer size for all
workloads.  For workloads with packet size <= MTU bytes, MTU + virtio-net
header-sized buffers are preferred as larger buffers reduce the TCP window
due to SKB truesize. However, single-stream workloads with large average
packet sizes have higher throughput if larger (e.g., PAGE_SIZE) buffers
are used.

This commit auto-tunes the mergeable receiver buffer packet size by
choosing the packet buffer size based on an EWMA of the recent packet
sizes for the receive queue. Packet buffer sizes range from MTU_SIZE +
virtio-net header len to PAGE_SIZE. This improves throughput for
large packet workloads, as any workload with average packet size >=
PAGE_SIZE will use PAGE_SIZE buffers.

These optimizations interact positively with recent commit
ba275241030c ("virtio-net: coalesce rx frags when possible during rx"),
which coalesces adjacent RX SKB fragments in virtio_net. The coalescing
optimizations benefit buffers of any size.

Benchmarks taken from an average of 5 netperf 30-second TCP_STREAM runs
between two QEMU VMs on a single physical machine. Each VM has two VCPUs
with all offloads & vhost enabled. All VMs and vhost threads run in a
single 4 CPU cgroup cpuset, using cgroups to ensure that other processes
in the system will not be scheduled on the benchmark CPUs. Trunk includes
SKB rx frag coalescing.

net-next w/ virtio_net before 2613af0ed18a (PAGE_SIZE bufs): 14642.85Gb/s
net-next (MTU-size bufs):  13170.01Gb/s
net-next + auto-tune: 14555.94Gb/s

Jason Wang also reported a throughput increase on mlx4 from 22Gb/s
using MTU-sized buffers to about 26Gb/s using auto-tuning.

Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael Dalton <mwdalton@google.com>
---
v2->v3: Remove per-receive queue metadata ring. Encode packet buffer
        base address and truesize into an unsigned long by requiring a
        minimum packet size alignment of 256. Permit attempts to fill
        an already-full RX ring (reverting the change in v2).
v1->v2: Add per-receive queue metadata ring to track precise truesize for
        mergeable receive buffers. Remove all truesize approximation. Never
        try to fill a full RX ring (required for metadata ring in v2).

 drivers/net/virtio_net.c | 99 ++++++++++++++++++++++++++++++++++++------------
 1 file changed, 74 insertions(+), 25 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 36cbf06..3e82311 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -26,6 +26,7 @@
 #include <linux/if_vlan.h>
 #include <linux/slab.h>
 #include <linux/cpu.h>
+#include <linux/average.h>
 
 static int napi_weight = NAPI_POLL_WEIGHT;
 module_param(napi_weight, int, 0444);
@@ -36,11 +37,18 @@ module_param(gso, bool, 0444);
 
 /* FIXME: MTU in config. */
 #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
-#define MERGE_BUFFER_LEN (ALIGN(GOOD_PACKET_LEN + \
-                                sizeof(struct virtio_net_hdr_mrg_rxbuf), \
-                                L1_CACHE_BYTES))
 #define GOOD_COPY_LEN	128
 
+/* Weight used for the RX packet size EWMA. The average packet size is used to
+ * determine the packet buffer size when refilling RX rings. As the entire RX
+ * ring may be refilled at once, the weight is chosen so that the EWMA will be
+ * insensitive to short-term, transient changes in packet size.
+ */
+#define RECEIVE_AVG_WEIGHT 64
+
+/* Minimum alignment for mergeable packet buffers. */
+#define MERGEABLE_BUFFER_ALIGN max(L1_CACHE_BYTES, 256)
+
 #define VIRTNET_DRIVER_VERSION "1.0.0"
 
 struct virtnet_stats {
@@ -78,6 +86,9 @@ struct receive_queue {
 	/* Chain pages by the private ptr. */
 	struct page *pages;
 
+	/* Average packet length for mergeable receive buffers. */
+	struct ewma mrg_avg_pkt_len;
+
 	/* Page frag for packet buffer allocation. */
 	struct page_frag alloc_frag;
 
@@ -219,6 +230,23 @@ static void skb_xmit_done(struct virtqueue *vq)
 	netif_wake_subqueue(vi->dev, vq2txq(vq));
 }
 
+static unsigned int mergeable_ctx_to_buf_truesize(unsigned long mrg_ctx)
+{
+	unsigned int truesize = mrg_ctx & (MERGEABLE_BUFFER_ALIGN - 1);
+	return truesize * MERGEABLE_BUFFER_ALIGN;
+}
+
+static void *mergeable_ctx_to_buf_address(unsigned long mrg_ctx)
+{
+	return (void *)(mrg_ctx & -MERGEABLE_BUFFER_ALIGN);
+
+}
+
+static unsigned long mergeable_buf_to_ctx(void *buf, unsigned int truesize)
+{
+	return (unsigned long)buf | (truesize / MERGEABLE_BUFFER_ALIGN);
+}
+
 /* Called from bottom half context */
 static struct sk_buff *page_to_skb(struct receive_queue *rq,
 				   struct page *page, unsigned int offset,
@@ -327,31 +355,33 @@ err:
 
 static struct sk_buff *receive_mergeable(struct net_device *dev,
 					 struct receive_queue *rq,
-					 void *buf,
+					 unsigned long ctx,
 					 unsigned int len)
 {
+	void *buf = mergeable_ctx_to_buf_address(ctx);
 	struct skb_vnet_hdr *hdr = buf;
 	int num_buf = hdr->mhdr.num_buffers;
 	struct page *page = virt_to_head_page(buf);
 	int offset = buf - page_address(page);
-	unsigned int truesize = max_t(unsigned int, len, MERGE_BUFFER_LEN);
+	unsigned int truesize = max(len, mergeable_ctx_to_buf_truesize(ctx));
+
 	struct sk_buff *head_skb = page_to_skb(rq, page, offset, len, truesize);
 	struct sk_buff *curr_skb = head_skb;
 
 	if (unlikely(!curr_skb))
 		goto err_skb;
-
 	while (--num_buf) {
 		int num_skb_frags;
 
-		buf = virtqueue_get_buf(rq->vq, &len);
-		if (unlikely(!buf)) {
+		ctx = (unsigned long)virtqueue_get_buf(rq->vq, &len);
+		if (unlikely(!ctx)) {
 			pr_debug("%s: rx error: %d buffers out of %d missing\n",
 				 dev->name, num_buf, hdr->mhdr.num_buffers);
 			dev->stats.rx_length_errors++;
 			goto err_buf;
 		}
 
+		buf = mergeable_ctx_to_buf_address(ctx);
 		page = virt_to_head_page(buf);
 		--rq->num;
 
@@ -369,7 +399,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 			head_skb->truesize += nskb->truesize;
 			num_skb_frags = 0;
 		}
-		truesize = max_t(unsigned int, len, MERGE_BUFFER_LEN);
+		truesize = max(len, mergeable_ctx_to_buf_truesize(ctx));
 		if (curr_skb != head_skb) {
 			head_skb->data_len += len;
 			head_skb->len += len;
@@ -386,19 +416,20 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 		}
 	}
 
+	ewma_add(&rq->mrg_avg_pkt_len, head_skb->len);
 	return head_skb;
 
 err_skb:
 	put_page(page);
 	while (--num_buf) {
-		buf = virtqueue_get_buf(rq->vq, &len);
-		if (unlikely(!buf)) {
+		ctx = (unsigned long)virtqueue_get_buf(rq->vq, &len);
+		if (unlikely(!ctx)) {
 			pr_debug("%s: rx error: %d buffers missing\n",
 				 dev->name, num_buf);
 			dev->stats.rx_length_errors++;
 			break;
 		}
-		page = virt_to_head_page(buf);
+		page = virt_to_head_page(mergeable_ctx_to_buf_address(ctx));
 		put_page(page);
 		--rq->num;
 	}
@@ -419,17 +450,20 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
 	if (unlikely(len < sizeof(struct virtio_net_hdr) + ETH_HLEN)) {
 		pr_debug("%s: short packet %i\n", dev->name, len);
 		dev->stats.rx_length_errors++;
-		if (vi->mergeable_rx_bufs)
-			put_page(virt_to_head_page(buf));
-		else if (vi->big_packets)
+		if (vi->mergeable_rx_bufs) {
+			unsigned long ctx = (unsigned long)buf;
+			void *base = mergeable_ctx_to_buf_address(ctx);
+			put_page(virt_to_head_page(base));
+		} else if (vi->big_packets) {
 			give_pages(rq, buf);
-		else
+		} else {
 			dev_kfree_skb(buf);
+		}
 		return;
 	}
 
 	if (vi->mergeable_rx_bufs)
-		skb = receive_mergeable(dev, rq, buf, len);
+		skb = receive_mergeable(dev, rq, (unsigned long)buf, len);
 	else if (vi->big_packets)
 		skb = receive_big(dev, rq, buf, len);
 	else
@@ -572,25 +606,36 @@ static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp)
 
 static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
 {
+	const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
 	struct page_frag *alloc_frag = &rq->alloc_frag;
 	char *buf;
+	unsigned long ctx;
 	int err;
 	unsigned int len, hole;
 
-	if (unlikely(!skb_page_frag_refill(MERGE_BUFFER_LEN, alloc_frag, gfp)))
+	len = hdr_len + clamp_t(unsigned int, ewma_read(&rq->mrg_avg_pkt_len),
+				GOOD_PACKET_LEN, PAGE_SIZE - hdr_len);
+	len = ALIGN(len, MERGEABLE_BUFFER_ALIGN);
+	if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
 		return -ENOMEM;
+
 	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
+	ctx = mergeable_buf_to_ctx(buf, len);
 	get_page(alloc_frag->page);
-	len = MERGE_BUFFER_LEN;
 	alloc_frag->offset += len;
 	hole = alloc_frag->size - alloc_frag->offset;
-	if (hole < MERGE_BUFFER_LEN) {
+	if (hole < len) {
+		/* To avoid internal fragmentation, if there is very likely not
+		 * enough space for another buffer, add the remaining space to
+		 * the current buffer. This extra space is not included in
+		 * the truesize stored in ctx.
+		 */
 		len += hole;
 		alloc_frag->offset += hole;
 	}
 
 	sg_init_one(rq->sg, buf, len);
-	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, buf, gfp);
+	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, (void *)ctx, gfp);
 	if (err < 0)
 		put_page(virt_to_head_page(buf));
 
@@ -1394,12 +1439,15 @@ static void free_unused_bufs(struct virtnet_info *vi)
 		struct virtqueue *vq = vi->rq[i].vq;
 
 		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
-			if (vi->mergeable_rx_bufs)
-				put_page(virt_to_head_page(buf));
-			else if (vi->big_packets)
+			if (vi->mergeable_rx_bufs) {
+				unsigned long ctx = (unsigned long)buf;
+				void *base = mergeable_ctx_to_buf_address(ctx);
+				put_page(virt_to_head_page(base));
+			} else if (vi->big_packets) {
 				give_pages(&vi->rq[i], buf);
-			else
+			} else {
 				dev_kfree_skb(buf);
+			}
 			--vi->rq[i].num;
 		}
 		BUG_ON(vi->rq[i].num != 0);
@@ -1509,6 +1557,7 @@ static int virtnet_alloc_queues(struct virtnet_info *vi)
 			       napi_weight);
 
 		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
+		ewma_init(&vi->rq[i].mrg_avg_pkt_len, 1, RECEIVE_AVG_WEIGHT);
 		sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
 	}
 
-- 
1.8.5.2

^ permalink raw reply related

* [PATCH net-next v5 4/6] net-sysfs: add support for device-specific rx queue sysfs attributes
From: Michael Dalton @ 2014-01-17  2:36 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Eric Dumazet, Rusty Russell, Michael S. Tsirkin,
	Jason Wang, Ben Hutchings, virtualization, Michael Dalton
In-Reply-To: <1389926176-6699-1-git-send-email-mwdalton@google.com>

Extend existing support for netdevice receive queue sysfs attributes to
permit a device-specific attribute group. Initial use case for this
support will be to allow the virtio-net device to export per-receive
queue mergeable receive buffer size.

Signed-off-by: Michael Dalton <mwdalton@google.com>
---
v4->v5: Handle sysfs_create_group failure. Call sysfs_remove_group when
        removing a RX queue kobj if a device-specific group exists.
v3->v4: Simplify by removing loop in get_netdev_rx_queue_index.

 include/linux/netdevice.h | 35 +++++++++++++++++++++++++++++----
 net/core/dev.c            | 12 ++++++------
 net/core/net-sysfs.c      | 50 +++++++++++++++++++++++++++--------------------
 3 files changed, 66 insertions(+), 31 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5c88ab1..38929bc 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -668,15 +668,28 @@ extern struct rps_sock_flow_table __rcu *rps_sock_flow_table;
 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id,
 			 u16 filter_id);
 #endif
+#endif /* CONFIG_RPS */
 
 /* This structure contains an instance of an RX queue. */
 struct netdev_rx_queue {
+#ifdef CONFIG_RPS
 	struct rps_map __rcu		*rps_map;
 	struct rps_dev_flow_table __rcu	*rps_flow_table;
+#endif
 	struct kobject			kobj;
 	struct net_device		*dev;
 } ____cacheline_aligned_in_smp;
-#endif /* CONFIG_RPS */
+
+/*
+ * RX queue sysfs structures and functions.
+ */
+struct rx_queue_attribute {
+	struct attribute attr;
+	ssize_t (*show)(struct netdev_rx_queue *queue,
+	    struct rx_queue_attribute *attr, char *buf);
+	ssize_t (*store)(struct netdev_rx_queue *queue,
+	    struct rx_queue_attribute *attr, const char *buf, size_t len);
+};
 
 #ifdef CONFIG_XPS
 /*
@@ -1313,7 +1326,7 @@ struct net_device {
 						   unicast) */
 
 
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 	struct netdev_rx_queue	*_rx;
 
 	/* Number of RX queues allocated at register_netdev() time */
@@ -1424,6 +1437,8 @@ struct net_device {
 	struct device		dev;
 	/* space for optional device, statistics, and wireless sysfs groups */
 	const struct attribute_group *sysfs_groups[4];
+	/* space for optional per-rx queue attributes */
+	const struct attribute_group *sysfs_rx_queue_group;
 
 	/* rtnetlink link ops */
 	const struct rtnl_link_ops *rtnl_link_ops;
@@ -2374,7 +2389,7 @@ static inline bool netif_is_multiqueue(const struct net_device *dev)
 
 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq);
 
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq);
 #else
 static inline int netif_set_real_num_rx_queues(struct net_device *dev,
@@ -2393,7 +2408,7 @@ static inline int netif_copy_real_num_queues(struct net_device *to_dev,
 					   from_dev->real_num_tx_queues);
 	if (err)
 		return err;
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 	return netif_set_real_num_rx_queues(to_dev,
 					    from_dev->real_num_rx_queues);
 #else
@@ -2401,6 +2416,18 @@ static inline int netif_copy_real_num_queues(struct net_device *to_dev,
 #endif
 }
 
+#ifdef CONFIG_SYSFS
+static inline unsigned int get_netdev_rx_queue_index(
+		struct netdev_rx_queue *queue)
+{
+	struct net_device *dev = queue->dev;
+	int index = queue - dev->_rx;
+
+	BUG_ON(index >= dev->num_rx_queues);
+	return index;
+}
+#endif
+
 #define DEFAULT_MAX_NUM_RSS_QUEUES	(8)
 int netif_get_num_default_rss_queues(void);
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 20c834e..4be7931 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2080,7 +2080,7 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
 }
 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
 
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 /**
  *	netif_set_real_num_rx_queues - set actual number of RX queues used
  *	@dev: Network device
@@ -5727,7 +5727,7 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev,
 }
 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
 
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 static int netif_alloc_rx_queues(struct net_device *dev)
 {
 	unsigned int i, count = dev->num_rx_queues;
@@ -6272,7 +6272,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 		return NULL;
 	}
 
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 	if (rxqs < 1) {
 		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
 		return NULL;
@@ -6328,7 +6328,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 	if (netif_alloc_netdev_queues(dev))
 		goto free_all;
 
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 	dev->num_rx_queues = rxqs;
 	dev->real_num_rx_queues = rxqs;
 	if (netif_alloc_rx_queues(dev))
@@ -6348,7 +6348,7 @@ free_all:
 free_pcpu:
 	free_percpu(dev->pcpu_refcnt);
 	netif_free_tx_queues(dev);
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 	kfree(dev->_rx);
 #endif
 
@@ -6373,7 +6373,7 @@ void free_netdev(struct net_device *dev)
 	release_net(dev_net(dev));
 
 	netif_free_tx_queues(dev);
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 	kfree(dev->_rx);
 #endif
 
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 49843bf..7eeadee 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -498,17 +498,7 @@ static struct attribute_group wireless_group = {
 #define net_class_groups	NULL
 #endif /* CONFIG_SYSFS */
 
-#ifdef CONFIG_RPS
-/*
- * RX queue sysfs structures and functions.
- */
-struct rx_queue_attribute {
-	struct attribute attr;
-	ssize_t (*show)(struct netdev_rx_queue *queue,
-	    struct rx_queue_attribute *attr, char *buf);
-	ssize_t (*store)(struct netdev_rx_queue *queue,
-	    struct rx_queue_attribute *attr, const char *buf, size_t len);
-};
+#ifdef CONFIG_SYSFS
 #define to_rx_queue_attr(_attr) container_of(_attr,		\
     struct rx_queue_attribute, attr)
 
@@ -543,6 +533,7 @@ static const struct sysfs_ops rx_queue_sysfs_ops = {
 	.store = rx_queue_attr_store,
 };
 
+#ifdef CONFIG_RPS
 static ssize_t show_rps_map(struct netdev_rx_queue *queue,
 			    struct rx_queue_attribute *attribute, char *buf)
 {
@@ -718,16 +709,20 @@ static struct rx_queue_attribute rps_cpus_attribute =
 static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute =
 	__ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR,
 	    show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt);
+#endif /* CONFIG_RPS */
 
 static struct attribute *rx_queue_default_attrs[] = {
+#ifdef CONFIG_RPS
 	&rps_cpus_attribute.attr,
 	&rps_dev_flow_table_cnt_attribute.attr,
+#endif
 	NULL
 };
 
 static void rx_queue_release(struct kobject *kobj)
 {
 	struct netdev_rx_queue *queue = to_rx_queue(kobj);
+#ifdef CONFIG_RPS
 	struct rps_map *map;
 	struct rps_dev_flow_table *flow_table;
 
@@ -743,6 +738,7 @@ static void rx_queue_release(struct kobject *kobj)
 		RCU_INIT_POINTER(queue->rps_flow_table, NULL);
 		call_rcu(&flow_table->rcu, rps_dev_flow_table_release);
 	}
+#endif
 
 	memset(kobj, 0, sizeof(*kobj));
 	dev_put(queue->dev);
@@ -763,25 +759,36 @@ static int rx_queue_add_kobject(struct net_device *net, int index)
 	kobj->kset = net->queues_kset;
 	error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL,
 	    "rx-%u", index);
-	if (error) {
-		kobject_put(kobj);
-		return error;
+	if (error)
+		goto exit;
+
+	if (net->sysfs_rx_queue_group) {
+		error = sysfs_create_group(kobj, net->sysfs_rx_queue_group);
+		if (error)
+			goto exit;
 	}
 
 	kobject_uevent(kobj, KOBJ_ADD);
 	dev_hold(queue->dev);
 
 	return error;
+exit:
+	kobject_put(kobj);
+	return error;
 }
-#endif /* CONFIG_RPS */
+#endif /* CONFIG_SYFS */
 
 int
 net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
 {
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 	int i;
 	int error = 0;
 
+#ifndef CONFIG_RPS
+	if (!net->sysfs_rx_queue_group)
+		return 0;
+#endif
 	for (i = old_num; i < new_num; i++) {
 		error = rx_queue_add_kobject(net, i);
 		if (error) {
@@ -790,8 +797,12 @@ net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
 		}
 	}
 
-	while (--i >= new_num)
+	while (--i >= new_num) {
+		if (net->sysfs_rx_queue_group)
+			sysfs_remove_group(&net->_rx[i].kobj,
+					   net->sysfs_rx_queue_group);
 		kobject_put(&net->_rx[i].kobj);
+	}
 
 	return error;
 #else
@@ -1155,9 +1166,6 @@ static int register_queue_kobjects(struct net_device *net)
 	    NULL, &net->dev.kobj);
 	if (!net->queues_kset)
 		return -ENOMEM;
-#endif
-
-#ifdef CONFIG_RPS
 	real_rx = net->real_num_rx_queues;
 #endif
 	real_tx = net->real_num_tx_queues;
@@ -1184,7 +1192,7 @@ static void remove_queue_kobjects(struct net_device *net)
 {
 	int real_rx = 0, real_tx = 0;
 
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 	real_rx = net->real_num_rx_queues;
 #endif
 	real_tx = net->real_num_tx_queues;
-- 
1.8.5.2

^ permalink raw reply related

* [PATCH net-next v5 5/6] lib: Ensure EWMA does not store wrong intermediate values
From: Michael Dalton @ 2014-01-17  2:36 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Eric Dumazet, Rusty Russell, Michael S. Tsirkin,
	Jason Wang, Ben Hutchings, virtualization, Michael Dalton
In-Reply-To: <1389926176-6699-1-git-send-email-mwdalton@google.com>

To ensure ewma_read() without a lock returns a valid but possibly
out of date average, modify ewma_add() by using ACCESS_ONCE to prevent
intermediate wrong values from being written to avg->internal.

Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Michael Dalton <mwdalton@google.com>
---
 lib/average.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/lib/average.c b/lib/average.c
index 99a67e6..114d1be 100644
--- a/lib/average.c
+++ b/lib/average.c
@@ -53,8 +53,10 @@ EXPORT_SYMBOL(ewma_init);
  */
 struct ewma *ewma_add(struct ewma *avg, unsigned long val)
 {
-	avg->internal = avg->internal  ?
-		(((avg->internal << avg->weight) - avg->internal) +
+	unsigned long internal = ACCESS_ONCE(avg->internal);
+
+	ACCESS_ONCE(avg->internal) = internal ?
+		(((internal << avg->weight) - internal) +
 			(val << avg->factor)) >> avg->weight :
 		(val << avg->factor);
 	return avg;
-- 
1.8.5.2

^ permalink raw reply related

* Re: Fwd: [RFC PATCH net-next 0/3] virtio_net: add aRFS support
From: Jason Wang @ 2014-01-17  3:04 UTC (permalink / raw)
  To: Stefan Hajnoczi, Zhi Yong Wu
  Cc: Linux Netdev List, Tom Herbert, Eric Dumazet, David S. Miller,
	Zhi Yong Wu, Michael S. Tsirkin, Rusty Russell
In-Reply-To: <20140116085253.GA32073@stefanha-thinkpad.redhat.com>

On 01/16/2014 04:52 PM, Stefan Hajnoczi wrote:
> On Thu, Jan 16, 2014 at 04:34:10PM +0800, Zhi Yong Wu wrote:
>> CC: stefanha, MST, Rusty Russel
>>
>> ---------- Forwarded message ----------
>> From: Jason Wang <jasowang@redhat.com>
>> Date: Thu, Jan 16, 2014 at 12:23 PM
>> Subject: Re: [RFC PATCH net-next 0/3] virtio_net: add aRFS support
>> To: Zhi Yong Wu <zwu.kernel@gmail.com>
>> Cc: netdev@vger.kernel.org, therbert@google.com, edumazet@google.com,
>> davem@davemloft.net, Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
>>
>>
>> On 01/15/2014 10:20 PM, Zhi Yong Wu wrote:
>>> From: Zhi Yong Wu<wuzhy@linux.vnet.ibm.com>
>>>
>>> HI, folks
>>>
>>> The patchset is trying to integrate aRFS support to virtio_net. In this case,
>>> aRFS will be used to select the RX queue. To make sure that it's going ahead
>>> in the correct direction, although it is still one RFC and isn't tested, it's
>>> post out ASAP. Any comment are appreciated, thanks.
>>>
>>> If anyone is interested in playing with it, you can get this patchset from my
>>> dev git on github:
>>>    git://github.com/wuzhy/kernel.git virtnet_rfs
>>>
>>> Zhi Yong Wu (3):
>>>    virtio_pci: Introduce one new config api vp_get_vq_irq()
>>>    virtio_net: Introduce one dummy function virtnet_filter_rfs()
>>>    virtio-net: Add accelerated RFS support
>>>
>>>   drivers/net/virtio_net.c      |   67 ++++++++++++++++++++++++++++++++++++++++-
>>>   drivers/virtio/virtio_pci.c   |   11 +++++++
>>>   include/linux/virtio_config.h |   12 +++++++
>>>   3 files changed, 89 insertions(+), 1 deletions(-)
>>>
>> Please run get_maintainter.pl before sending the patch. You'd better
>> at least cc virtio maintainer/list for this.
>>
>> The core aRFS method is a noop in this RFC which make this series no
>> much sense to discuss. You should at least mention the big picture
>> here in the cover letter. I suggest you should post a RFC which can
>> run and has expected result or you can just raise a thread for the
>> design discussion.
>>
>> And this method has been discussed before, you can search "[net-next
>> RFC PATCH 5/5] virtio-net: flow director support" in netdev archive
>> for a very old prototype implemented by me. It can work and looks like
>> most of this RFC have already done there.
>>
>> A basic question is whether or not we need this, not all the mq cards
>> use aRFS (see ixgbe ATR). And whether or not it can bring extra
>> overheads? For virtio, we want to reduce the vmexits as much as
>> possible but this aRFS seems introduce a lot of more of this. Making a
>> complex interfaces just for an virtual device may not be good, simple
>> method may works for most of the cases.
>>
>> We really should consider to offload this to real nic. VMDq and L2
>> forwarding offload may help in this case.
> Zhi Yong and I had an IRC chat.  I wanted to post my questions on the
> list - it's still the same concern I had in the old email thread that
> Jason mentioned.
>
> In order for virtio-net aRFS to make sense there needs to be an overall
> plan for pushing flow mapping information down to the physical NIC.
> That's the only way to actually achieve the benefit of steering:
> processing the packet on the CPU where the application is running.

There are several places that the packet needs to be processing:
1) If you mean send the interrupt of virtio-net to the vcpu that the
application is running, current virito-net has already had the ability
with the help of flow caches of tun and the vcpu private queue
configuration of XPS and irq affinity hint which is automatically done
through the driver itself.
2) If you mean send the interrupt or doing the softirq of the host card
to the cpu where vhost thread is running, recent RFS support of tun can
do this.
3) And about the affinity of vhost thread and vcpu thread, this seems
beyond the scope of aRFS.

So it looks like current code works (though may have some drawbacks).

aRFS just another method of doing 1) by letting the driver can control
the flow to queue mapping. Currently tun do it silently and does not
expose an API for userspace to configure. The question is whether or not
we need this and if yes how can implement it efficiently.
>
> If it's not possible or too hard to implement aRFS down the entire
> stack, we won't be able to process the packet on the right CPU.
> Then we might as well not bother with aRFS and just distribute uniformly
> across the rx virtqueues.
>
> Please post an outline of how rx packets will be steered up the stack so
> we can discuss whether aRFS can bring any benefit.
>
> Stefan

^ permalink raw reply

* Re: [PATCH net-next v5 0/6] virtio-net: mergeable rx buffer size auto-tuning
From: David Miller @ 2014-01-17  3:08 UTC (permalink / raw)
  To: mwdalton; +Cc: mst, netdev, virtualization, edumazet, bhutchings
In-Reply-To: <1389926176-6699-1-git-send-email-mwdalton@google.com>


This series does not apply cleanly to net-next.

^ permalink raw reply

* Re: PANIC in vxlan <debugging now>
From: Fan Du @ 2014-01-17  3:08 UTC (permalink / raw)
  To: Jesse Brandeburg; +Cc: netdev
In-Reply-To: <20140116171428.00004da1@unknown>



On 2014年01月17日 09:14, Jesse Brandeburg wrote:
> I'm currently debugging this but given where the kernel release cycle
> is I wanted to let the list know.

How you reproduce it? I use below two methods, both works.
And my net-next head is at: abfce3ef58b6a6c95de389f9d20047a05b10e484, pretty new.

Method 1:
ip link add vxlan0 type vxlan id 42 group ff0e::110
ip link set vxlan0 up
ip link set vxlan0 down

Method 2:
ip link add vxlan0 type vxlan id 42 group ff0e::110
ip link set vxlan0 up
ip link delete vxlan0


> It may well be a bug in our code, and if it is we'll find it, but here is
> the panic, it doesn't occur when vxlan is not enabled.
>
> Jan 16 13:46:44 jbrandeb-cp2 kernel: [   17.331010] cgroup: libvirtd (1387) created nested cgroup for controller "memory" which has incomplete hierarchy supp
> ort. Nested cgroups may change behavior in the future.
> Jan 16 13:46:44 jbrandeb-cp2 kernel: [   17.331014] cgroup: "memory" requires setting use_hierarchy to 1 on the root.
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.576568] ------------[ cut here ]------------
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.586411] kernel BUG at include/net/netns/generic.h:45!
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.596336] invalid opcode: 0000 [#1] SMP
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.606268] Modules linked in: lockd sunrpc i40e igb iTCO_wdt iTCO_vendor_support sb_edac ioatdma ptp microcode lpc_ich edac_core i2c_i801 mfd_core dca pps_core wmi kvm uinput isci firewire_ohci libsas firewire_core crc_itu_t scsi_transport_sas mgag200 drm_kms_helper ttm
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.637923] CPU: 0 PID: 1387 Comm: libvirtd Not tainted 3.13.0-rc7+ #30
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.648599] Hardware name: Intel Corporation S2600CO ........../S2600CO, BIOS SE5C600.86B.01.08.6003.062420131549 06/24/2013
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.659612] task: ffff88063b5c6000 ti: ffff8806333ca000 task.ti: ffff8806333ca000
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.670661] RIP: 0010:[<ffffffff816df92f>]  [<ffffffff816df92f>] net_generic.isra.34.part.35+0x4/0x6
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.681738] RSP: 0018:ffff8806333cbb80  EFLAGS: 00010246
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.692536] RAX: 0000000000000000 RBX: 00000000ffffffed RCX: 0000000000000010
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.703577] RDX: ffff88063d03d380 RSI: 0000000000000010 RDI: ffffffff81cfd9f0
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.714612] RBP: ffff8806333cbb80 R08: 0000000000000000 R09: ffffffff81cfd9f0
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.725531] R10: 00000000000002cc R11: 0000000000000004 R12: 0000000000000000
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.736448] R13: ffff880639118000 R14: ffff8806333cbc68 R15: 0000000000000000
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.747292] FS:  00007f6381830700(0000) GS:ffff880647600000(0000) knlGS:0000000000000000
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.758248] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.769263] CR2: 00007f637c04b000 CR3: 0000000c3aa1f000 CR4: 00000000000407f0
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.780402] Stack:
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.791386]  ffff8806333cbbc0 ffffffff814d0865 ffff8806333cbc40 00000000ffffffef
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.802702]  00000000ffffffed ffffffff81cc67d0 0000000000000010 ffff8806333cbc68
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.814021]  ffff8806333cbc00 ffffffff816e9e5d 0000000000000004 ffff8806333cbc68
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.825185] Call Trace:
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.836106]  [<ffffffff814d0865>] vxlan_lowerdev_event+0xf5/0x100
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.847254]  [<ffffffff816e9e5d>] notifier_call_chain+0x4d/0x70
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.858457]  [<ffffffff810912be>] __raw_notifier_call_chain+0xe/0x10
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.869696]  [<ffffffff810912d6>] raw_notifier_call_chain+0x16/0x20
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.880896]  [<ffffffff815d9610>] call_netdevice_notifiers_info+0x40/0x70
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.892063]  [<ffffffff815d9656>] call_netdevice_notifiers+0x16/0x20
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.903107]  [<ffffffff815e1bce>] register_netdevice+0x1be/0x3a0
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.914128]  [<ffffffff815e1dce>] register_netdev+0x1e/0x30
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.925072]  [<ffffffff814cb94a>] loopback_net_init+0x4a/0xb0
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.936048]  [<ffffffffa016ed6e>] ? lockd_init_net+0x6e/0xb0 [lockd]
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.947081]  [<ffffffff815d6bac>] ops_init+0x4c/0x150
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.958070]  [<ffffffff815d6d23>] setup_net+0x73/0x110
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.969006]  [<ffffffff815d725b>] copy_net_ns+0x7b/0x100
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.979897]  [<ffffffff81090e11>] create_new_namespaces+0x101/0x1b0
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.990855]  [<ffffffff81090f45>] copy_namespaces+0x85/0xb0
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.001656]  [<ffffffff810693d5>] copy_process.part.26+0x935/0x1500
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.012370]  [<ffffffff811d5186>] ? mntput+0x26/0x40
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.022924]  [<ffffffff8106a15c>] do_fork+0xbc/0x2e0
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.033331]  [<ffffffff811b7f2e>] ? ____fput+0xe/0x10
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.043622]  [<ffffffff81089c5c>] ? task_work_run+0xac/0xe0
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.053905]  [<ffffffff8106a406>] SyS_clone+0x16/0x20
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.064265]  [<ffffffff816ee689>] stub_clone+0x69/0x90
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.074600]  [<ffffffff816ee329>] ? system_call_fastpath+0x16/0x1b
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.084879] Code: 00 75 1d 55 be 2f 00 00 00 48 c7 c7 65 93 a2 81 48 89 e5 e8 f4 b5 98 ff 5d c6 05 30 aa 5f 00 01 c3 55 48 89 e5 0f 0b 55 48 89 e5<0f>  0b 55 48 89 e5 0f 0b 66 66 66 66 90 55 48 c7 c7 c0 4c cb 81
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.105818] RIP  [<ffffffff816df92f>] net_generic.isra.34.part.35+0x4/0x6
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.116106]  RSP<ffff8806333cbb80>
> Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.172366] ---[ end trace 0bb84cf9aa76a384 ]---
> Jan 16 13:46:47 jbrandeb-cp2 systemd[1]: Startup finished in 4s 918ms 164us (kernel) + 3s 548ms 460us (initrd) + 11s 2ms 474us (userspace) = 19s 469ms 98us.
> Jan 16 13:46:47 jbrandeb-cp2 dbus-daemon[989]: dbus[989]: [system] Activating via systemd: service name='org.freedesktop.Accounts' unit='accounts-daemon.service'
>
> code says:
> (gdb) l *(vxlan_lowerdev_event+0xf5)
> 0xffffffff814d0865 is at include/net/netns/generic.h:41.
> 34      static inline void *net_generic(const struct net *net, int id)
> 35      {
> 36              struct net_generic *ng;
> 37              void *ptr;
> 38
> 39              rcu_read_lock();
> 40              ng = rcu_dereference(net->gen);
> 41              BUG_ON(id == 0 || id>  ng->len);
> 42              ptr = ng->ptr[id - 1];
> 43              rcu_read_unlock();
> 44
>>>>> 45              BUG_ON(!ptr);
> 46              return ptr;
> 47      }
> 48      #endif
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

-- 
浮沉随浪只记今朝笑

--fan

^ permalink raw reply

* Re: [PATCH net-next 0/5] tipc: align TIPC behaviours of waiting for events with other stacks
From: David Miller @ 2014-01-17  3:11 UTC (permalink / raw)
  To: ying.xue
  Cc: Paul.Gortmaker, maloy, jon.maloy, erik.hugne, netdev,
	tipc-discussion
In-Reply-To: <1389923407-26969-1-git-send-email-ying.xue@windriver.com>

From: Ying Xue <ying.xue@windriver.com>
Date: Fri, 17 Jan 2014 09:50:02 +0800

> Comparing the current implementations of waiting for events in TIPC
> socket layer with other stacks, TIPC's behaviour is very different
> because wait_event_interruptible_timeout()/wait_event_interruptible()
> are always used by TIPC to wait for events while relevant socket or
> port variables are fed to them as their arguments. As socket lock has
> to be released temporarily before the two routines of waiting for
> events are called, their arguments associated with socket or port
> structures are out of socket lock protection. This might cause
> serious issues where the process of calling socket syscall such as
> sendsmg(), connect(), accept(), and recvmsg(), cannot be waken up
> at all even if proper event arrives or improperly be woken up
> although the condition of waking up the process is not satisfied
> in practice.
> 
> Therefore, aligning its behaviours with similar functions implemented
> in other stacks, for instance, sk_stream_wait_connect() and
> inet_csk_wait_for_connect() etc, can avoid above risks for us.

Series applied, thank you.

^ permalink raw reply

* Re: [PATCH -next] net_sched: fix error return code in fw_change_attrs()
From: David Miller @ 2014-01-17  3:12 UTC (permalink / raw)
  To: weiyj.lk; +Cc: jhs, xiyou.wangcong, yongjun_wei, netdev
In-Reply-To: <CAPgLHd_cjvdEcYZZNGg1rwKRBQcUS74vGvsdmTNkSWpPqR+fMQ@mail.gmail.com>

From: Wei Yongjun <weiyj.lk@gmail.com>
Date: Fri, 17 Jan 2014 09:53:20 +0800

> From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
> 
> The error code was not set if change indev fail, so the error
> condition wasn't reflected in the return value. Fix to return a
> negative error code from this error handling case instead of 0.
> 
> Fixes: 2519a602c273 ('net_sched: optimize tcf_match_indev()')
> Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>

Applied, thank you.

^ permalink raw reply

* Re: [PATCH net-next 0/5] tipc: align TIPC behaviours of waiting for events with other stacks
From: Jon Maloy @ 2014-01-17  3:20 UTC (permalink / raw)
  To: David Miller, ying.xue@windriver.com
  Cc: Paul.Gortmaker@windriver.com,
	tipc-discussion@lists.sourceforge.net, netdev@vger.kernel.org
In-Reply-To: <20140116.191140.184722722665650663.davem@davemloft.net>

Nice job, Ying.
///jon

> -----Original Message-----
> From: David Miller [mailto:davem@davemloft.net]
> Sent: January-16-14 10:12 PM
> To: ying.xue@windriver.com
> Cc: Paul.Gortmaker@windriver.com; maloy@donjonn.com; Jon Maloy; Erik
> Hugne; netdev@vger.kernel.org; tipc-discussion@lists.sourceforge.net
> Subject: Re: [PATCH net-next 0/5] tipc: align TIPC behaviours of waiting for
> events with other stacks
> 
> From: Ying Xue <ying.xue@windriver.com>
> Date: Fri, 17 Jan 2014 09:50:02 +0800
> 
> > Comparing the current implementations of waiting for events in TIPC
> > socket layer with other stacks, TIPC's behaviour is very different
> > because wait_event_interruptible_timeout()/wait_event_interruptible()
> > are always used by TIPC to wait for events while relevant socket or
> > port variables are fed to them as their arguments. As socket lock has
> > to be released temporarily before the two routines of waiting for
> > events are called, their arguments associated with socket or port
> > structures are out of socket lock protection. This might cause serious
> > issues where the process of calling socket syscall such as sendsmg(),
> > connect(), accept(), and recvmsg(), cannot be waken up at all even if
> > proper event arrives or improperly be woken up although the condition
> > of waking up the process is not satisfied in practice.
> >
> > Therefore, aligning its behaviours with similar functions implemented
> > in other stacks, for instance, sk_stream_wait_connect() and
> > inet_csk_wait_for_connect() etc, can avoid above risks for us.
> 
> Series applied, thank you.

------------------------------------------------------------------------------
CenturyLink Cloud: The Leader in Enterprise Cloud Services.
Learn Why More Businesses Are Choosing CenturyLink Cloud For
Critical Workloads, Development Environments & Everything In Between.
Get a Quote or Start a Free Trial Today. 
http://pubads.g.doubleclick.net/gampad/clk?id=119420431&iu=/4140/ostg.clktrk

^ permalink raw reply

* Re: Fwd: [RFC PATCH net-next 0/3] virtio_net: add aRFS support
From: Jason Wang @ 2014-01-17  3:26 UTC (permalink / raw)
  To: Tom Herbert, Stefan Hajnoczi
  Cc: Zhi Yong Wu, Linux Netdev List, Eric Dumazet, David S. Miller,
	Zhi Yong Wu, Michael S. Tsirkin, Rusty Russell
In-Reply-To: <CA+mtBx9PBtYurdnhCKL0MLL8i+_+3yPNWFVj5h6SPJH+YDBCjw@mail.gmail.com>

On 01/17/2014 01:12 AM, Tom Herbert wrote:
> On Thu, Jan 16, 2014 at 12:52 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
>> On Thu, Jan 16, 2014 at 04:34:10PM +0800, Zhi Yong Wu wrote:
>>> CC: stefanha, MST, Rusty Russel
>>>
>>> ---------- Forwarded message ----------
>>> From: Jason Wang <jasowang@redhat.com>
>>> Date: Thu, Jan 16, 2014 at 12:23 PM
>>> Subject: Re: [RFC PATCH net-next 0/3] virtio_net: add aRFS support
>>> To: Zhi Yong Wu <zwu.kernel@gmail.com>
>>> Cc: netdev@vger.kernel.org, therbert@google.com, edumazet@google.com,
>>> davem@davemloft.net, Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
>>>
>>>
>>> On 01/15/2014 10:20 PM, Zhi Yong Wu wrote:
>>>> From: Zhi Yong Wu<wuzhy@linux.vnet.ibm.com>
>>>>
>>>> HI, folks
>>>>
>>>> The patchset is trying to integrate aRFS support to virtio_net. In this case,
>>>> aRFS will be used to select the RX queue. To make sure that it's going ahead
>>>> in the correct direction, although it is still one RFC and isn't tested, it's
>>>> post out ASAP. Any comment are appreciated, thanks.
>>>>
>>>> If anyone is interested in playing with it, you can get this patchset from my
>>>> dev git on github:
>>>>    git://github.com/wuzhy/kernel.git virtnet_rfs
>>>>
>>>> Zhi Yong Wu (3):
>>>>    virtio_pci: Introduce one new config api vp_get_vq_irq()
>>>>    virtio_net: Introduce one dummy function virtnet_filter_rfs()
>>>>    virtio-net: Add accelerated RFS support
>>>>
>>>>   drivers/net/virtio_net.c      |   67 ++++++++++++++++++++++++++++++++++++++++-
>>>>   drivers/virtio/virtio_pci.c   |   11 +++++++
>>>>   include/linux/virtio_config.h |   12 +++++++
>>>>   3 files changed, 89 insertions(+), 1 deletions(-)
>>>>
>>> Please run get_maintainter.pl before sending the patch. You'd better
>>> at least cc virtio maintainer/list for this.
>>>
>>> The core aRFS method is a noop in this RFC which make this series no
>>> much sense to discuss. You should at least mention the big picture
>>> here in the cover letter. I suggest you should post a RFC which can
>>> run and has expected result or you can just raise a thread for the
>>> design discussion.
>>>
>>> And this method has been discussed before, you can search "[net-next
>>> RFC PATCH 5/5] virtio-net: flow director support" in netdev archive
>>> for a very old prototype implemented by me. It can work and looks like
>>> most of this RFC have already done there.
>>>
>>> A basic question is whether or not we need this, not all the mq cards
>>> use aRFS (see ixgbe ATR). And whether or not it can bring extra
>>> overheads? For virtio, we want to reduce the vmexits as much as
>>> possible but this aRFS seems introduce a lot of more of this. Making a
>>> complex interfaces just for an virtual device may not be good, simple
>>> method may works for most of the cases.
>>>
>>> We really should consider to offload this to real nic. VMDq and L2
>>> forwarding offload may help in this case.
> Adding flow director support would be a good step, Zhi's patches for
> support in tun have been merged, so support in virtio-net would be a
> good follow on. But, flow-director does have some limitations and
> performance issues of it's own (forced pairing between TX and RX
> queues, lookup on every TX packet). 

True. But the pairing was designed to work without guest involving since
we really want to reduce the vmexits from guest. And lookup on every TX
packets could be released to every N packets. But I agree exposing the
API to guest may bring lots of flexibility.
> In the case of virtualization,
> aRFS, RSS, ntuple filtering, LRO, etc. can be implemented as software
> emulations and so far seems to be wins in most cases. Extending these
> down into the stack so that they can leverage HW mechanisms is a good
> goal for best performance. It's probably generally true that most of
> the offloads commonly available for NICs we'll want in virtualization
> path. Of course, we need to deomonstrate that they provide real
> performance benefit in this use case.

Yes, we need a prototype to see how much it can help.
>
> I believe tying in aRFS (or flow director) into a real aRFS is just a
> matter of programming the RFS table properly. This is not the complex
> side of the interface, I believe this already works with the tun
> patches.

Right, what we may needs is

- exposing new tun ioctls for qemu adding or removing a flow
- new virtqueue command for guest driver to adding or removing a flow
(btw, current control virtqueue is really slow, we may need to improve it).
- an agreement of host and guest to use the same hash method, or just
compute software hash in host and pass it to guest (which needs extra
API to do)
- change guest driver to use aRFS

Some of the above has been implemented in my old RFC.
>
>> Zhi Yong and I had an IRC chat.  I wanted to post my questions on the
>> list - it's still the same concern I had in the old email thread that
>> Jason mentioned.
>>
>> In order for virtio-net aRFS to make sense there needs to be an overall
>> plan for pushing flow mapping information down to the physical NIC.
>> That's the only way to actually achieve the benefit of steering:
>> processing the packet on the CPU where the application is running.
>>
> I don't think this is necessarily true. Per flow steering amongst
> virtual queues should be beneficial in itself. virtio-net can leverage
> RFS or aRFS where it's available.
>
>> If it's not possible or too hard to implement aRFS down the entire
>> stack, we won't be able to process the packet on the right CPU.
>> Then we might as well not bother with aRFS and just distribute uniformly
>> across the rx virtqueues.
>>
>> Please post an outline of how rx packets will be steered up the stack so
>> we can discuss whether aRFS can bring any benefit.
>>
> 1. The aRFS interface for the guest to specify which virtual queue to
> receive a packet on is fairly straight forward.
> 2. To hook into RFS, we need to match the virtual queue to the real
> CPU it will processed on, and then program the RFS table for that flow
> and CPU.
> 3. NIC aRFS keys off the RFS tables so it can program the HW with the
> correct queue for the CPU.
>
>> Stefan
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: PANIC in vxlan <debugging now>
From: Eric Dumazet @ 2014-01-17  3:37 UTC (permalink / raw)
  To: Fan Du; +Cc: Jesse Brandeburg, netdev
In-Reply-To: <52D89EAE.2080308@windriver.com>

On Fri, 2014-01-17 at 11:08 +0800, Fan Du wrote:
> 
> On 2014年01月17日 09:14, Jesse Brandeburg wrote:
> > I'm currently debugging this but given where the kernel release cycle
> > is I wanted to let the list know.
> 
> How you reproduce it? I use below two methods, both works.
> And my net-next head is at: abfce3ef58b6a6c95de389f9d20047a05b10e484, pretty new.
> 
> Method 1:
> ip link add vxlan0 type vxlan id 42 group ff0e::110
> ip link set vxlan0 up
> ip link set vxlan0 down
> 
> Method 2:
> ip link add vxlan0 type vxlan id 42 group ff0e::110
> ip link set vxlan0 up
> ip link delete vxlan0
> 

Given the stack trace, it seems you need to setup a network namespace.

Try 

ip netns add foo


> 
> > It may well be a bug in our code, and if it is we'll find it, but here is
> > the panic, it doesn't occur when vxlan is not enabled.
> >
> > Jan 16 13:46:44 jbrandeb-cp2 kernel: [   17.331010] cgroup: libvirtd (1387) created nested cgroup for controller "memory" which has incomplete hierarchy supp
> > ort. Nested cgroups may change behavior in the future.
> > Jan 16 13:46:44 jbrandeb-cp2 kernel: [   17.331014] cgroup: "memory" requires setting use_hierarchy to 1 on the root.
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.576568] ------------[ cut here ]------------
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.586411] kernel BUG at include/net/netns/generic.h:45!
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.596336] invalid opcode: 0000 [#1] SMP
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.606268] Modules linked in: lockd sunrpc i40e igb iTCO_wdt iTCO_vendor_support sb_edac ioatdma ptp microcode lpc_ich edac_core i2c_i801 mfd_core dca pps_core wmi kvm uinput isci firewire_ohci libsas firewire_core crc_itu_t scsi_transport_sas mgag200 drm_kms_helper ttm
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.637923] CPU: 0 PID: 1387 Comm: libvirtd Not tainted 3.13.0-rc7+ #30
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.648599] Hardware name: Intel Corporation S2600CO ........../S2600CO, BIOS SE5C600.86B.01.08.6003.062420131549 06/24/2013
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.659612] task: ffff88063b5c6000 ti: ffff8806333ca000 task.ti: ffff8806333ca000
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.670661] RIP: 0010:[<ffffffff816df92f>]  [<ffffffff816df92f>] net_generic.isra.34.part.35+0x4/0x6
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.681738] RSP: 0018:ffff8806333cbb80  EFLAGS: 00010246
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.692536] RAX: 0000000000000000 RBX: 00000000ffffffed RCX: 0000000000000010
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.703577] RDX: ffff88063d03d380 RSI: 0000000000000010 RDI: ffffffff81cfd9f0
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.714612] RBP: ffff8806333cbb80 R08: 0000000000000000 R09: ffffffff81cfd9f0
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.725531] R10: 00000000000002cc R11: 0000000000000004 R12: 0000000000000000
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.736448] R13: ffff880639118000 R14: ffff8806333cbc68 R15: 0000000000000000
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.747292] FS:  00007f6381830700(0000) GS:ffff880647600000(0000) knlGS:0000000000000000
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.758248] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.769263] CR2: 00007f637c04b000 CR3: 0000000c3aa1f000 CR4: 00000000000407f0
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.780402] Stack:
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.791386]  ffff8806333cbbc0 ffffffff814d0865 ffff8806333cbc40 00000000ffffffef
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.802702]  00000000ffffffed ffffffff81cc67d0 0000000000000010 ffff8806333cbc68
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.814021]  ffff8806333cbc00 ffffffff816e9e5d 0000000000000004 ffff8806333cbc68
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.825185] Call Trace:
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.836106]  [<ffffffff814d0865>] vxlan_lowerdev_event+0xf5/0x100
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.847254]  [<ffffffff816e9e5d>] notifier_call_chain+0x4d/0x70
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.858457]  [<ffffffff810912be>] __raw_notifier_call_chain+0xe/0x10
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.869696]  [<ffffffff810912d6>] raw_notifier_call_chain+0x16/0x20
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.880896]  [<ffffffff815d9610>] call_netdevice_notifiers_info+0x40/0x70
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.892063]  [<ffffffff815d9656>] call_netdevice_notifiers+0x16/0x20
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.903107]  [<ffffffff815e1bce>] register_netdevice+0x1be/0x3a0
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.914128]  [<ffffffff815e1dce>] register_netdev+0x1e/0x30
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.925072]  [<ffffffff814cb94a>] loopback_net_init+0x4a/0xb0
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.936048]  [<ffffffffa016ed6e>] ? lockd_init_net+0x6e/0xb0 [lockd]
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.947081]  [<ffffffff815d6bac>] ops_init+0x4c/0x150
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.958070]  [<ffffffff815d6d23>] setup_net+0x73/0x110
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.969006]  [<ffffffff815d725b>] copy_net_ns+0x7b/0x100
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.979897]  [<ffffffff81090e11>] create_new_namespaces+0x101/0x1b0
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.990855]  [<ffffffff81090f45>] copy_namespaces+0x85/0xb0
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.001656]  [<ffffffff810693d5>] copy_process.part.26+0x935/0x1500
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.012370]  [<ffffffff811d5186>] ? mntput+0x26/0x40
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.022924]  [<ffffffff8106a15c>] do_fork+0xbc/0x2e0
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.033331]  [<ffffffff811b7f2e>] ? ____fput+0xe/0x10
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.043622]  [<ffffffff81089c5c>] ? task_work_run+0xac/0xe0
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.053905]  [<ffffffff8106a406>] SyS_clone+0x16/0x20
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.064265]  [<ffffffff816ee689>] stub_clone+0x69/0x90
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.074600]  [<ffffffff816ee329>] ? system_call_fastpath+0x16/0x1b
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.084879] Code: 00 75 1d 55 be 2f 00 00 00 48 c7 c7 65 93 a2 81 48 89 e5 e8 f4 b5 98 ff 5d c6 05 30 aa 5f 00 01 c3 55 48 89 e5 0f 0b 55 48 89 e5<0f>  0b 55 48 89 e5 0f 0b 66 66 66 66 90 55 48 c7 c7 c0 4c cb 81
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.105818] RIP  [<ffffffff816df92f>] net_generic.isra.34.part.35+0x4/0x6
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.116106]  RSP<ffff8806333cbb80>
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.172366] ---[ end trace 0bb84cf9aa76a384 ]---
> > Jan 16 13:46:47 jbrandeb-cp2 systemd[1]: Startup finished in 4s 918ms 164us (kernel) + 3s 548ms 460us (initrd) + 11s 2ms 474us (userspace) = 19s 469ms 98us.
> > Jan 16 13:46:47 jbrandeb-cp2 dbus-daemon[989]: dbus[989]: [system] Activating via systemd: service name='org.freedesktop.Accounts' unit='accounts-daemon.service'
> >
> > code says:
> > (gdb) l *(vxlan_lowerdev_event+0xf5)
> > 0xffffffff814d0865 is at include/net/netns/generic.h:41.
> > 34      static inline void *net_generic(const struct net *net, int id)
> > 35      {
> > 36              struct net_generic *ng;
> > 37              void *ptr;
> > 38
> > 39              rcu_read_lock();
> > 40              ng = rcu_dereference(net->gen);
> > 41              BUG_ON(id == 0 || id>  ng->len);
> > 42              ptr = ng->ptr[id - 1];
> > 43              rcu_read_unlock();
> > 44
> >>>>> 45              BUG_ON(!ptr);
> > 46              return ptr;
> > 47      }
> > 48      #endif
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe netdev" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> >
> 

^ permalink raw reply

* [PATCH v6 net-next 1/2] ixgbe: define IXGBE_MAX_VFS_DRV_LIMIT macro and cleanup const 63
From: Aaron Brown @ 2014-01-17  3:41 UTC (permalink / raw)
  To: davem; +Cc: ethan.zhao, netdev, gospo, sassmann, ethan.kernel, Aaron Brown
In-Reply-To: <1389930065-3330-1-git-send-email-aaron.f.brown@intel.com>

From: "ethan.zhao" <ethan.zhao@oracle.com>

Because ixgbe driver limit the max number of VF
 functions could be enabled to 63, so define one macro IXGBE_MAX_VFS_DRV_LIMIT
 and cleanup the const 63 in code.

v3: revised for net-next tree.

Signed-off-by: Ethan Zhao <ethan.kernel@gmail.com>
Tested-by: Phil Schmitt <phillip.j.schmitt@intel.com>
Signed-off-by: Aaron Brown <aaron.f.brown@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c  | 4 ++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c | 4 ++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h | 5 +++++
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index b445ad1..3fd4d3f 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -5067,7 +5067,7 @@ static int ixgbe_sw_init(struct ixgbe_adapter *adapter)
 
 	/* assign number of SR-IOV VFs */
 	if (hw->mac.type != ixgbe_mac_82598EB) {
-		if (max_vfs > 63) {
+		if (max_vfs > IXGBE_MAX_VFS_DRV_LIMIT) {
 			adapter->num_vfs = 0;
 			e_dev_warn("max_vfs parameter out of range. Not assigning any SR-IOV VFs\n");
 		} else {
@@ -8020,7 +8020,7 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	ixgbe_init_mbx_params_pf(hw);
 	memcpy(&hw->mbx.ops, ii->mbx_ops, sizeof(hw->mbx.ops));
 	ixgbe_enable_sriov(adapter);
-	pci_sriov_set_totalvfs(pdev, 63);
+	pci_sriov_set_totalvfs(pdev, IXGBE_MAX_VFS_DRV_LIMIT);
 skip_sriov:
 
 #endif
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
index 0558c71..dff0977 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
@@ -148,7 +148,7 @@ void ixgbe_enable_sriov(struct ixgbe_adapter *adapter)
 		 * physical function.  If the user requests greater thn
 		 * 63 VFs then it is an error - reset to default of zero.
 		 */
-		adapter->num_vfs = min_t(unsigned int, adapter->num_vfs, 63);
+		adapter->num_vfs = min_t(unsigned int, adapter->num_vfs, IXGBE_MAX_VFS_DRV_LIMIT);
 
 		err = pci_enable_sriov(adapter->pdev, adapter->num_vfs);
 		if (err) {
@@ -257,7 +257,7 @@ static int ixgbe_pci_sriov_enable(struct pci_dev *dev, int num_vfs)
 	 * PF.  The PCI bus driver already checks for other values out of
 	 * range.
 	 */
-	if (num_vfs > 63) {
+	if (num_vfs > IXGBE_MAX_VFS_DRV_LIMIT) {
 		err = -EPERM;
 		goto err_out;
 	}
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h
index 4713f9f..8bd2919 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h
@@ -28,6 +28,11 @@
 #ifndef _IXGBE_SRIOV_H_
 #define _IXGBE_SRIOV_H_
 
+/*  ixgbe driver limit the max number of VFs could be enabled to
+ *  63 (IXGBE_MAX_VF_FUNCTIONS - 1)
+ */
+#define IXGBE_MAX_VFS_DRV_LIMIT  (IXGBE_MAX_VF_FUNCTIONS - 1)
+
 void ixgbe_restore_vf_multicasts(struct ixgbe_adapter *adapter);
 void ixgbe_msg_task(struct ixgbe_adapter *adapter);
 int ixgbe_vf_configuration(struct pci_dev *pdev, unsigned int event_mask);
-- 
1.8.5.GIT

^ permalink raw reply related

* [Patch v6 net-next 0/2] Intel Wired LAN Driver Updates
From: Aaron Brown @ 2014-01-17  3:41 UTC (permalink / raw)
  To: davem; +Cc: Aaron Brown, netdev, gospo, sassmann, ethan.kernel

This series contains updates to ixgbe Ethan Zhao.  The first one replaces
the magic number "63" with a macro, IXGBE_MAX_VFS_DRV_LIMIT, the second 
moves the call to set driver_max_VFS to before SRIOV is enabled.

The code of these patches match the v3 (1/2) and v2 (2/2) versions sent
to the e1000-devel and netdev mailing lists.  The intermediate versions
(v4, v5) are from sorting out style issues, mostly tabs to spaces and
split lines probably introduced via mailer.

ethan.zhao (2):
  1/2 ixgbe: define IXGBE_MAX_VFS_DRV_LIMIT macro and cleanup const 63
  2/2 ixgbe: set driver_max_VFs should be done before enabling SRIOV  

 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c  | 4 ++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c | 4 ++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h | 5 +++++
 3 files changed, 9 insertions(+), 4 deletions(-)

-- 
1.8.5.GIT

^ permalink raw reply

* [Patch 2/2 net-next v6] ixgbe: set driver_max_VFs should be done before enabling SRIOV
From: Aaron Brown @ 2014-01-17  3:41 UTC (permalink / raw)
  To: davem; +Cc: ethan.zhao, netdev, gospo, sassmann, ethan.kernel, Aaron Brown
In-Reply-To: <1389930065-3330-1-git-send-email-aaron.f.brown@intel.com>

From: "ethan.zhao" <ethan.zhao@oracle.com>

commit 43dc4e01 Limit number of reported VFs to device
 specific value It doesn't work and always returns -EBUSY because VFs are
 already enabled.

ixgbe_enable_sriov()
        pci_enable_sriov()
                sriov_enable()
                {
                ... ..
                iov->ctrl |= PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE;
                pci_cfg_access_lock(dev);
                ... ...
                }

pci_sriov_set_totalvfs()
{
... ...
if (dev->sriov->ctrl & PCI_SRIOV_CTRL_VFE)
                return -EBUSY;
...
}

So should set driver_max_VFs with pci_sriov_set_totalvfs() before
enable VFs with ixgbe_enable_sriov().

V2: revised for net-next tree.

Signed-off-by: Ethan Zhao <ethan.kernel@gmail.com>
Tested-by: Phil Schmitt <phillip.j.schmitt@intel.com>
Signed-off-by: Aaron Brown <aaron.f.brown@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 3fd4d3f..61d985c 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -8019,8 +8019,8 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	/* Mailbox */
 	ixgbe_init_mbx_params_pf(hw);
 	memcpy(&hw->mbx.ops, ii->mbx_ops, sizeof(hw->mbx.ops));
-	ixgbe_enable_sriov(adapter);
 	pci_sriov_set_totalvfs(pdev, IXGBE_MAX_VFS_DRV_LIMIT);
+	ixgbe_enable_sriov(adapter);
 skip_sriov:
 
 #endif
-- 
1.8.5.GIT

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox