Netdev List
 help / color / mirror / Atom feed
* Re: [patch net-next v9 2/3] net: core: Add offload stats to if_stats_msg
From: Nikolay Aleksandrov @ 2016-09-16  8:00 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Linux Kernel Network Developers, David S. Miller, nogahf,
	Ido Schimmel, eladr, yotamg, ogerlitz, Roopa Prabhu, linville,
	tgraf, Andy Gospodarek, sfeldma, sd, eranbe, ast, edumazet,
	hannes, f.fainelli, dsa
In-Reply-To: <1473845322-16679-3-git-send-email-jiri@resnulli.us>


> On Sep 14, 2016, at 12:28 PM, Jiri Pirko <jiri@resnulli.us> wrote:
> 
> From: Nogah Frankel <nogahf@mellanox.com>
> 
> Add a nested attribute of offload stats to if_stats_msg
> named IFLA_STATS_LINK_OFFLOAD_XSTATS.
> Under it, add SW stats, meaning stats only per packets that went via
> slowpath to the cpu, named IFLA_OFFLOAD_XSTATS_CPU_HIT.
> 
> Signed-off-by: Nogah Frankel <nogahf@mellanox.com>
> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
> ---
> include/uapi/linux/if_link.h |   9 ++++
> net/core/rtnetlink.c         | 111 +++++++++++++++++++++++++++++++++++++++++--
> 2 files changed, 116 insertions(+), 4 deletions(-)
> 
[snip]
> @@ -3655,6 +3737,24 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
> 		}
> 	}
> 
> +	if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS,
> +			     *idxattr)) {
> +		*idxattr = IFLA_STATS_LINK_OFFLOAD_XSTATS;
> +		attr = nla_nest_start(skb, IFLA_STATS_LINK_OFFLOAD_XSTATS);
> +		if (!attr)
> +			goto nla_put_failure;
> +
> +		err = rtnl_get_offload_stats(skb, dev, prividx);
> +		if (err == -ENODATA)
> +			nla_nest_cancel(skb, attr);
> +		else
> +			nla_nest_end(skb, attr);
> +
> +		if ((err) && (err != -ENODATA))
                       ^^^^       ^^^^^^^^^^^^^^^^^^
The extra braces are still there. [1]
The rest looks good to me.

Thanks,
 Nik

[1] http://www.spinics.net/lists/netdev/msg394257.html

> +			goto nla_put_failure;
> +		*idxattr = 0;
> +	}
> +
> 	nlmsg_end(skb, nlh);
> 
> 	return 0;
> @@ -3708,6 +3808,9 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev,
> 		}
> 	}
> 
> +	if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0))
> +		size += rtnl_get_offload_stats_size(dev);
> +
> 	return size;
> }
> 
> -- 
> 2.5.5
> 

^ permalink raw reply

* Re: [PATCH net-next 0/5] mlx4 misc fixes and improvements
From: Or Gerlitz @ 2016-09-16  7:46 UTC (permalink / raw)
  To: David Miller; +Cc: Tariq Toukan, Linux Netdev List, Eran Ben Elisha
In-Reply-To: <20160915.192106.739428651744920778.davem@davemloft.net>

On Fri, Sep 16, 2016 at 2:21 AM, David Miller <davem@davemloft.net> wrote:
> From: Tariq Toukan <tariqt@mellanox.com>
> Date: Mon, 12 Sep 2016 16:20:11 +0300
>
>> This patchset contains some bug fixes, a cleanup, and small improvements
>> from the team to the mlx4 Eth and core drivers.
>>
>> Series generated against net-next commit:
>> 02154927c115 "net: dsa: bcm_sf2: Get VLAN_PORT_MASK from b53_device"
>>
>> Please push the following patch to -stable  >= 4.6 as well:
>> "net/mlx4_core: Fix to clean devlink resources"
>
> Again, coding style fixes and optimizations like branch prediction
> hints are not bug fixes and therefore not appropriate for 'net'.

Hi Dave,

He sent it to net-next, not net... ok?

Or.

^ permalink raw reply

* Re: [PATCH net-next 3/4] samples/bpf: extend test_tunnel_bpf.sh with IPIP test
From: Daniel Borkmann @ 2016-09-16  7:22 UTC (permalink / raw)
  To: William Tu, Alexei Starovoitov
  Cc: David S . Miller, Thomas Graf, Linux Kernel Network Developers,
	kernel-team
In-Reply-To: <CALDO+SYktwsw-uCQRdSvU4U2jP3L1YuOUU4VThaCGDuqKWFLfA@mail.gmail.com>

Hi William,

On 09/16/2016 07:16 AM, William Tu wrote:
> Hi Alexei,
>
> Is there a corresponding patch for iproute2? I tested this patch but fails at:
> + ip link add dev ipip11 type ipip external
> because my ip command does not support "external".

Yes, like any other collect metadata backends you need a small patch
to iproute2 that sets in this case IFLA_IPTUN_COLLECT_METADATA flag
via conventional "external" keyword. Will be posted at latest on Monday
(Alexei mentioned he's pto today).

Cheers,
Daniel

> Thanks
> William
>
>
> On Thu, Sep 15, 2016 at 1:00 PM, Alexei Starovoitov <ast@fb.com> wrote:
>> extend existing tests for vxlan, geneve, gre to include IPIP tunnel.
>> It tests both traditional tunnel configuration and
>> dynamic via bpf helpers.
>>
>> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
>> ---
>>   samples/bpf/tcbpf2_kern.c      | 58 ++++++++++++++++++++++++++++++++++++++++++
>>   samples/bpf/test_tunnel_bpf.sh | 56 ++++++++++++++++++++++++++++++++++------
>>   2 files changed, 106 insertions(+), 8 deletions(-)
>>
>> diff --git a/samples/bpf/tcbpf2_kern.c b/samples/bpf/tcbpf2_kern.c
>> index 7a15289da6cc..c1917d968fb4 100644
>> --- a/samples/bpf/tcbpf2_kern.c
>> +++ b/samples/bpf/tcbpf2_kern.c
>> @@ -1,4 +1,5 @@
>>   /* Copyright (c) 2016 VMware
>> + * Copyright (c) 2016 Facebook
>>    *
>>    * This program is free software; you can redistribute it and/or
>>    * modify it under the terms of version 2 of the GNU General Public
>> @@ -188,4 +189,61 @@ int _geneve_get_tunnel(struct __sk_buff *skb)
>>          return TC_ACT_OK;
>>   }
>>
>> +SEC("ipip_set_tunnel")
>> +int _ipip_set_tunnel(struct __sk_buff *skb)
>> +{
>> +       struct bpf_tunnel_key key = {};
>> +       void *data = (void *)(long)skb->data;
>> +       struct iphdr *iph = data;
>> +       struct tcphdr *tcp = data + sizeof(*iph);
>> +       void *data_end = (void *)(long)skb->data_end;
>> +       int ret;
>> +
>> +       /* single length check */
>> +       if (data + sizeof(*iph) + sizeof(*tcp) > data_end) {
>> +               ERROR(1);
>> +               return TC_ACT_SHOT;
>> +       }
>> +
>> +       key.tunnel_ttl = 64;
>> +       if (iph->protocol == IPPROTO_ICMP) {
>> +               key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */
>> +       } else {
>> +               if (iph->protocol != IPPROTO_TCP || iph->ihl != 5)
>> +                       return TC_ACT_SHOT;
>> +
>> +               if (tcp->dest == htons(5200))
>> +                       key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */
>> +               else if (tcp->dest == htons(5201))
>> +                       key.remote_ipv4 = 0xac100165; /* 172.16.1.101 */
>> +               else
>> +                       return TC_ACT_SHOT;
>> +       }
>> +
>> +       ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0);
>> +       if (ret < 0) {
>> +               ERROR(ret);
>> +               return TC_ACT_SHOT;
>> +       }
>> +
>> +       return TC_ACT_OK;
>> +}
>> +
>> +SEC("ipip_get_tunnel")
>> +int _ipip_get_tunnel(struct __sk_buff *skb)
>> +{
>> +       int ret;
>> +       struct bpf_tunnel_key key;
>> +       char fmt[] = "remote ip 0x%x\n";
>> +
>> +       ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
>> +       if (ret < 0) {
>> +               ERROR(ret);
>> +               return TC_ACT_SHOT;
>> +       }
>> +
>> +       bpf_trace_printk(fmt, sizeof(fmt), key.remote_ipv4);
>> +       return TC_ACT_OK;
>> +}
>> +
>>   char _license[] SEC("license") = "GPL";
>> diff --git a/samples/bpf/test_tunnel_bpf.sh b/samples/bpf/test_tunnel_bpf.sh
>> index 4956589a83ae..1ff634f187b7 100755
>> --- a/samples/bpf/test_tunnel_bpf.sh
>> +++ b/samples/bpf/test_tunnel_bpf.sh
>> @@ -9,15 +9,13 @@
>>   # local 172.16.1.200 remote 172.16.1.100
>>   # veth1 IP: 172.16.1.200, tunnel dev <type>11
>>
>> -set -e
>> -
>>   function config_device {
>>          ip netns add at_ns0
>>          ip link add veth0 type veth peer name veth1
>>          ip link set veth0 netns at_ns0
>>          ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0
>>          ip netns exec at_ns0 ip link set dev veth0 up
>> -       ip link set dev veth1 up
>> +       ip link set dev veth1 up mtu 1500
>>          ip addr add dev veth1 172.16.1.200/24
>>   }
>>
>> @@ -67,6 +65,19 @@ function add_geneve_tunnel {
>>          ip addr add dev $DEV 10.1.1.200/24
>>   }
>>
>> +function add_ipip_tunnel {
>> +       # in namespace
>> +       ip netns exec at_ns0 \
>> +               ip link add dev $DEV_NS type $TYPE local 172.16.1.100 remote 172.16.1.200
>> +       ip netns exec at_ns0 ip link set dev $DEV_NS up
>> +       ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
>> +
>> +       # out of namespace
>> +       ip link add dev $DEV type $TYPE external
>> +       ip link set dev $DEV up
>> +       ip addr add dev $DEV 10.1.1.200/24
>> +}
>> +
>>   function attach_bpf {
>>          DEV=$1
>>          SET_TUNNEL=$2
>> @@ -85,6 +96,7 @@ function test_gre {
>>          attach_bpf $DEV gre_set_tunnel gre_get_tunnel
>>          ping -c 1 10.1.1.100
>>          ip netns exec at_ns0 ping -c 1 10.1.1.200
>> +       cleanup
>>   }
>>
>>   function test_vxlan {
>> @@ -96,6 +108,7 @@ function test_vxlan {
>>          attach_bpf $DEV vxlan_set_tunnel vxlan_get_tunnel
>>          ping -c 1 10.1.1.100
>>          ip netns exec at_ns0 ping -c 1 10.1.1.200
>> +       cleanup
>>   }
>>
>>   function test_geneve {
>> @@ -107,21 +120,48 @@ function test_geneve {
>>          attach_bpf $DEV geneve_set_tunnel geneve_get_tunnel
>>          ping -c 1 10.1.1.100
>>          ip netns exec at_ns0 ping -c 1 10.1.1.200
>> +       cleanup
>> +}
>> +
>> +function test_ipip {
>> +       TYPE=ipip
>> +       DEV_NS=ipip00
>> +       DEV=ipip11
>> +       config_device
>> +       tcpdump -nei veth1 &
>> +       cat /sys/kernel/debug/tracing/trace_pipe &
>> +       add_ipip_tunnel
>> +       ethtool -K veth1 gso off gro off rx off tx off
>> +       ip link set dev veth1 mtu 1500
>> +       attach_bpf $DEV ipip_set_tunnel ipip_get_tunnel
>> +       ping -c 1 10.1.1.100
>> +       ip netns exec at_ns0 ping -c 1 10.1.1.200
>> +       ip netns exec at_ns0 iperf -sD -p 5200 > /dev/null
>> +       sleep 0.2
>> +       iperf -c 10.1.1.100 -n 5k -p 5200
>> +       cleanup
>>   }
>>
>>   function cleanup {
>> +       set +ex
>> +       pkill iperf
>>          ip netns delete at_ns0
>>          ip link del veth1
>> -       ip link del $DEV
>> +       ip link del ipip11
>> +       ip link del gretap11
>> +       ip link del geneve11
>> +       pkill tcpdump
>> +       pkill cat
>> +       set -ex
>>   }
>>
>> +cleanup
>>   echo "Testing GRE tunnel..."
>>   test_gre
>> -cleanup
>>   echo "Testing VXLAN tunnel..."
>>   test_vxlan
>> -cleanup
>>   echo "Testing GENEVE tunnel..."
>>   test_geneve
>> -cleanup
>> -echo "Success"
>> +echo "Testing IPIP tunnel..."
>> +test_ipip
>> +echo "*** PASS ***"
>> --
>> 2.8.0
>>

^ permalink raw reply

* [PATCH v2] iproute2: build nsid-name cache only for commands that need it
From: Anton Aksola @ 2016-09-16  7:22 UTC (permalink / raw)
  To: netdev; +Cc: nicolas.dichtel

The calling of netns_map_init() before command parsing introduced
a performance issue with large number of namespaces.

As commands such as add, del and exec do not need to iterate through
/var/run/netns it would be good not to build the cache before executing
these commands.

Example:
unpatched:
time seq 1 1000 | xargs -n 1 ip netns add

real    0m16.832s
user    0m1.350s
sys    0m15.029s

patched:
time seq 1 1000 | xargs -n 1 ip netns add

real    0m3.859s
user    0m0.132s
sys    0m3.205s

Signed-off-by: Anton Aksola <aakso@iki.fi>
---
 ip/ipnetns.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/ip/ipnetns.c b/ip/ipnetns.c
index af87065..564d034 100644
--- a/ip/ipnetns.c
+++ b/ip/ipnetns.c
@@ -775,17 +775,21 @@ static int netns_monitor(int argc, char **argv)
 
 int do_netns(int argc, char **argv)
 {
-	netns_map_init();
-
-	if (argc < 1)
+	if (argc < 1) {
+		netns_map_init();
 		return netns_list(0, NULL);
+	}
 
 	if ((matches(*argv, "list") == 0) || (matches(*argv, "show") == 0) ||
-	    (matches(*argv, "lst") == 0))
+	    (matches(*argv, "lst") == 0)) {
+		netns_map_init();
 		return netns_list(argc-1, argv+1);
+	}
 
-	if ((matches(*argv, "list-id") == 0))
+	if ((matches(*argv, "list-id") == 0)) {
+		netns_map_init();
 		return netns_list_id(argc-1, argv+1);
+	}
 
 	if (matches(*argv, "help") == 0)
 		return usage();
-- 
1.8.3.1

^ permalink raw reply related

* Re: [PATCH v3] net: ip, diag -- Add diag interface for raw sockets
From: Cyrill Gorcunov @ 2016-09-16  7:06 UTC (permalink / raw)
  To: David Ahern, Eric Dumazet
  Cc: netdev, linux-kernel, David Miller, kuznet, jmorris, yoshfuji,
	kaber, avagin, stephen
In-Reply-To: <999f0ddb-82e4-ea07-b52a-59d08bc7816d@cumulusnetworks.com>

On Thu, Sep 15, 2016 at 05:45:02PM -0600, David Ahern wrote:
> > 
> > Try to be selective in the -K , do not kill tcp sockets ?
> 
> I am running
>    ss -aKw 'dev == red'
> 
> to kill raw sockets bound to device named 'red'.

Thanks David, Eric! I'll play with this option today and report the results.

^ permalink raw reply

* Re: cdc_ncm driver padding problem
From: Bjørn Mork @ 2016-09-16  6:33 UTC (permalink / raw)
  To: Marek Brudka
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA, linux-usb-u79uwXL29TY76Z2rM5mHXA,
	Enrico Mioso
In-Reply-To: <alpine.LNX.2.20.1609160141270.19559-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>

> ==From: Marek Brudka <mbrudka-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
..
> ==The bad thing is DHCP. It seems, that cdc_ncm driver somehow consumes DHCP
> ==replies. I had to manually setup wwan0 interface as well as routing
> ==using the result
> ==of Hayes command
> ==
> ==AT^DHCP?
> ==^DHCP:
> ==EC684764,F8FFFFFF,E9684764,E9684764,356002D4,366002D4,43200000,43200000
> ==OK


Are we sure that this firmware supports DHCP?  It's not uncommon for
modem firmwares to lack such support, and I find this a much more likely
explanation than the driver somehow messing up the DHCP replies while
letting other packets through.

Unfortunately there is no easy way to tell for sure if a specific
feature is enabled or not in a specific firmware.  Is it possible to get
a snoop from Windows while connecting, using the modem in the same mode?
That would tell us whether the Windows software depends on DHCP or is
using the AT^DHCP command.



Bjørn
--
To unsubscribe from this list: send the line "unsubscribe linux-usb" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] iproute2: build nsid-name cache only for commands that need it
From: Anton Aksola @ 2016-09-16  6:27 UTC (permalink / raw)
  To: Nicolas Dichtel; +Cc: netdev
In-Reply-To: <2ab25704-d8aa-1532-a07a-c1765b937d66@6wind.com>

On Thu, Sep 15, 2016 at 03:26:18PM +0200, Nicolas Dichtel wrote:
[snip]
> 'ip netns' (ip netns list) also need it.

Thanks, I missed your other commit that introduced it. Sending an updated patch.

^ permalink raw reply

* Re: [PATCH net-next v2 0/7] add enhancement into the existing reset flow
From: David Miller @ 2016-09-16  6:23 UTC (permalink / raw)
  To: sean.wang-NuS5LvNUpcJWk0Htik3J/w
  Cc: nbd-p3rKhJxN3npAfugRpC6u6w, keyhaede-Re5JQEeQqe8AvxtiuMwx3w,
	netdev-u79uwXL29TY76Z2rM5mHXA, john-Pj+rj9U5foFAfugRpC6u6w,
	linux-mediatek-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	objelf-Re5JQEeQqe8AvxtiuMwx3w
In-Reply-To: <1473866001-9805-1-git-send-email-sean.wang-NuS5LvNUpcJWk0Htik3J/w@public.gmane.org>

From: <sean.wang-NuS5LvNUpcJWk0Htik3J/w@public.gmane.org>
Date: Wed, 14 Sep 2016 23:13:14 +0800

> From: Sean Wang <sean.wang-NuS5LvNUpcJWk0Htik3J/w@public.gmane.org>
> 
> Current driver only resets DMA used by descriptor rings which
> can't guarantee it can recover all various kinds of fatal
> errors, so the patch
> 1) tries to reset the underlying hardware resource from scratch on
> Mediatek SoC required for ethernet running.
> 2) refactors code in order to the reusability of existing code.
> 3) considers handling for race condition between the reset flow and
> callbacks registered into core driver called about hardware accessing.
> 4) introduces power domain usage to hardware setup which leads to have
> cleanly and completely restore to the state as the initial.
> 
> Changes since v1:
> - fix the build error with module built causing undefined symbol for
>   pinctrl_bind_pins, so using pinctrl_select_state instead accomplishes
>   the pin mux setup during the reset process.

Series applied, thanks.

^ permalink raw reply

* Re: MDB offloading of local ipv4 multicast groups
From: John Crispin @ 2016-09-16  6:22 UTC (permalink / raw)
  To: Ido Schimmel
  Cc: Elad Raz, netdev@vger.kernel.org, Ido Schimmel, Jiri Pirko,
	Nikolay Aleksandrov, David S. Miller
In-Reply-To: <20160915204218.GA21035@splinter>



On 15/09/2016 22:42, Ido Schimmel wrote:
> On Thu, Sep 15, 2016 at 08:58:50PM +0200, John Crispin wrote:
>> Hi,
>>
>> While adding MDB support to the qca8k dsa driver I found that ipv4 mcast
>> groups don't always get propagated to the dsa driver. In my setup there
>> are 2 clients connected to the switch, both running a mdns client. The
>> .port_mdb_add() callback is properly called for 33:33:00:00:00:FB but
>> 01:00:5E:00:00:FB never got propagated to the dsa driver.
>>
>> The reason is that the call to ipv4_is_local_multicast() here [1] will
>> return true and the notifier is never called. Is this intentional or is
>> there something missing in the code ?
> 
> I believe this is based on RFC 4541:
> 
> "Packets with a destination IP (DIP) address in the 224.0.0.X range
> which are not IGMP must be forwarded on all ports."
> https://tools.ietf.org/html/rfc4541
> 
> But, we are missing the offloading of router ports, which is needed for
> the device to correctly flood unregistered multicast packets. That's
> also according to the mentioned RFC:
> 
> "If a switch receives an unregistered packet, it must forward that
> packet on all ports to which an IGMP router is attached."
> 
> Implemented at br_flood_multicast()
> 
> However, the marking is done per-port and not per-{port, VID}. We need
> that in case vlan filtering is enabled. I think Nik is working on that,
> but he can correct me if I'm wrong :). The switchdev bits can be added
> soon after.
> 

thanks for the explanation. i was not aware that the local groups should
always be flooded to all ports.

	John

^ permalink raw reply

* Re: [PATCH net-next 0/4] rxrpc: Support IPv6
From: David Miller @ 2016-09-16  5:57 UTC (permalink / raw)
  To: dhowells; +Cc: netdev, linux-afs, linux-kernel
In-Reply-To: <147380649153.30728.6717292274642860064.stgit@warthog.procyon.org.uk>

From: David Howells <dhowells@redhat.com>
Date: Tue, 13 Sep 2016 23:41:31 +0100

> 
> Here is a set of patches that add IPv6 support.  They need to be applied on
> top of the just-posted miscellaneous fix patches.  They are:
> 
>  (1) Make autobinding of an unconnected socket work when sendmsg() is
>      called to initiate a client call.
> 
>  (2) Don't specify the protocol when creating the client socket, but rather
>      take the default instead.
> 
>  (3) Use rxrpc_extract_addr_from_skb() in a couple of places that were
>      doing the same thing manually.  This allows the IPv6 address
>      extraction to be done in fewer places.
> 
>  (4) Add IPv6 support.  With this, calls can be made to IPv6 servers from
>      userspace AF_RXRPC programs; AFS, however, can't use IPv6 yet as the
>      RPC calls need to be upgradeable.
 ...
> Tagged thusly:
> 
> 	git://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs.git
> 	rxrpc-rewrite-20160913-2

Looks good, pulled, thanks.

^ permalink raw reply

* Re: [PATCH net-next 00/10] rxrpc: Miscellaneous fixes
From: David Miller @ 2016-09-16  5:55 UTC (permalink / raw)
  To: dhowells; +Cc: netdev, linux-afs, linux-kernel
In-Reply-To: <147380525614.23135.7539695651371953351.stgit@warthog.procyon.org.uk>

From: David Howells <dhowells@redhat.com>
Date: Tue, 13 Sep 2016 23:20:56 +0100

> 
> Here's a set of miscellaneous fix patches.  There are a couple of points of
> note:
> 
>  (1) There is one non-fix patch that adjusts the call ref tracking
>      tracepoint to make kernel API-held refs on calls more obvious.  This
>      is a prerequisite for the patch that fixes prealloc refcounting.
> 
>  (2) The final patch alters how jumbo packets that partially exceed the
>      receive window are handled.  Previously, space was being left in the
>      Rx buffer for them, but this significantly hurts performance as the Rx
>      window can't be increased to match the OpenAFS Tx window size.
> 
>      Instead, the excess subpackets are discarded and an EXCEEDS_WINDOW ACK
>      is generated for the first.  To avoid the problem of someone trying to
>      run the kernel out of space by feeding the kernel a series of
>      overlapping maximal jumbo packets, we stop allowing jumbo packets on a
>      call if we encounter more than three jumbo packets with duplicate or
>      excessive subpackets.
 ...
> Tagged thusly:
> 
> 	git://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs.git
> 	rxrpc-rewrite-20160913-1

Pulled, thanks David.

^ permalink raw reply

* Re: [PATCH] MAINTAINERS: Remove myself from PA Semi entries
From: David Miller @ 2016-09-16  5:50 UTC (permalink / raw)
  To: mpe; +Cc: olof, netdev, linux-i2c, jdelvare
In-Reply-To: <87k2eeoo30.fsf@concordia.ellerman.id.au>

From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 14 Sep 2016 18:57:55 +1000

> Olof Johansson <olof@lixom.net> writes:
> 
>> Jean, Dave,
>>
>> I was hoping to have Michael merge this since the bulk of the platform is under him,
>> cc:ing you mostly to be aware that I am orphaning a driver in your subsystems.
> 
> I'll merge it unless I hear otherwise from Dave.

Feel free to merge this.

Thanks.

^ permalink raw reply

* Re: pull-request: mac80211 2016-09-13
From: David Miller @ 2016-09-16  5:34 UTC (permalink / raw)
  To: johannes-cdvu00un1VgdHxzADdlk8Q
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1473797004-24158-1-git-send-email-johannes-cdvu00un1VgdHxzADdlk8Q@public.gmane.org>

From: Johannes Berg <johannes-cdvu00un1VgdHxzADdlk8Q@public.gmane.org>
Date: Tue, 13 Sep 2016 22:03:23 +0200

> We found a few more issues, I'm sending you small fixes here. The diffstat
> would be even shorter, but one of Felix's patches has to move about 30 lines
> of code, which makes it seem much bigger than it really is.
> 
> Let me know if there's any problem.

Pulled, thanks Johannes.

^ permalink raw reply

* Re: [PATCH net-next 3/4] samples/bpf: extend test_tunnel_bpf.sh with IPIP test
From: William Tu @ 2016-09-16  5:16 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: David S . Miller, Daniel Borkmann, Thomas Graf,
	Linux Kernel Network Developers, kernel-team
In-Reply-To: <1473969632-2408261-4-git-send-email-ast@fb.com>

Hi Alexei,

Is there a corresponding patch for iproute2? I tested this patch but fails at:
+ ip link add dev ipip11 type ipip external
because my ip command does not support "external".

Thanks
William


On Thu, Sep 15, 2016 at 1:00 PM, Alexei Starovoitov <ast@fb.com> wrote:
> extend existing tests for vxlan, geneve, gre to include IPIP tunnel.
> It tests both traditional tunnel configuration and
> dynamic via bpf helpers.
>
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> ---
>  samples/bpf/tcbpf2_kern.c      | 58 ++++++++++++++++++++++++++++++++++++++++++
>  samples/bpf/test_tunnel_bpf.sh | 56 ++++++++++++++++++++++++++++++++++------
>  2 files changed, 106 insertions(+), 8 deletions(-)
>
> diff --git a/samples/bpf/tcbpf2_kern.c b/samples/bpf/tcbpf2_kern.c
> index 7a15289da6cc..c1917d968fb4 100644
> --- a/samples/bpf/tcbpf2_kern.c
> +++ b/samples/bpf/tcbpf2_kern.c
> @@ -1,4 +1,5 @@
>  /* Copyright (c) 2016 VMware
> + * Copyright (c) 2016 Facebook
>   *
>   * This program is free software; you can redistribute it and/or
>   * modify it under the terms of version 2 of the GNU General Public
> @@ -188,4 +189,61 @@ int _geneve_get_tunnel(struct __sk_buff *skb)
>         return TC_ACT_OK;
>  }
>
> +SEC("ipip_set_tunnel")
> +int _ipip_set_tunnel(struct __sk_buff *skb)
> +{
> +       struct bpf_tunnel_key key = {};
> +       void *data = (void *)(long)skb->data;
> +       struct iphdr *iph = data;
> +       struct tcphdr *tcp = data + sizeof(*iph);
> +       void *data_end = (void *)(long)skb->data_end;
> +       int ret;
> +
> +       /* single length check */
> +       if (data + sizeof(*iph) + sizeof(*tcp) > data_end) {
> +               ERROR(1);
> +               return TC_ACT_SHOT;
> +       }
> +
> +       key.tunnel_ttl = 64;
> +       if (iph->protocol == IPPROTO_ICMP) {
> +               key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */
> +       } else {
> +               if (iph->protocol != IPPROTO_TCP || iph->ihl != 5)
> +                       return TC_ACT_SHOT;
> +
> +               if (tcp->dest == htons(5200))
> +                       key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */
> +               else if (tcp->dest == htons(5201))
> +                       key.remote_ipv4 = 0xac100165; /* 172.16.1.101 */
> +               else
> +                       return TC_ACT_SHOT;
> +       }
> +
> +       ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0);
> +       if (ret < 0) {
> +               ERROR(ret);
> +               return TC_ACT_SHOT;
> +       }
> +
> +       return TC_ACT_OK;
> +}
> +
> +SEC("ipip_get_tunnel")
> +int _ipip_get_tunnel(struct __sk_buff *skb)
> +{
> +       int ret;
> +       struct bpf_tunnel_key key;
> +       char fmt[] = "remote ip 0x%x\n";
> +
> +       ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
> +       if (ret < 0) {
> +               ERROR(ret);
> +               return TC_ACT_SHOT;
> +       }
> +
> +       bpf_trace_printk(fmt, sizeof(fmt), key.remote_ipv4);
> +       return TC_ACT_OK;
> +}
> +
>  char _license[] SEC("license") = "GPL";
> diff --git a/samples/bpf/test_tunnel_bpf.sh b/samples/bpf/test_tunnel_bpf.sh
> index 4956589a83ae..1ff634f187b7 100755
> --- a/samples/bpf/test_tunnel_bpf.sh
> +++ b/samples/bpf/test_tunnel_bpf.sh
> @@ -9,15 +9,13 @@
>  # local 172.16.1.200 remote 172.16.1.100
>  # veth1 IP: 172.16.1.200, tunnel dev <type>11
>
> -set -e
> -
>  function config_device {
>         ip netns add at_ns0
>         ip link add veth0 type veth peer name veth1
>         ip link set veth0 netns at_ns0
>         ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0
>         ip netns exec at_ns0 ip link set dev veth0 up
> -       ip link set dev veth1 up
> +       ip link set dev veth1 up mtu 1500
>         ip addr add dev veth1 172.16.1.200/24
>  }
>
> @@ -67,6 +65,19 @@ function add_geneve_tunnel {
>         ip addr add dev $DEV 10.1.1.200/24
>  }
>
> +function add_ipip_tunnel {
> +       # in namespace
> +       ip netns exec at_ns0 \
> +               ip link add dev $DEV_NS type $TYPE local 172.16.1.100 remote 172.16.1.200
> +       ip netns exec at_ns0 ip link set dev $DEV_NS up
> +       ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
> +
> +       # out of namespace
> +       ip link add dev $DEV type $TYPE external
> +       ip link set dev $DEV up
> +       ip addr add dev $DEV 10.1.1.200/24
> +}
> +
>  function attach_bpf {
>         DEV=$1
>         SET_TUNNEL=$2
> @@ -85,6 +96,7 @@ function test_gre {
>         attach_bpf $DEV gre_set_tunnel gre_get_tunnel
>         ping -c 1 10.1.1.100
>         ip netns exec at_ns0 ping -c 1 10.1.1.200
> +       cleanup
>  }
>
>  function test_vxlan {
> @@ -96,6 +108,7 @@ function test_vxlan {
>         attach_bpf $DEV vxlan_set_tunnel vxlan_get_tunnel
>         ping -c 1 10.1.1.100
>         ip netns exec at_ns0 ping -c 1 10.1.1.200
> +       cleanup
>  }
>
>  function test_geneve {
> @@ -107,21 +120,48 @@ function test_geneve {
>         attach_bpf $DEV geneve_set_tunnel geneve_get_tunnel
>         ping -c 1 10.1.1.100
>         ip netns exec at_ns0 ping -c 1 10.1.1.200
> +       cleanup
> +}
> +
> +function test_ipip {
> +       TYPE=ipip
> +       DEV_NS=ipip00
> +       DEV=ipip11
> +       config_device
> +       tcpdump -nei veth1 &
> +       cat /sys/kernel/debug/tracing/trace_pipe &
> +       add_ipip_tunnel
> +       ethtool -K veth1 gso off gro off rx off tx off
> +       ip link set dev veth1 mtu 1500
> +       attach_bpf $DEV ipip_set_tunnel ipip_get_tunnel
> +       ping -c 1 10.1.1.100
> +       ip netns exec at_ns0 ping -c 1 10.1.1.200
> +       ip netns exec at_ns0 iperf -sD -p 5200 > /dev/null
> +       sleep 0.2
> +       iperf -c 10.1.1.100 -n 5k -p 5200
> +       cleanup
>  }
>
>  function cleanup {
> +       set +ex
> +       pkill iperf
>         ip netns delete at_ns0
>         ip link del veth1
> -       ip link del $DEV
> +       ip link del ipip11
> +       ip link del gretap11
> +       ip link del geneve11
> +       pkill tcpdump
> +       pkill cat
> +       set -ex
>  }
>
> +cleanup
>  echo "Testing GRE tunnel..."
>  test_gre
> -cleanup
>  echo "Testing VXLAN tunnel..."
>  test_vxlan
> -cleanup
>  echo "Testing GENEVE tunnel..."
>  test_geneve
> -cleanup
> -echo "Success"
> +echo "Testing IPIP tunnel..."
> +test_ipip
> +echo "*** PASS ***"
> --
> 2.8.0
>

^ permalink raw reply

* Re: [PATCH] mwifiex: fix null pointer deference when adapter is null
From: Kalle Valo @ 2016-09-16  3:56 UTC (permalink / raw)
  To: kbuild test robot
  Cc: Colin King, kbuild-all-JC7UmRfGjtg, Amitkumar Karwar,
	Nishant Sarmukadam, linux-wireless-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <201609160029.fRGnTGuA%fengguang.wu-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>

kbuild test robot <lkp-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> writes:

> url:    https://github.com/0day-ci/linux/commits/Colin-King/mwifiex-fix-null-pointer-deference-when-adapter-is-null/20160915-231625
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/kvalo/wireless-drivers-next.git master
> config: x86_64-randconfig-x013-201637 (attached as .config)
> compiler: gcc-6 (Debian 6.1.1-9) 6.1.1 20160705
> reproduce:
>         # save the attached .config to linux build tree
>         make ARCH=x86_64 
>
> All warnings (new ones prefixed by >>):
>
>    drivers/net/wireless/marvell/mwifiex/main.c: In function 'mwifiex_shutdown_sw':
>>> drivers/net/wireless/marvell/mwifiex/main.c:1433:1: warning: label 'exit_remove' defined but not used [-Wunused-label]
>     exit_remove:
>     ^~~~~~~~~~~

Looks like a valid warning to me, so please resend.

-- 
Kalle Valo

^ permalink raw reply

* Re: [PATCHv3 next 3/3] ipvlan: Introduce l3s mode
From: David Ahern @ 2016-09-16  1:49 UTC (permalink / raw)
  To: Mahesh Bandewar, netdev; +Cc: Eric Dumazet, David Miller, Mahesh Bandewar
In-Reply-To: <1473984847-32243-1-git-send-email-mahesh@bandewar.net>

On 9/15/16 6:14 PM, Mahesh Bandewar wrote:
> diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h
> index 695a5dc9ace3..371f4548c42d 100644
> --- a/drivers/net/ipvlan/ipvlan.h
> +++ b/drivers/net/ipvlan/ipvlan.h
> @@ -23,11 +23,13 @@
>  #include <linux/if_vlan.h>
>  #include <linux/ip.h>
>  #include <linux/inetdevice.h>
> +#include <linux/netfilter.h>
>  #include <net/ip.h>
>  #include <net/ip6_route.h>
>  #include <net/rtnetlink.h>
>  #include <net/route.h>
>  #include <net/addrconf.h>
> +#include <net/l3mdev.h>
>  
>  #define IPVLAN_DRV	"ipvlan"
>  #define IPV_DRV_VER	"0.1"
> @@ -96,6 +98,7 @@ struct ipvl_port {
>  	struct work_struct	wq;
>  	struct sk_buff_head	backlog;
>  	int			count;
> +	bool			hooks_attached;

With a refcnt on the hook registration you don't need this bool and removing it simplifies the set_mode logic.


> diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
> index 18b4e8c7f68a..aca690f41559 100644
> --- a/drivers/net/ipvlan/ipvlan_main.c
> +++ b/drivers/net/ipvlan/ipvlan_main.c

....

> +static void ipvlan_unregister_nf_hook(void)
> +{
> +	BUG_ON(!ipvl_nf_hook_refcnt);

not a panic() worthy issue. just a pr_warn or WARN_ON_ONCE should be ok.

^ permalink raw reply

* Re: [PATCH V3 1/3] Documentation: devicetree: add qca8k binding
From: Florian Fainelli @ 2016-09-16  1:00 UTC (permalink / raw)
  To: John Crispin, David S. Miller, Andrew Lunn
  Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA, qsdk-review-A+ZNKFmMK5xy9aJCnZT0Uw,
	devicetree-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1473949601-20674-2-git-send-email-john-Pj+rj9U5foFAfugRpC6u6w@public.gmane.org>

On 09/15/2016 07:26 AM, John Crispin wrote:
> Add device-tree binding for ar8xxx switch families.
> 
> Cc: devicetree-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> Signed-off-by: John Crispin <john-Pj+rj9U5foFAfugRpC6u6w@public.gmane.org>

Reviewed-by: Florian Fainelli <f.fainelli-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
-- 
Florian
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: Modification to skb->queue_mapping affecting performance
From: Michael Ma @ 2016-09-16  0:51 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev
In-Reply-To: <CAAmHdhzKyZ0M_L6OSxaOoLwf4u-T+yFOvBckaiq9OpVJA7Ca0A@mail.gmail.com>

2016-09-14 10:46 GMT-07:00 Michael Ma <make0818@gmail.com>:
> 2016-09-13 22:22 GMT-07:00 Eric Dumazet <eric.dumazet@gmail.com>:
>> On Tue, 2016-09-13 at 22:13 -0700, Michael Ma wrote:
>>
>>> I don't intend to install multiple qdisc - the only reason that I'm
>>> doing this now is to leverage MQ to workaround the lock contention,
>>> and based on the profile this all worked. However to simplify the way
>>> to setup HTB I wanted to use TXQ to partition HTB classes so that a
>>> HTB class only belongs to one TXQ, which also requires mapping skb to
>>> TXQ using some rules (here I'm using priority but I assume it's
>>> straightforward to use other information such as classid). And the
>>> problem I found here is that when using priority to infer the TXQ so
>>> that queue_mapping is changed, bandwidth is affected significantly -
>>> the only thing I can guess is that due to queue switch, there are more
>>> cache misses assuming processor cores have a static mapping to all the
>>> queues. Any suggestion on what to do next for the investigation?
>>>
>>> I would also guess that this should be a common problem if anyone
>>> wants to use MQ+IFB to workaround the qdisc lock contention on the
>>> receiver side and classful qdisc is used on IFB, but haven't really
>>> found a similar thread here...
>>
>> But why are you changing the queue ?
>>
>> NIC already does the proper RSS thing, meaning all packets of one flow
>> should land on one RX queue. No need to ' classify yourself and risk
>> lock contention'
>>
>> I use IFB + MQ + netem every day, and it scales to 10 Mpps with no
>> problem.
>>
>> Do you really need to rate limit flows ? Not clear what are your goals,
>> why for example you use HTB to begin with.
>>
> Yes. My goal is to set different min/max bandwidth limits for
> different processes, so we started with HTB. However with HTB the
> qdisc root lock contention caused some unintended correlation between
> flows in different classes. For example if some flows belonging to one
> class have large amount of small packets, other flows in a different
> class will get their effective bandwidth reduced because they'll wait
> longer for the root lock. Using MQ this can be avoided because I'll
> just put flows belonging to one class to its dedicated TXQ. Then
> classes within one HTB on a TXQ will still have the lock contention
> problem but classes in different HTB will use different root locks so
> the contention doesn't exist.
>
> This also means that I'll need to classify packets to different
> TXQ/HTB based on some skb metadata (essentially similar to what mqprio
> is doing). So TXQ might need to be switched to achieve this.

My current theory to this problem is that tasklets in IFB might be
scheduled to the same cpu core if the RXQ happens to be the same for
two different flows. When queue_mapping is modified and multiple flows
are concentrated to the same IFB TXQ because they need to be
controlled by the same HTB, they'll have to use the same tasklet
because of the way IFB is implemented. So if other flows belonging to
a different TXQ/tasklet happens to be scheduled on the same core, that
core can be overloaded and becomes the bottleneck. Without modifying
the queue_mapping the chance of this contention is much lower.

This is a speculation based on the increased si time in softirqd
process. I'll try to affinitize each tasklet with a cpu core to verify
whether this is the problem. I also noticed that in the past there was
a similar proposal of scheduling the tasklet to a dedicated core which
was not committed (https://patchwork.ozlabs.org/patch/38486/). I'll try
something similar to verify this theory.

^ permalink raw reply

* Re: [net-next PATCH 00/11] iw_cxgb4,cxgbit: remove duplicate code
From: David Miller @ 2016-09-16  0:50 UTC (permalink / raw)
  To: varun
  Cc: netdev, linux-rdma, target-devel, nab, dledford, swise,
	gerlitz.or, indranil
In-Reply-To: <cover.1473781521.git.varun@chelsio.com>

From: Varun Prakash <varun@chelsio.com>
Date: Tue, 13 Sep 2016 21:23:55 +0530

> This patch series removes duplicate code from
> iw_cxgb4 and cxgbit by adding common function
> definitions in libcxgb.
> 
> Please review.

Series applied, thanks.

^ permalink raw reply

* Re: [PATCH net-next] openvswitch: avoid deferred execution of recirc actions
From: David Miller @ 2016-09-16  0:36 UTC (permalink / raw)
  To: lrichard-H+wXaHxf7aLQT0dZR+AlfA
  Cc: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA,
	sramamur-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8
In-Reply-To: <1473775734-27382-1-git-send-email-lrichard-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>

From: Lance Richardson <lrichard@redhat.com>
Date: Tue, 13 Sep 2016 10:08:54 -0400

> The ovs kernel data path currently defers the execution of all
> recirc actions until stack utilization is at a minimum.
> This is too limiting for some packet forwarding scenarios due to
> the small size of the deferred action FIFO (10 entries). For
> example, broadcast traffic sent out more than 10 ports with
> recirculation results in packet drops when the deferred action
> FIFO becomes full, as reported here:
> 
>      http://openvswitch.org/pipermail/dev/2016-March/067672.html
> 
> Since the current recursion depth is available (it is already tracked
> by the exec_actions_level pcpu variable), we can use it to determine
> whether to execute recirculation actions immediately (safe when
> recursion depth is low) or defer execution until more stack space is
> available.
> 
> With this change, the deferred action fifo size becomes a non-issue
> for currently failing scenarios because it is no longer used when
> there are three or fewer recursions through ovs_execute_actions().
> 
> Suggested-by: Pravin Shelar <pshelar@ovn.org>
> Signed-off-by: Lance Richardson <lrichard@redhat.com>

Applied.
_______________________________________________
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev

^ permalink raw reply

* Re: [PATCH net-next V2 0/3] net/sched: cls_flower: Add ports masks
From: David Miller @ 2016-09-16  0:28 UTC (permalink / raw)
  To: ogerlitz; +Cc: jiri, netdev, hadarh, paulb
In-Reply-To: <1473942504-23216-1-git-send-email-ogerlitz@mellanox.com>

From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Thu, 15 Sep 2016 15:28:21 +0300

> This series adds the ability to specify tcp/udp ports masks 
> for TC/flower filter matches.
> 
> I also removed an unused fields from the flower keys struct 
> and clarified the format of the recently added vlan attributes.

Series applied.

^ permalink raw reply

* Re: [PATCH net-next v2 2/5] cxgb4: add common api support for configuring filters
From: David Miller @ 2016-09-16  0:18 UTC (permalink / raw)
  To: rahul.lakkireddy; +Cc: netdev, hariprasad, leedom, nirranjan, indranil
In-Reply-To: <09f7d2b95eaf850004b45a5dab48446ed6f7257f.1473759872.git.rahul.lakkireddy@chelsio.com>

From: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
Date: Tue, 13 Sep 2016 17:12:26 +0530

> +/* Fill up default masks for set match fields. */
> +static void fill_default_mask(struct ch_filter_specification *fs)
> +{
> +	unsigned int i;
> +	unsigned int lip = 0, lip_mask = 0;
> +	unsigned int fip = 0, fip_mask = 0;

Always order local variable declarations from longest to shortest
line.

Please audit your entire submission for this issue.

Thanks.

^ permalink raw reply

* Re: [PATCH net-next] alx: fix error handling in __alx_open
From: David Miller @ 2016-09-16  0:15 UTC (permalink / raw)
  To: tobias.regnery; +Cc: netdev, jcliburn, chris.snook
In-Reply-To: <1473761217-18905-1-git-send-email-tobias.regnery@gmail.com>

From: Tobias Regnery <tobias.regnery@gmail.com>
Date: Tue, 13 Sep 2016 12:06:57 +0200

> In commit 9ee7b683ea63 we moved the enablement of msi interrupts earlier in
> alx_init_intr. If there is an error in alx_alloc_rings, __alx_open returns
> with an error but msi (or msi-x) interrupts stays enabled. Add a new error
> label to disable msi (or msi-x) interrupts.
> 
> Fixes: 9ee7b683ea63 ("alx: refactor msi enablement and disablement")
> Signed-off-by: Tobias Regnery <tobias.regnery@gmail.com>

Applied.

^ permalink raw reply

* [PATCHv3 next 3/3] ipvlan: Introduce l3s mode
From: Mahesh Bandewar @ 2016-09-16  0:14 UTC (permalink / raw)
  To: netdev; +Cc: Eric Dumazet, David Miller, Mahesh Bandewar, David Ahern

From: Mahesh Bandewar <maheshb@google.com>

In a typical IPvlan L3 setup, the master is in the default-ns and
each slave is in a different (slave) ns. In this setup egress
packet processing for traffic originating from slave-ns will
hit all NF_HOOKs in slave-ns as well as default-ns. However same
is not true for ingress processing. All these NF_HOOKs are
hit only in the slave-ns skipping them in the default-ns.
IPvlan in L3 mode is restrictive and if admins want to deploy
iptables rules in default-ns, this asymmetric data path makes it
impossible to do so.

This patch makes use of the l3_rcv() (added as part of l3mdev
enhancements) to perform input route lookup on RX packets without
changing the skb->dev and then uses nf_hook at NF_INET_LOCAL_IN
to change the skb->dev just before handing over skb to L4.

Signed-off-by: Mahesh Bandewar <maheshb@google.com>
CC: David Ahern <dsa@cumulusnetworks.com>
---
 Documentation/networking/ipvlan.txt |  7 ++-
 drivers/net/Kconfig                 |  1 +
 drivers/net/ipvlan/ipvlan.h         |  7 +++
 drivers/net/ipvlan/ipvlan_core.c    | 94 +++++++++++++++++++++++++++++++++++++
 drivers/net/ipvlan/ipvlan_main.c    | 92 +++++++++++++++++++++++++++++++++---
 include/uapi/linux/if_link.h        |  1 +
 6 files changed, 194 insertions(+), 8 deletions(-)

diff --git a/Documentation/networking/ipvlan.txt b/Documentation/networking/ipvlan.txt
index 14422f8fcdc4..24196cef7c91 100644
--- a/Documentation/networking/ipvlan.txt
+++ b/Documentation/networking/ipvlan.txt
@@ -22,7 +22,7 @@ The driver can be built into the kernel (CONFIG_IPVLAN=y) or as a module
 	There are no module parameters for this driver and it can be configured
 using IProute2/ip utility.
 
-	ip link add link <master-dev> <slave-dev> type ipvlan mode { l2 | L3 }
+	ip link add link <master-dev> <slave-dev> type ipvlan mode { l2 | l3 | l3s }
 
 	e.g. ip link add link ipvl0 eth0 type ipvlan mode l2
 
@@ -48,6 +48,11 @@ master device for the L2 processing and routing from that instance will be
 used before packets are queued on the outbound device. In this mode the slaves
 will not receive nor can send multicast / broadcast traffic.
 
+4.3 L3S mode:
+	This is very similar to the L3 mode except that iptables (conn-tracking)
+works in this mode and hence it is L3-symmetric (L3s). This will have slightly less
+performance but that shouldn't matter since you are choosing this mode over plain-L3
+mode to make conn-tracking work.
 
 5. What to choose (macvlan vs. ipvlan)?
 	These two devices are very similar in many regards and the specific use
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 0c5415b05ea9..8768a625350d 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -149,6 +149,7 @@ config IPVLAN
     tristate "IP-VLAN support"
     depends on INET
     depends on IPV6
+    depends on NET_L3_MASTER_DEV
     ---help---
       This allows one to create virtual devices off of a main interface
       and packets will be delivered based on the dest L3 (IPv6/IPv4 addr)
diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h
index 695a5dc9ace3..371f4548c42d 100644
--- a/drivers/net/ipvlan/ipvlan.h
+++ b/drivers/net/ipvlan/ipvlan.h
@@ -23,11 +23,13 @@
 #include <linux/if_vlan.h>
 #include <linux/ip.h>
 #include <linux/inetdevice.h>
+#include <linux/netfilter.h>
 #include <net/ip.h>
 #include <net/ip6_route.h>
 #include <net/rtnetlink.h>
 #include <net/route.h>
 #include <net/addrconf.h>
+#include <net/l3mdev.h>
 
 #define IPVLAN_DRV	"ipvlan"
 #define IPV_DRV_VER	"0.1"
@@ -96,6 +98,7 @@ struct ipvl_port {
 	struct work_struct	wq;
 	struct sk_buff_head	backlog;
 	int			count;
+	bool			hooks_attached;
 	struct rcu_head		rcu;
 };
 
@@ -124,4 +127,8 @@ struct ipvl_addr *ipvlan_find_addr(const struct ipvl_dev *ipvlan,
 				   const void *iaddr, bool is_v6);
 bool ipvlan_addr_busy(struct ipvl_port *port, void *iaddr, bool is_v6);
 void ipvlan_ht_addr_del(struct ipvl_addr *addr);
+struct sk_buff *ipvlan_l3_rcv(struct net_device *dev, struct sk_buff *skb,
+			      u16 proto);
+unsigned int ipvlan_nf_input(void *priv, struct sk_buff *skb,
+			     const struct nf_hook_state *state);
 #endif /* __IPVLAN_H */
diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
index b5f9511d819e..b4e990743e1d 100644
--- a/drivers/net/ipvlan/ipvlan_core.c
+++ b/drivers/net/ipvlan/ipvlan_core.c
@@ -560,6 +560,7 @@ int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
 	case IPVLAN_MODE_L2:
 		return ipvlan_xmit_mode_l2(skb, dev);
 	case IPVLAN_MODE_L3:
+	case IPVLAN_MODE_L3S:
 		return ipvlan_xmit_mode_l3(skb, dev);
 	}
 
@@ -664,6 +665,8 @@ rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb)
 		return ipvlan_handle_mode_l2(pskb, port);
 	case IPVLAN_MODE_L3:
 		return ipvlan_handle_mode_l3(pskb, port);
+	case IPVLAN_MODE_L3S:
+		return RX_HANDLER_PASS;
 	}
 
 	/* Should not reach here */
@@ -672,3 +675,94 @@ rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb)
 	kfree_skb(skb);
 	return RX_HANDLER_CONSUMED;
 }
+
+static struct ipvl_addr *ipvlan_skb_to_addr(struct sk_buff *skb,
+					    struct net_device *dev)
+{
+	struct ipvl_addr *addr = NULL;
+	struct ipvl_port *port;
+	void *lyr3h;
+	int addr_type;
+
+	if (!dev || !netif_is_ipvlan_port(dev))
+		goto out;
+
+	port = ipvlan_port_get_rcu(dev);
+	if (!port || port->mode != IPVLAN_MODE_L3S)
+		goto out;
+
+	lyr3h = ipvlan_get_L3_hdr(skb, &addr_type);
+	if (!lyr3h)
+		goto out;
+
+	addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true);
+out:
+	return addr;
+}
+
+struct sk_buff *ipvlan_l3_rcv(struct net_device *dev, struct sk_buff *skb,
+			      u16 proto)
+{
+	struct ipvl_addr *addr;
+	struct net_device *sdev;
+
+	addr = ipvlan_skb_to_addr(skb, dev);
+	if (!addr)
+		goto out;
+
+	sdev = addr->master->dev;
+	switch (proto) {
+	case AF_INET:
+	{
+		int err;
+		struct iphdr *ip4h = ip_hdr(skb);
+
+		err = ip_route_input_noref(skb, ip4h->daddr, ip4h->saddr,
+					   ip4h->tos, sdev);
+		if (unlikely(err))
+			goto out;
+		break;
+	}
+	case AF_INET6:
+	{
+		struct dst_entry *dst;
+		struct ipv6hdr *ip6h = ipv6_hdr(skb);
+		int flags = RT6_LOOKUP_F_HAS_SADDR;
+		struct flowi6 fl6 = {
+			.flowi6_iif   = sdev->ifindex,
+			.daddr        = ip6h->daddr,
+			.saddr        = ip6h->saddr,
+			.flowlabel    = ip6_flowinfo(ip6h),
+			.flowi6_mark  = skb->mark,
+			.flowi6_proto = ip6h->nexthdr,
+		};
+
+		skb_dst_drop(skb);
+		dst = ip6_route_input_lookup(dev_net(sdev), sdev, &fl6, flags);
+		skb_dst_set(skb, dst);
+		break;
+	}
+	default:
+		break;
+	}
+
+out:
+	return skb;
+}
+
+unsigned int ipvlan_nf_input(void *priv, struct sk_buff *skb,
+			     const struct nf_hook_state *state)
+{
+	struct ipvl_addr *addr;
+	unsigned int len;
+
+	addr = ipvlan_skb_to_addr(skb, skb->dev);
+	if (!addr)
+		goto out;
+
+	skb->dev = addr->master->dev;
+	len = skb->len + ETH_HLEN;
+	ipvlan_count_rx(addr->master, len, true, false);
+out:
+	return NF_ACCEPT;
+}
diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index 18b4e8c7f68a..aca690f41559 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -9,24 +9,91 @@
 
 #include "ipvlan.h"
 
+static u32 ipvl_nf_hook_refcnt = 0;
+
+static struct nf_hook_ops ipvl_nfops[] __read_mostly = {
+	{
+		.hook     = ipvlan_nf_input,
+		.pf       = NFPROTO_IPV4,
+		.hooknum  = NF_INET_LOCAL_IN,
+		.priority = INT_MAX,
+	},
+	{
+		.hook     = ipvlan_nf_input,
+		.pf       = NFPROTO_IPV6,
+		.hooknum  = NF_INET_LOCAL_IN,
+		.priority = INT_MAX,
+	},
+};
+
+static struct l3mdev_ops ipvl_l3mdev_ops __read_mostly = {
+	.l3mdev_l3_rcv = ipvlan_l3_rcv,
+};
+
 static void ipvlan_adjust_mtu(struct ipvl_dev *ipvlan, struct net_device *dev)
 {
 	ipvlan->dev->mtu = dev->mtu - ipvlan->mtu_adj;
 }
 
-static void ipvlan_set_port_mode(struct ipvl_port *port, u16 nval)
+static int ipvlan_register_nf_hook(void)
+{
+	int err = 0;
+
+	if (!ipvl_nf_hook_refcnt) {
+		err = _nf_register_hooks(ipvl_nfops, ARRAY_SIZE(ipvl_nfops));
+		if (!err)
+			ipvl_nf_hook_refcnt = 1;
+	} else {
+		ipvl_nf_hook_refcnt++;
+	}
+
+	return err;
+}
+
+static void ipvlan_unregister_nf_hook(void)
+{
+	BUG_ON(!ipvl_nf_hook_refcnt);
+
+	ipvl_nf_hook_refcnt--;
+	if (!ipvl_nf_hook_refcnt)
+		_nf_unregister_hooks(ipvl_nfops, ARRAY_SIZE(ipvl_nfops));
+}
+
+static int ipvlan_set_port_mode(struct ipvl_port *port, u16 nval)
 {
 	struct ipvl_dev *ipvlan;
+	struct net_device *mdev = port->dev;
+	int err = 0;
 
+	ASSERT_RTNL();
 	if (port->mode != nval) {
+		if (nval == IPVLAN_MODE_L3S) {
+			if (!port->hooks_attached) {
+				err = ipvlan_register_nf_hook();
+				if (!err) {
+					mdev->l3mdev_ops = &ipvl_l3mdev_ops;
+					mdev->priv_flags |= IFF_L3MDEV_MASTER;
+					port->hooks_attached = true;
+				} else
+					return err;
+			}
+		} else {
+			if (port->hooks_attached) {
+				mdev->priv_flags &= ~IFF_L3MDEV_MASTER;
+				ipvlan_unregister_nf_hook();
+				mdev->l3mdev_ops = NULL;
+			}
+			port->hooks_attached = false;
+		}
 		list_for_each_entry(ipvlan, &port->ipvlans, pnode) {
-			if (nval == IPVLAN_MODE_L3)
+			if (nval == IPVLAN_MODE_L3 || nval == IPVLAN_MODE_L3S)
 				ipvlan->dev->flags |= IFF_NOARP;
 			else
 				ipvlan->dev->flags &= ~IFF_NOARP;
 		}
 		port->mode = nval;
 	}
+	return err;
 }
 
 static int ipvlan_port_create(struct net_device *dev)
@@ -74,6 +141,12 @@ static void ipvlan_port_destroy(struct net_device *dev)
 	struct ipvl_port *port = ipvlan_port_get_rtnl(dev);
 
 	dev->priv_flags &= ~IFF_IPVLAN_MASTER;
+	if (port->hooks_attached) {
+		dev->priv_flags &= ~IFF_L3MDEV_MASTER;
+		ipvlan_unregister_nf_hook();
+		dev->l3mdev_ops = NULL;
+		port->hooks_attached = false;
+	}
 	netdev_rx_handler_unregister(dev);
 	cancel_work_sync(&port->wq);
 	__skb_queue_purge(&port->backlog);
@@ -132,7 +205,8 @@ static int ipvlan_open(struct net_device *dev)
 	struct net_device *phy_dev = ipvlan->phy_dev;
 	struct ipvl_addr *addr;
 
-	if (ipvlan->port->mode == IPVLAN_MODE_L3)
+	if (ipvlan->port->mode == IPVLAN_MODE_L3 ||
+	    ipvlan->port->mode == IPVLAN_MODE_L3S)
 		dev->flags |= IFF_NOARP;
 	else
 		dev->flags &= ~IFF_NOARP;
@@ -372,13 +446,14 @@ static int ipvlan_nl_changelink(struct net_device *dev,
 {
 	struct ipvl_dev *ipvlan = netdev_priv(dev);
 	struct ipvl_port *port = ipvlan_port_get_rtnl(ipvlan->phy_dev);
+	int err = 0;
 
 	if (data && data[IFLA_IPVLAN_MODE]) {
 		u16 nmode = nla_get_u16(data[IFLA_IPVLAN_MODE]);
 
-		ipvlan_set_port_mode(port, nmode);
+		err = ipvlan_set_port_mode(port, nmode);
 	}
-	return 0;
+	return err;
 }
 
 static size_t ipvlan_nl_getsize(const struct net_device *dev)
@@ -473,10 +548,13 @@ static int ipvlan_link_new(struct net *src_net, struct net_device *dev,
 		unregister_netdevice(dev);
 		return err;
 	}
+	err = ipvlan_set_port_mode(port, mode);
+	if (err) {
+		unregister_netdevice(dev);
+		return err;
+	}
 
 	list_add_tail_rcu(&ipvlan->pnode, &port->ipvlans);
-	ipvlan_set_port_mode(port, mode);
-
 	netif_stacked_transfer_operstate(phy_dev, dev);
 	return 0;
 }
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 9bf3aecfe05b..a615583bab09 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -464,6 +464,7 @@ enum {
 enum ipvlan_mode {
 	IPVLAN_MODE_L2 = 0,
 	IPVLAN_MODE_L3,
+	IPVLAN_MODE_L3S,
 	IPVLAN_MODE_MAX
 };
 
-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply related

* [PATCHv3 next 2/3] net: Add _nf_(un)register_hooks symbols
From: Mahesh Bandewar @ 2016-09-16  0:13 UTC (permalink / raw)
  To: netdev; +Cc: Eric Dumazet, David Miller, Mahesh Bandewar, Pablo Neira Ayuso

From: Mahesh Bandewar <maheshb@google.com>

Add _nf_register_hooks() and _nf_unregister_hooks() calls which allow
the caller to hold the RTNL mutex.

Signed-off-by: Mahesh Bandewar <maheshb@google.com>
CC: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h |  2 ++
 net/netfilter/core.c      | 51 ++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 9230f9aee896..e82b76781bf6 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -133,6 +133,8 @@ int nf_register_hook(struct nf_hook_ops *reg);
 void nf_unregister_hook(struct nf_hook_ops *reg);
 int nf_register_hooks(struct nf_hook_ops *reg, unsigned int n);
 void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n);
+int _nf_register_hooks(struct nf_hook_ops *reg, unsigned int n);
+void _nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n);
 
 /* Functions to register get/setsockopt ranges (non-inclusive).  You
    need to check permissions yourself! */
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index f39276d1c2d7..2c5327e43a88 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -188,19 +188,17 @@ EXPORT_SYMBOL(nf_unregister_net_hooks);
 
 static LIST_HEAD(nf_hook_list);
 
-int nf_register_hook(struct nf_hook_ops *reg)
+static int _nf_register_hook(struct nf_hook_ops *reg)
 {
 	struct net *net, *last;
 	int ret;
 
-	rtnl_lock();
 	for_each_net(net) {
 		ret = nf_register_net_hook(net, reg);
 		if (ret && ret != -ENOENT)
 			goto rollback;
 	}
 	list_add_tail(&reg->list, &nf_hook_list);
-	rtnl_unlock();
 
 	return 0;
 rollback:
@@ -210,19 +208,34 @@ rollback:
 			break;
 		nf_unregister_net_hook(net, reg);
 	}
+	return ret;
+}
+
+int nf_register_hook(struct nf_hook_ops *reg)
+{
+	int ret;
+
+	rtnl_lock();
+	ret = _nf_register_hook(reg);
 	rtnl_unlock();
+
 	return ret;
 }
 EXPORT_SYMBOL(nf_register_hook);
 
-void nf_unregister_hook(struct nf_hook_ops *reg)
+static void _nf_unregister_hook(struct nf_hook_ops *reg)
 {
 	struct net *net;
 
-	rtnl_lock();
 	list_del(&reg->list);
 	for_each_net(net)
 		nf_unregister_net_hook(net, reg);
+}
+
+void nf_unregister_hook(struct nf_hook_ops *reg)
+{
+	rtnl_lock();
+	_nf_unregister_hook(reg);
 	rtnl_unlock();
 }
 EXPORT_SYMBOL(nf_unregister_hook);
@@ -246,6 +259,26 @@ err:
 }
 EXPORT_SYMBOL(nf_register_hooks);
 
+/* Caller MUST take rtnl_lock() */
+int _nf_register_hooks(struct nf_hook_ops *reg, unsigned int n)
+{
+	unsigned int i;
+	int err = 0;
+
+	for (i = 0; i < n; i++) {
+		err = _nf_register_hook(&reg[i]);
+		if (err)
+			goto err;
+	}
+	return err;
+
+err:
+	if (i > 0)
+		_nf_unregister_hooks(reg, i);
+	return err;
+}
+EXPORT_SYMBOL(_nf_register_hooks);
+
 void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
 {
 	while (n-- > 0)
@@ -253,6 +286,14 @@ void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
 }
 EXPORT_SYMBOL(nf_unregister_hooks);
 
+/* Caller MUST take rtnl_lock */
+void _nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
+{
+	while (n-- > 0)
+		_nf_unregister_hook(&reg[n]);
+}
+EXPORT_SYMBOL(_nf_unregister_hooks);
+
 unsigned int nf_iterate(struct list_head *head,
 			struct sk_buff *skb,
 			struct nf_hook_state *state,
-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox