Netdev List
 help / color / mirror / Atom feed
* [PATCHv3 next 2/3] net: Add _nf_(un)register_hooks symbols
From: Mahesh Bandewar @ 2016-09-16  0:13 UTC (permalink / raw)
  To: netdev; +Cc: Eric Dumazet, David Miller, Mahesh Bandewar, Pablo Neira Ayuso

From: Mahesh Bandewar <maheshb@google.com>

Add _nf_register_hooks() and _nf_unregister_hooks() calls which allow
caller to hold RTNL mutex.

Signed-off-by: Mahesh Bandewar <maheshb@google.com>
CC: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h |  2 ++
 net/netfilter/core.c      | 51 ++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 9230f9aee896..e82b76781bf6 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -133,6 +133,8 @@ int nf_register_hook(struct nf_hook_ops *reg);
 void nf_unregister_hook(struct nf_hook_ops *reg);
 int nf_register_hooks(struct nf_hook_ops *reg, unsigned int n);
 void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n);
+int _nf_register_hooks(struct nf_hook_ops *reg, unsigned int n);
+void _nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n);
 
 /* Functions to register get/setsockopt ranges (non-inclusive).  You
    need to check permissions yourself! */
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index f39276d1c2d7..2c5327e43a88 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -188,19 +188,17 @@ EXPORT_SYMBOL(nf_unregister_net_hooks);
 
 static LIST_HEAD(nf_hook_list);
 
-int nf_register_hook(struct nf_hook_ops *reg)
+static int _nf_register_hook(struct nf_hook_ops *reg)
 {
 	struct net *net, *last;
 	int ret;
 
-	rtnl_lock();
 	for_each_net(net) {
 		ret = nf_register_net_hook(net, reg);
 		if (ret && ret != -ENOENT)
 			goto rollback;
 	}
 	list_add_tail(&reg->list, &nf_hook_list);
-	rtnl_unlock();
 
 	return 0;
 rollback:
@@ -210,19 +208,34 @@ rollback:
 			break;
 		nf_unregister_net_hook(net, reg);
 	}
+	return ret;
+}
+
+int nf_register_hook(struct nf_hook_ops *reg)
+{
+	int ret;
+
+	rtnl_lock();
+	ret = _nf_register_hook(reg);
 	rtnl_unlock();
+
 	return ret;
 }
 EXPORT_SYMBOL(nf_register_hook);
 
-void nf_unregister_hook(struct nf_hook_ops *reg)
+static void _nf_unregister_hook(struct nf_hook_ops *reg)
 {
 	struct net *net;
 
-	rtnl_lock();
 	list_del(&reg->list);
 	for_each_net(net)
 		nf_unregister_net_hook(net, reg);
+}
+
+void nf_unregister_hook(struct nf_hook_ops *reg)
+{
+	rtnl_lock();
+	_nf_unregister_hook(reg);
 	rtnl_unlock();
 }
 EXPORT_SYMBOL(nf_unregister_hook);
@@ -246,6 +259,26 @@ err:
 }
 EXPORT_SYMBOL(nf_register_hooks);
 
+/* Caller MUST take rtnl_lock() */
+int _nf_register_hooks(struct nf_hook_ops *reg, unsigned int n)
+{
+	unsigned int i;
+	int err = 0;
+
+	for (i = 0; i < n; i++) {
+		err = _nf_register_hook(&reg[i]);
+		if (err)
+			goto err;
+	}
+	return err;
+
+err:
+	if (i > 0)
+		_nf_unregister_hooks(reg, i);
+	return err;
+}
+EXPORT_SYMBOL(_nf_register_hooks);
+
 void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
 {
 	while (n-- > 0)
@@ -253,6 +286,14 @@ void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
 }
 EXPORT_SYMBOL(nf_unregister_hooks);
 
+/* Caller MUST take rtnl_lock */
+void _nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
+{
+	while (n-- > 0)
+		_nf_unregister_hook(&reg[n]);
+}
+EXPORT_SYMBOL(_nf_unregister_hooks);
+
 unsigned int nf_iterate(struct list_head *head,
 			struct sk_buff *skb,
 			struct nf_hook_state *state,
-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply related

* [PATCHv3 next 1/3] ipv6: Export ip6_route_input_lookup symbol
From: Mahesh Bandewar @ 2016-09-16  0:13 UTC (permalink / raw)
  To: netdev; +Cc: Eric Dumazet, David Miller, Mahesh Bandewar

From: Mahesh Bandewar <maheshb@google.com>

Make ip6_route_input_lookup available outside of the ipv6 module,
similar to ip_route_input_noref in the IPv4 world.

Signed-off-by: Mahesh Bandewar <maheshb@google.com>
---
 include/net/ip6_route.h | 3 +++
 net/ipv6/route.c        | 7 ++++---
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index d97305d0e71f..e0cd318d5103 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -64,6 +64,9 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr)
 }
 
 void ip6_route_input(struct sk_buff *skb);
+struct dst_entry *ip6_route_input_lookup(struct net *net,
+					 struct net_device *dev,
+					 struct flowi6 *fl6, int flags);
 
 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
 					 struct flowi6 *fl6, int flags);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index ad4a7ff301fc..4dab585f7642 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1147,15 +1147,16 @@ static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *
 	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
 }
 
-static struct dst_entry *ip6_route_input_lookup(struct net *net,
-						struct net_device *dev,
-						struct flowi6 *fl6, int flags)
+struct dst_entry *ip6_route_input_lookup(struct net *net,
+					 struct net_device *dev,
+					 struct flowi6 *fl6, int flags)
 {
 	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
 		flags |= RT6_LOOKUP_F_IFACE;
 
 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
 }
+EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
 
 void ip6_route_input(struct sk_buff *skb)
 {
-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply related

* [PATCHv3 next 0/3] IPvlan introduce l3s mode
From: Mahesh Bandewar @ 2016-09-16  0:13 UTC (permalink / raw)
  To: netdev; +Cc: Eric Dumazet, David Miller, Mahesh Bandewar

From: Mahesh Bandewar <maheshb@google.com>

Same old problem with new approach especially from suggestions from
earlier patch-series.

First thing is that this is introduced as a new mode rather than
modifying the old (L3) mode. So the behavior of the existing modes is
preserved as it is and the new L3s mode obeys iptables so that intended
conn-tracking can work. 

To do this, the code uses newly added l3mdev_rcv() handler and an
Iptables hook. l3mdev_rcv() to perform an inbound route lookup with the
correct (IPvlan slave) interface and then IPtable-hook at LOCAL_INPUT
to change the input device from master to the slave to complete the
formality.

Supporting stack changes are trivial changes to export symbol to get
IPv4 equivalent code exported for IPv6 and to allow netfilter hook
registration code to allow caller to hold RTNL. Please look into
individual patches for details.

Mahesh Bandewar (3):
  ipv6: Export ip6_route_input_lookup symbol
  net: Add _nf_(un)register_hooks symbols
  ipvlan: Introduce l3s mode

 Documentation/networking/ipvlan.txt |  7 ++-
 drivers/net/Kconfig                 |  1 +
 drivers/net/ipvlan/ipvlan.h         |  7 +++
 drivers/net/ipvlan/ipvlan_core.c    | 94 +++++++++++++++++++++++++++++++++++++
 drivers/net/ipvlan/ipvlan_main.c    | 92 +++++++++++++++++++++++++++++++++---
 include/linux/netfilter.h           |  2 +
 include/net/ip6_route.h             |  3 ++
 include/uapi/linux/if_link.h        |  1 +
 net/ipv6/route.c                    |  7 +--
 net/netfilter/core.c                | 51 ++++++++++++++++++--
 10 files changed, 249 insertions(+), 16 deletions(-)

v1: Initial post
v2: Text correction and config changed from "select" to "depends on"
v3: separated nf_hook registration logic and made it independent of port
    as nf_hook registration is independent of how many IPvlan ports are
    present in the system.

-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply

* cdc_ncm driver padding problem (WAS: Question about CDC_NCM_FLAG_NDP_TO_END)
From: Enrico Mioso @ 2016-09-15 23:46 UTC (permalink / raw)
  To: Marek Brudka
  Cc: Bjron Mork, netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-usb-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <15569aad-22e2-aa4d-e7f1-832063c6490a-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>

Hello guys.
Some very good people managed to detect there is a problem with some Huawei firmwares and NCM padding. I actually don't think I have the hardware to test btw.

On Wed, 14 Sep 2016, Marek Brudka wrote:
Sorry Marek - I forwarded this message without asking for your consent. Let me know anyway if this is a problem. thank you all guys for everything,
Enrico

==Date: Wed, 14 Sep 2016 19:31:50
==From: Marek Brudka <mbrudka-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
==To: Enrico Mioso <mrkiko.rs-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
==Subject: Re: Question about CDC_NCM_FLAG_NDP_TO_END
==
==Hello Enrico,
==
==As nobody at openwrt forum replied to my request on the way how to get
==the exact
==recompilation of OpenWrt 15.05.1 I decided to switch to the development
==version
==(12/09/2016), which already contains your patch.
==
==The nice thing is that I got my modem (E3372 HiLink reflashed to E398)
==working
==in ncm mode!
==
==The bad thing is DHCP. It seems, that cdc_ncm driver somehow consumes DHCP
==replies. I had to manually setup wwan0 interface as well as routing
==using the result
==of Hayes command
==
==AT^DHCP?
==^DHCP:
==EC684764,F8FFFFFF,E9684764,E9684764,356002D4,366002D4,43200000,43200000
==OK
==
==Certainly, I will modify connect scripts
==   
==https://github.com/zabbius/smarthome/tree/master/openwrt/huawei-ncm/files/usr/sbin
==for me to parse this response. However it seems, that the problem is on
==driver level and
==is related with padding. Do you know this issue which is nicely
==described in the thread
==    https://forum.openwrt.org/viewtopic.php?pid=273099
==of OpenWrt forum?
==
==Thank you
==Marek Brudka
==
==
==W dniu 11.09.2016 o 15:19, Enrico Mioso pisze:
==> Hello Marek.
==>
==> First of all, thank you for your interest in this driver, and for writing.
==>
==> Unfortunately, I don't know the exact procedure to do that: you might be comfortable putting those patches in generic-patches-kernel_version if I am not wrong, but I may well be wrong or imprecise, and recompile the whole Openwrt thing?
==> don't know. But yes, that message should appear in the dmesg.
==> NDPs need to be at end of NCM frames. Oh, I don't remember well what NDP stands for... ufh. Sorry.
==>
==> Anyway, let me know if I can do something for you.
==> Enrico
==>
==
==
==
--
To unsubscribe from this list: send the line "unsubscribe linux-usb" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v3] net: ip, diag -- Add diag interface for raw sockets
From: David Ahern @ 2016-09-15 23:45 UTC (permalink / raw)
  To: Eric Dumazet, Cyrill Gorcunov
  Cc: netdev, linux-kernel, David Miller, kuznet, jmorris, yoshfuji,
	kaber, avagin, stephen
In-Reply-To: <1473979691.22679.55.camel@edumazet-glaptop3.roam.corp.google.com>

On 9/15/16 4:48 PM, Eric Dumazet wrote:
> On Fri, 2016-09-16 at 00:01 +0300, Cyrill Gorcunov wrote:
> 
>> Here I get kicked off the server. Login back
>>
>> [cyrill@uranus ~] ssh root@pcs7 
>> Last login: Thu Sep 15 23:20:42 2016 from gateway
>> [root@pcs7 ~]# cd /home/iproute2/
>> [root@pcs7 iproute2]# misc/ss -A raw
>> State      Recv-Q Send-Q                                Local Address:Port                                                 Peer Address:Port                
>> UNCONN     0      0                                                :::ipv6-icmp                                                      :::*                    
>> UNCONN     0      0                                                :::ipv6-icmp                                                      :::*                    
>>
>> Maybe I do something wrong for testing?
> 
> If you kill your shell, maybe /root/sock is killed as well, thus its raw
> sockets are closed.
> 
> Try to be selective in the -K , do not kill tcp sockets ?
> 
> 

I am running
   ss -aKw 'dev == red'

to kill raw sockets bound to device named 'red'.

^ permalink raw reply

* Re: [PATCH 0/5] Make /sys/class/net per net namespace objects belong to container
From: Eric W. Biederman @ 2016-09-15 23:26 UTC (permalink / raw)
  To: Dmitry Torokhov; +Cc: David Miller, Tejun Heo, lkml, netdev
In-Reply-To: <CAKdAkRTUjukVkXhMSPQ1YObRRVj0Fzvqivky_q34VCD9XRJdow@mail.gmail.com>

Dmitry Torokhov <dmitry.torokhov@gmail.com> writes:

> On Mon, Aug 29, 2016 at 5:38 AM, Eric W. Biederman
> <ebiederm@xmission.com> wrote:
>> David Miller <davem@davemloft.net> writes:
>>
>>> From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
>>> Date: Tue, 16 Aug 2016 15:33:10 -0700
>>>
>>>> There are objects in /sys hierarchy (/sys/class/net/) that logically belong
>>>> to a namespace/container. Unfortunately all sysfs objects start their life
>>>> belonging to global root, and while we could change ownership manually,
>>>> keeping track of all objects that come and go is cumbersome. It would
>>>> be better if kernel created them using correct uid/gid from the beginning.
>>>>
>>>> This series changes kernfs to allow creating objects with arbitrary
>>>> uid/gid, adds get_ownership() callback to ktype structure so subsystems
>>>> could supply their own logic (likely tied to namespace support) for
>>>> determining ownership of kobjects, and adjusts sysfs code to make use of
>>>> this information. Lastly net-sysfs is adjusted to make sure that objects in
>>>> net namespace are owned by the root user from the owning user namespace.
>>>>
>>>> Note that we do not adjust ownership of objects moved into a new namespace
>>>> (as when moving a network device into a container) as userspace can easily
>>>> do it.
>>>
>>> I need some domain experts to review this series please.
>>
>> I just came back from vacation and I will aim to take a look shortly.
>>
>> The big picture idea seems sensible.  Having a better ownership of sysfs
>> files that are part of a network namespace.  I will have to look at the
>> details to see if the implementation is similarly sensible.
>
> Eric,
>
> Did you find anything objectionable in the series or should I fix up
> the !CONFIG_SYSFS error in networking patch and resubmit?

Thank you for the ping, I put this patchset down and forgot to look
back.

The notion of a get_ownership call seems sensible.

At some level I am not a fan of setting the uids and gids on the sysfs
nodes as that requires allocation of an additional data structure and it
will increase the cost of sysfs nodes.   Certainly I don't think we
should incur that cost if we are not using user namespaces.  sysfs nodes
can be expensive data wise because we sometimes have so many of them.
So skipping the setattr when uid == GLOBAL_ROOT_UID and gid ==
GLOBAL_ROOT_GID seems very desirable.  Perhaps that is just an
optimization in setattr, but it should be somewhere.

I would very much prefer it if we can find a way not to touch all of the
layers, in the stack.  As I recall it is the code in drivers/base/core.c
that creates the attributes.  So my gut feel says we want to export a
sysfs_setattr modeled after sysfs_chmod from sysfs.h and then just have
the driver core level perform the setattr calls for non-default uids and
gids.

Symlinks we don't need to worry about changing their ownership they are
globally read, write, execute.

As long as the chattr happens before the uevent is triggered the code
should be essentially race free in dealing with userspace.

I think that will lead to a simpler more comprehensible and more
maintainable implementation.  Hooking in where or near where the
namespace bits hook in seems excessively complicated (although there may
be a good reason for it that I am forgetting).

Eric

^ permalink raw reply

* Re: [PATCHv2 net-next] cxgb4vf: don't offload Rx checksums for IPv6 fragments
From: David Miller @ 2016-09-15 23:38 UTC (permalink / raw)
  To: hariprasad; +Cc: netdev, leedom, nirranjan
In-Reply-To: <1473754164-14478-1-git-send-email-hariprasad@chelsio.com>

From: Hariprasad Shenai <hariprasad@chelsio.com>
Date: Tue, 13 Sep 2016 13:39:24 +0530

> The checksum provided by the device doesn't include the L3 headers,
> as IPv6 expects
> 
> Signed-off-by: Hariprasad Shenai <hariprasad@chelsio.com>
> ---
> V2: Fixed compilation issue reported by kbuild bot

Applied.

^ permalink raw reply

* Re: [PATCH v2 2/2] openvswitch: use percpu flow stats
From: Thadeu Lima de Souza Cascardo @ 2016-09-15 23:36 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA,
	David Miller
In-Reply-To: <1473980966.22679.58.camel-XN9IlZ5yJG9HTL0Zs8A6p+yfmBU6pStAUsxypvmhUTTZJqsBc5GL+g@public.gmane.org>

On Thu, Sep 15, 2016 at 04:09:26PM -0700, Eric Dumazet wrote:
> On Thu, 2016-09-15 at 19:11 -0300, Thadeu Lima de Souza Cascardo wrote:
> > Instead of using flow stats per NUMA node, use it per CPU. When using
> > megaflows, the stats lock can be a bottleneck in scalability.
> > 
> > On a E5-2690 12-core system, usual throughput went from ~4Mpps to
> > ~15Mpps when forwarding between two 40GbE ports with a single flow
> > configured on the datapath.
> > 
> > This has been tested on a system with possible CPUs 0-7,16-23. After
> > module removal, there was no corruption on the slab cache.
> > 
> > Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@redhat.com>
> > Cc: pravin shelar <pshelar@ovn.org>
> > ---
> 
> > +	/* We open code this to make sure cpu 0 is always considered */
> > +	for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpu_possible_mask))
> > +		if (flow->stats[cpu])
> >  			kmem_cache_free(flow_stats_cache,
> > -					(struct flow_stats __force *)flow->stats[node]);
> > +					(struct flow_stats __force *)flow->stats[cpu]);
> >  	kmem_cache_free(flow_cache, flow);
> >  }
> >  
> > @@ -757,7 +749,7 @@ int ovs_flow_init(void)
> >  	BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));
> >  
> >  	flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
> > -				       + (nr_node_ids
> > +				       + (nr_cpu_ids
> >  					  * sizeof(struct flow_stats *)),
> >  				       0, 0, NULL);
> >  	if (flow_cache == NULL)
> 
> Well, if you switch to percpu stats, better use normal
> alloc_percpu(struct flow_stats)
> 
> The code was dealing with per node allocation so could not use existing
> helper.
> 
> No need to keep this forever.

The problem is that the alloc_percpu uses a global spinlock and that affects
some workloads on OVS that creates lots of flows, as described in commit
9ac56358dec1a5aa7f4275a42971f55fad1f7f35 ("datapath: Per NUMA node flow
stats.").

This problem would not happen on this version as the flow allocation does not
suffer from the same scalability problem as when using alloc_percpu.

Cascardo.
_______________________________________________
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev

^ permalink raw reply

* Re: [PATCH v6 net-next 1/1] net_sched: Introduce skbmod action
From: David Miller @ 2016-09-15 23:34 UTC (permalink / raw)
  To: jhs; +Cc: netdev, daniel, xiyou.wangcong, eric.dumazet, john.r.fastabend
In-Reply-To: <1473725589-27110-1-git-send-email-jhs@emojatatu.com>

From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Mon, 12 Sep 2016 20:13:09 -0400

> From: Jamal Hadi Salim <jhs@mojatatu.com>
> 
> This action is intended to be an upgrade from a usability perspective
> from pedit (as well as operational debuggability).
> Compare this:
> 
> sudo tc filter add dev $ETH parent 1: protocol ip prio 10 \
> u32 match ip protocol 1 0xff flowid 1:2 \
> action pedit munge offset -14 u8 set 0x02 \
> munge offset -13 u8 set 0x15 \
> munge offset -12 u8 set 0x15 \
> munge offset -11 u8 set 0x15 \
> munge offset -10 u16 set 0x1515 \
> pipe
> 
> to:
> 
> sudo tc filter add dev $ETH parent 1: protocol ip prio 10 \
> u32 match ip protocol 1 0xff flowid 1:2 \
> action skbmod dmac 02:15:15:15:15:15
> 
> Also try to do a MAC address swap with pedit or worse
> try to debug a policy with destination mac, source mac and
> ethertype. Then make a few rules out of those and you'll get my point.
> 
> In the future common use cases on pedit can be migrated to this action
> (as an example different fields in ip v4/6, transports like tcp/udp/sctp
> etc). For this first cut, this allows modifying basic ethernet header.
> 
> The most important ethernet use case at the moment is when redirecting or
> mirroring packets to a remote machine. The dst mac address needs a re-write
> so that it doesn't get dropped or confuse an interconnecting (learning) switch
> or dropped by a target machine (which looks at the dst mac). And at times
> when flipping back the packet a swap of the MAC addresses is needed.
> 
> Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>

Applied, thanks.

^ permalink raw reply

* Re: [PATCH v3 net 1/1] net sched actions: fix GETing actions
From: David Miller @ 2016-09-15 23:33 UTC (permalink / raw)
  To: jhs; +Cc: netdev, xiyou.wangcong
In-Reply-To: <1473721658-6034-1-git-send-email-jhs@emojatatu.com>

From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Mon, 12 Sep 2016 19:07:38 -0400

> From: Jamal Hadi Salim <jhs@mojatatu.com>
> 
> With the batch changes that translated transient actions into
> a temporary list lost in the translation was the fact that
> tcf_action_destroy() will eventually delete the action from
> the permanent location if the refcount is zero.
> 
> Example of what broke:
> ...add a gact action to drop
> sudo $TC actions add action drop index 10
> ...now retrieve it, looks good
> sudo $TC actions get action gact index 10
> ...retrieve it again and find it is gone!
> sudo $TC actions get action gact index 10
> 
> Fixes:
> commit 22dc13c837c3 ("net_sched: convert tcf_exts from list to pointer array"),
> commit 824a7e8863b3 ("net_sched: remove an unnecessary list_del()")
> commit f07fed82ad79 ("net_sched: remove the leftover cleanup_a()")
> 
> Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>

Please incorporate Sergei's feedback and resubmit, thanks Jamal.

^ permalink raw reply

* Re: [PATCH net-next 0/2] Misc cls_bpf/act_bpf improvements
From: David Miller @ 2016-09-15 23:30 UTC (permalink / raw)
  To: daniel; +Cc: alexei.starovoitov, netdev
In-Reply-To: <cover.1473715365.git.daniel@iogearbox.net>

From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 12 Sep 2016 23:38:41 +0200

> Two minor improvements to {cls,act}_bpf. For details please see
> individual patches.

Series applied.

^ permalink raw reply

* RE: [Intel-wired-lan] [net-next PATCH v3 1/3] e1000: track BQL bytes regardless of skb or not
From: Brown, Aaron F @ 2016-09-15 23:29 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: John Fastabend, bblanco@plumgrid.com, Kirsher, Jeffrey T,
	brouer@redhat.com, davem@davemloft.net, xiyou.wangcong@gmail.com,
	intel-wired-lan@lists.osuosl.org, u9012063@gmail.com,
	netdev@vger.kernel.org
In-Reply-To: <20160915004353.GA63116@ast-mbp.thefacebook.com>

> > ------------[ cut here ]------------
> > WARNING: CPU: 1 PID: 0 at net/sched/sch_generic.c:316
> dev_watchdog+0x1c2/0x1d0
> > NETDEV WATCHDOG: eth1 (e1000): transmit queue 0 timed out
> 
> Thanks a lot for the tests! Really appreciate it.

np, I needed to get my old compatibility systems back in running order anyway.

^ permalink raw reply

* Re: [PATCH net-next 0/5] mlx4 misc fixes and improvements
From: David Miller @ 2016-09-15 23:21 UTC (permalink / raw)
  To: tariqt; +Cc: netdev, eranbe
In-Reply-To: <1473686416-11779-1-git-send-email-tariqt@mellanox.com>

From: Tariq Toukan <tariqt@mellanox.com>
Date: Mon, 12 Sep 2016 16:20:11 +0300

> This patchset contains some bug fixes, a cleanup, and small improvements
> from the team to the mlx4 Eth and core drivers.
> 
> Series generated against net-next commit:
> 02154927c115 "net: dsa: bcm_sf2: Get VLAN_PORT_MASK from b53_device"
> 
> Please push the following patch to -stable  >= 4.6 as well:
> "net/mlx4_core: Fix to clean devlink resources"

Again, coding style fixes and optimizations like branch prediction
hints are not bug fixes and therefore not appropriate for 'net'.

^ permalink raw reply

* [PATCH net-next] pkt_sched: fq: use proper locking in fq_dump_stats()
From: Eric Dumazet @ 2016-09-15 23:20 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

From: Eric Dumazet <edumazet@google.com>

When fq is used on 32bit kernels, we need to lock the qdisc before
copying 64bit fields.

Otherwise "tc -s qdisc ..." might report bogus values.

Fixes: afe4fd062416 ("pkt_sched: fq: Fair Queue packet scheduler")
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/sched/sch_fq.c |   32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index e5458b99e09c..dc52cc10d6ed 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -823,20 +823,24 @@ nla_put_failure:
 static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 {
 	struct fq_sched_data *q = qdisc_priv(sch);
-	u64 now = ktime_get_ns();
-	struct tc_fq_qd_stats st = {
-		.gc_flows		= q->stat_gc_flows,
-		.highprio_packets	= q->stat_internal_packets,
-		.tcp_retrans		= q->stat_tcp_retrans,
-		.throttled		= q->stat_throttled,
-		.flows_plimit		= q->stat_flows_plimit,
-		.pkts_too_long		= q->stat_pkts_too_long,
-		.allocation_errors	= q->stat_allocation_errors,
-		.flows			= q->flows,
-		.inactive_flows		= q->inactive_flows,
-		.throttled_flows	= q->throttled_flows,
-		.time_next_delayed_flow	= q->time_next_delayed_flow - now,
-	};
+	struct tc_fq_qd_stats st;
+
+	sch_tree_lock(sch);
+
+	st.gc_flows		  = q->stat_gc_flows;
+	st.highprio_packets	  = q->stat_internal_packets;
+	st.tcp_retrans		  = q->stat_tcp_retrans;
+	st.throttled		  = q->stat_throttled;
+	st.flows_plimit		  = q->stat_flows_plimit;
+	st.pkts_too_long	  = q->stat_pkts_too_long;
+	st.allocation_errors	  = q->stat_allocation_errors;
+	st.time_next_delayed_flow = q->time_next_delayed_flow - ktime_get_ns();
+	st.flows		  = q->flows;
+	st.inactive_flows	  = q->inactive_flows;
+	st.throttled_flows	  = q->throttled_flows;
+	st.pad			  = 0;
+
+	sch_tree_unlock(sch);
 
 	return gnet_stats_copy_app(d, &st, sizeof(st));
 }

^ permalink raw reply related

* Re: [PATCH net-next] net/sched: act_tunnel_key: Remove rcu_read_lock protection
From: David Miller @ 2016-09-15 23:18 UTC (permalink / raw)
  To: hadarh
  Cc: netdev, jiri, jbenc, jhs, shmulik.ladkani, tom, edumazet,
	xiyou.wangcong, amirva, ogerlitz
In-Reply-To: <1473682762-8150-1-git-send-email-hadarh@mellanox.com>

From: Hadar Hen Zion <hadarh@mellanox.com>
Date: Mon, 12 Sep 2016 15:19:21 +0300

> Remove rcu_read_lock protection from tunnel_key_dump and use
> rtnl_dereference, dump operation is protected by  rtnl lock.
> 
> Also, remove rcu_read_lock from tunnel_key_release and use
> rcu_dereference_protected.
> 
> Both operations are running exclusively and a writer couldn't modify
> t->params while those functions are executed.
> 
> Fixes: 54d94fd89d90 ('net/sched: Introduce act_tunnel_key')
> Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>

Applied.

^ permalink raw reply

* Re: [PATCH] test_bpf: fix the dummy skb after dissector changes
From: David Miller @ 2016-09-15 23:17 UTC (permalink / raw)
  To: jakub.kicinski; +Cc: ast, daniel, netdev, hadarh
In-Reply-To: <1473681897-32260-1-git-send-email-jakub.kicinski@netronome.com>

From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 12 Sep 2016 13:04:57 +0100

> Commit d5709f7ab776 ("flow_dissector: For stripped vlan, get vlan
> info from skb->vlan_tci") made flow dissector look at vlan_proto
> when vlan is present.  Since test_bpf sets skb->vlan_tci to ~0
> (including VLAN_TAG_PRESENT) we have to populate skb->vlan_proto.
> 
> Fixes false negative on test #24:
> test_bpf: #24 LD_PAYLOAD_OFF jited:0 175 ret 0 != 42 FAIL (1 times)
> 
> Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
> Reviewed-by: Dinan Gunawardena <dinan.gunawardena@netronome.com>

Applied.

^ permalink raw reply

* Re: [PATCH][V2] atm: iphase: fix newline escape and minor tweak to source formatting
From: David Miller @ 2016-09-15 23:16 UTC (permalink / raw)
  To: colin.king; +Cc: 3chas3, linux-atm-general, netdev, linux-kernel
In-Reply-To: <20160912120150.31390-1-colin.king@canonical.com>

From: Colin King <colin.king@canonical.com>
Date: Mon, 12 Sep 2016 13:01:50 +0100

> From: Colin Ian King <colin.king@canonical.com>
> 
> The newline escape is incorrect and needs fixing. Also adjust source
> formatting / indentation and add { } to trailing else.
> 
> Signed-off-by: Colin Ian King <colin.king@canonical.com>

Applied.

^ permalink raw reply

* Re: [PATCH v2 2/2] openvswitch: use percpu flow stats
From: Eric Dumazet @ 2016-09-15 23:09 UTC (permalink / raw)
  To: Thadeu Lima de Souza Cascardo; +Cc: netdev, dev, pshelar, David Miller
In-Reply-To: <1473977513-7617-2-git-send-email-cascardo@redhat.com>

On Thu, 2016-09-15 at 19:11 -0300, Thadeu Lima de Souza Cascardo wrote:
> Instead of using flow stats per NUMA node, use it per CPU. When using
> megaflows, the stats lock can be a bottleneck in scalability.
> 
> On a E5-2690 12-core system, usual throughput went from ~4Mpps to
> ~15Mpps when forwarding between two 40GbE ports with a single flow
> configured on the datapath.
> 
> This has been tested on a system with possible CPUs 0-7,16-23. After
> module removal, there was no corruption on the slab cache.
> 
> Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@redhat.com>
> Cc: pravin shelar <pshelar@ovn.org>
> ---

> +	/* We open code this to make sure cpu 0 is always considered */
> +	for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpu_possible_mask))
> +		if (flow->stats[cpu])
>  			kmem_cache_free(flow_stats_cache,
> -					(struct flow_stats __force *)flow->stats[node]);
> +					(struct flow_stats __force *)flow->stats[cpu]);
>  	kmem_cache_free(flow_cache, flow);
>  }
>  
> @@ -757,7 +749,7 @@ int ovs_flow_init(void)
>  	BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));
>  
>  	flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
> -				       + (nr_node_ids
> +				       + (nr_cpu_ids
>  					  * sizeof(struct flow_stats *)),
>  				       0, 0, NULL);
>  	if (flow_cache == NULL)

Well, if you switch to percpu stats, better use normal
alloc_percpu(struct flow_stats)

The code was dealing with per node allocation so could not use existing
helper.

No need to keep this forever.

^ permalink raw reply

* Re: [PATCH v3] net: ip, diag -- Add diag interface for raw sockets
From: Eric Dumazet @ 2016-09-15 22:48 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: David Ahern, netdev, linux-kernel, David Miller, kuznet, jmorris,
	yoshfuji, kaber, avagin, stephen
In-Reply-To: <20160915210126.GC1867@uranus.lan>

On Fri, 2016-09-16 at 00:01 +0300, Cyrill Gorcunov wrote:

> Here I get kicked off the server. Login back
> 
> [cyrill@uranus ~] ssh root@pcs7 
> Last login: Thu Sep 15 23:20:42 2016 from gateway
> [root@pcs7 ~]# cd /home/iproute2/
> [root@pcs7 iproute2]# misc/ss -A raw
> State      Recv-Q Send-Q                                Local Address:Port                                                 Peer Address:Port                
> UNCONN     0      0                                                :::ipv6-icmp                                                      :::*                    
> UNCONN     0      0                                                :::ipv6-icmp                                                      :::*                    
> 
> Maybe I do something wrong for testing?

If you kill your shell, maybe /root/sock is killed as well, thus its raw
sockets are closed.

Try to be selective in the -K , do not kill tcp sockets ?

^ permalink raw reply

* Re: MDB offloading of local ipv4 multicast groups
From: Andrew Lunn @ 2016-09-15 22:28 UTC (permalink / raw)
  To: John Crispin
  Cc: Elad Raz, netdev@vger.kernel.org, Ido Schimmel, Jiri Pirko,
	Nikolay Aleksandrov, David S. Miller
In-Reply-To: <ebb1092f-4fab-c490-0553-b108feb5e8d4@phrozen.org>

On Thu, Sep 15, 2016 at 08:58:50PM +0200, John Crispin wrote:
> Hi,
> 
> While adding MDB support to the qca8k dsa driver I found that ipv4 mcast
> groups don't always get propagated to the dsa driver. In my setup there
> are 2 clients connected to the switch, both running a mdns client. The
> .port_mdb_add() callback is properly called for 33:33:00:00:00:FB but
> 01:00:5E:00:00:FB never got propagated to the dsa driver.
> 
> The reason is that the call to ipv4_is_local_multicast() here [1] will
> return true and the notifier is never called. Is this intentional or is
> there something missing in the code ?

Hi John

I've not looked too deeply at this yet, but here is my take on how it
should work.

By default, the switch needs to flood all multicast traffic from any
port in a bridge, to all other ports in a bridge, including the host.

Adding an mdb entry allows you to reduce where such flooding should
occur, i.e. it allows you to implement IGMP snooping and block traffic
going out a port when you know there is nobody interested in the
traffic on that port.

	Andrew

^ permalink raw reply

* [PATCH v2 2/2] openvswitch: use percpu flow stats
From: Thadeu Lima de Souza Cascardo @ 2016-09-15 22:11 UTC (permalink / raw)
  To: netdev; +Cc: dev, pshelar, David Miller, Eric Dumazet
In-Reply-To: <1473977513-7617-1-git-send-email-cascardo@redhat.com>

Instead of using flow stats per NUMA node, use it per CPU. When using
megaflows, the stats lock can be a bottleneck in scalability.

On a E5-2690 12-core system, usual throughput went from ~4Mpps to
~15Mpps when forwarding between two 40GbE ports with a single flow
configured on the datapath.

This has been tested on a system with possible CPUs 0-7,16-23. After
module removal, there was no corruption in the slab cache.

Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@redhat.com>
Cc: pravin shelar <pshelar@ovn.org>
---

v2:
* use smp_processor_id as ovs_flow_stats_update is always called from BH
context
* use kmem_cache_zalloc to allocate flow

---
 net/openvswitch/flow.c       | 42 ++++++++++++++++++++++--------------------
 net/openvswitch/flow.h       |  4 ++--
 net/openvswitch/flow_table.c | 26 +++++++++-----------------
 3 files changed, 33 insertions(+), 39 deletions(-)

diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 5b80612..0fa45439 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -29,6 +29,7 @@
 #include <linux/module.h>
 #include <linux/in.h>
 #include <linux/rcupdate.h>
+#include <linux/cpumask.h>
 #include <linux/if_arp.h>
 #include <linux/ip.h>
 #include <linux/ipv6.h>
@@ -72,32 +73,33 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
 {
 	struct flow_stats *stats;
 	int node = numa_node_id();
+	int cpu = smp_processor_id();
 	int len = skb->len + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
 
-	stats = rcu_dereference(flow->stats[node]);
+	stats = rcu_dereference(flow->stats[cpu]);
 
-	/* Check if already have node-specific stats. */
+	/* Check if already have CPU-specific stats. */
 	if (likely(stats)) {
 		spin_lock(&stats->lock);
 		/* Mark if we write on the pre-allocated stats. */
-		if (node == 0 && unlikely(flow->stats_last_writer != node))
-			flow->stats_last_writer = node;
+		if (cpu == 0 && unlikely(flow->stats_last_writer != cpu))
+			flow->stats_last_writer = cpu;
 	} else {
 		stats = rcu_dereference(flow->stats[0]); /* Pre-allocated. */
 		spin_lock(&stats->lock);
 
-		/* If the current NUMA-node is the only writer on the
+		/* If the current CPU is the only writer on the
 		 * pre-allocated stats keep using them.
 		 */
-		if (unlikely(flow->stats_last_writer != node)) {
+		if (unlikely(flow->stats_last_writer != cpu)) {
 			/* A previous locker may have already allocated the
-			 * stats, so we need to check again.  If node-specific
+			 * stats, so we need to check again.  If CPU-specific
 			 * stats were already allocated, we update the pre-
 			 * allocated stats as we have already locked them.
 			 */
-			if (likely(flow->stats_last_writer != NUMA_NO_NODE)
-			    && likely(!rcu_access_pointer(flow->stats[node]))) {
-				/* Try to allocate node-specific stats. */
+			if (likely(flow->stats_last_writer != -1) &&
+			    likely(!rcu_access_pointer(flow->stats[cpu]))) {
+				/* Try to allocate CPU-specific stats. */
 				struct flow_stats *new_stats;
 
 				new_stats =
@@ -114,12 +116,12 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
 					new_stats->tcp_flags = tcp_flags;
 					spin_lock_init(&new_stats->lock);
 
-					rcu_assign_pointer(flow->stats[node],
+					rcu_assign_pointer(flow->stats[cpu],
 							   new_stats);
 					goto unlock;
 				}
 			}
-			flow->stats_last_writer = node;
+			flow->stats_last_writer = cpu;
 		}
 	}
 
@@ -136,15 +138,15 @@ void ovs_flow_stats_get(const struct sw_flow *flow,
 			struct ovs_flow_stats *ovs_stats,
 			unsigned long *used, __be16 *tcp_flags)
 {
-	int node;
+	int cpu;
 
 	*used = 0;
 	*tcp_flags = 0;
 	memset(ovs_stats, 0, sizeof(*ovs_stats));
 
-	/* We open code this to make sure node 0 is always considered */
-	for (node = 0; node < MAX_NUMNODES; node = next_node(node, node_possible_map)) {
-		struct flow_stats *stats = rcu_dereference_ovsl(flow->stats[node]);
+	/* We open code this to make sure cpu 0 is always considered */
+	for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpu_possible_mask)) {
+		struct flow_stats *stats = rcu_dereference_ovsl(flow->stats[cpu]);
 
 		if (stats) {
 			/* Local CPU may write on non-local stats, so we must
@@ -164,11 +166,11 @@ void ovs_flow_stats_get(const struct sw_flow *flow,
 /* Called with ovs_mutex. */
 void ovs_flow_stats_clear(struct sw_flow *flow)
 {
-	int node;
+	int cpu;
 
-	/* We open code this to make sure node 0 is always considered */
-	for (node = 0; node < MAX_NUMNODES; node = next_node(node, node_possible_map)) {
-		struct flow_stats *stats = ovsl_dereference(flow->stats[node]);
+	/* We open code this to make sure cpu 0 is always considered */
+	for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpu_possible_mask)) {
+		struct flow_stats *stats = ovsl_dereference(flow->stats[cpu]);
 
 		if (stats) {
 			spin_lock_bh(&stats->lock);
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index 156a302..ae783f5 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -178,14 +178,14 @@ struct sw_flow {
 		struct hlist_node node[2];
 		u32 hash;
 	} flow_table, ufid_table;
-	int stats_last_writer;		/* NUMA-node id of the last writer on
+	int stats_last_writer;		/* CPU id of the last writer on
 					 * 'stats[0]'.
 					 */
 	struct sw_flow_key key;
 	struct sw_flow_id id;
 	struct sw_flow_mask *mask;
 	struct sw_flow_actions __rcu *sf_acts;
-	struct flow_stats __rcu *stats[]; /* One for each NUMA node.  First one
+	struct flow_stats __rcu *stats[]; /* One for each CPU.  First one
 					   * is allocated at flow creation time,
 					   * the rest are allocated on demand
 					   * while holding the 'stats[0].lock'.
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index 957a3c3..ea7a807 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -32,6 +32,7 @@
 #include <linux/module.h>
 #include <linux/in.h>
 #include <linux/rcupdate.h>
+#include <linux/cpumask.h>
 #include <linux/if_arp.h>
 #include <linux/ip.h>
 #include <linux/ipv6.h>
@@ -79,17 +80,12 @@ struct sw_flow *ovs_flow_alloc(void)
 {
 	struct sw_flow *flow;
 	struct flow_stats *stats;
-	int node;
 
-	flow = kmem_cache_alloc(flow_cache, GFP_KERNEL);
+	flow = kmem_cache_zalloc(flow_cache, GFP_KERNEL);
 	if (!flow)
 		return ERR_PTR(-ENOMEM);
 
-	flow->sf_acts = NULL;
-	flow->mask = NULL;
-	flow->id.unmasked_key = NULL;
-	flow->id.ufid_len = 0;
-	flow->stats_last_writer = NUMA_NO_NODE;
+	flow->stats_last_writer = -1;
 
 	/* Initialize the default stat node. */
 	stats = kmem_cache_alloc_node(flow_stats_cache,
@@ -102,10 +98,6 @@ struct sw_flow *ovs_flow_alloc(void)
 
 	RCU_INIT_POINTER(flow->stats[0], stats);
 
-	for_each_node(node)
-		if (node != 0)
-			RCU_INIT_POINTER(flow->stats[node], NULL);
-
 	return flow;
 err:
 	kmem_cache_free(flow_cache, flow);
@@ -142,17 +134,17 @@ static struct flex_array *alloc_buckets(unsigned int n_buckets)
 
 static void flow_free(struct sw_flow *flow)
 {
-	int node;
+	int cpu;
 
 	if (ovs_identifier_is_key(&flow->id))
 		kfree(flow->id.unmasked_key);
 	if (flow->sf_acts)
 		ovs_nla_free_flow_actions((struct sw_flow_actions __force *)flow->sf_acts);
-	/* We open code this to make sure node 0 is always considered */
-	for (node = 0; node < MAX_NUMNODES; node = next_node(node, node_possible_map))
-		if (node != 0 && flow->stats[node])
+	/* We open code this to make sure cpu 0 is always considered */
+	for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpu_possible_mask))
+		if (flow->stats[cpu])
 			kmem_cache_free(flow_stats_cache,
-					(struct flow_stats __force *)flow->stats[node]);
+					(struct flow_stats __force *)flow->stats[cpu]);
 	kmem_cache_free(flow_cache, flow);
 }
 
@@ -757,7 +749,7 @@ int ovs_flow_init(void)
 	BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));
 
 	flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
-				       + (nr_node_ids
+				       + (nr_cpu_ids
 					  * sizeof(struct flow_stats *)),
 				       0, 0, NULL);
 	if (flow_cache == NULL)
-- 
2.7.4

^ permalink raw reply related

* [PATCH v2 1/2] openvswitch: fix flow stats accounting when node 0 is not possible
From: Thadeu Lima de Souza Cascardo @ 2016-09-15 22:11 UTC (permalink / raw)
  To: netdev-u79uwXL29TY76Z2rM5mHXA
  Cc: dev-yBygre7rU0TnMu66kgdUjQ, David Miller, Eric Dumazet

On a system with only node 1 as possible, all statistics are going to be
accounted on node 0 as it will have a single writer.

However, when getting and clearing the statistics, node 0 is not going
to be considered, as it's not a possible node.

Tested that statistics are not zero on a system with only node 1
possible. Also compile-tested with CONFIG_NUMA off.

Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@redhat.com>
---
 net/openvswitch/flow.c       | 6 ++++--
 net/openvswitch/flow_table.c | 5 +++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 1240ae3..5b80612 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -142,7 +142,8 @@ void ovs_flow_stats_get(const struct sw_flow *flow,
 	*tcp_flags = 0;
 	memset(ovs_stats, 0, sizeof(*ovs_stats));
 
-	for_each_node(node) {
+	/* We open code this to make sure node 0 is always considered */
+	for (node = 0; node < MAX_NUMNODES; node = next_node(node, node_possible_map)) {
 		struct flow_stats *stats = rcu_dereference_ovsl(flow->stats[node]);
 
 		if (stats) {
@@ -165,7 +166,8 @@ void ovs_flow_stats_clear(struct sw_flow *flow)
 {
 	int node;
 
-	for_each_node(node) {
+	/* We open code this to make sure node 0 is always considered */
+	for (node = 0; node < MAX_NUMNODES; node = next_node(node, node_possible_map)) {
 		struct flow_stats *stats = ovsl_dereference(flow->stats[node]);
 
 		if (stats) {
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index d073fff..957a3c3 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -148,8 +148,9 @@ static void flow_free(struct sw_flow *flow)
 		kfree(flow->id.unmasked_key);
 	if (flow->sf_acts)
 		ovs_nla_free_flow_actions((struct sw_flow_actions __force *)flow->sf_acts);
-	for_each_node(node)
-		if (flow->stats[node])
+	/* We open code this to make sure node 0 is always considered */
+	for (node = 0; node < MAX_NUMNODES; node = next_node(node, node_possible_map))
+		if (node != 0 && flow->stats[node])
 			kmem_cache_free(flow_stats_cache,
 					(struct flow_stats __force *)flow->stats[node]);
 	kmem_cache_free(flow_cache, flow);
-- 
2.7.4

_______________________________________________
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev

^ permalink raw reply related

* Re: [RFC v3 03/22] bpf,landlock: Add a new arraymap type to deal with (Landlock) handles
From: Mickaël Salaün @ 2016-09-15 21:51 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: linux-kernel, Alexei Starovoitov, Andy Lutomirski, Arnd Bergmann,
	Casey Schaufler, Daniel Borkmann, Daniel Mack, David Drysdale,
	David S . Miller, Elena Reshetova, Eric W . Biederman,
	James Morris, Kees Cook, Paul Moore, Sargun Dhillon,
	Serge E . Hallyn, Tejun Heo, Will Drewry, kernel-hardening,
	linux-api, linux-security-module, netdev
In-Reply-To: <20160914232815.GE60248@ast-mbp.thefacebook.com>


[-- Attachment #1.1: Type: text/plain, Size: 6355 bytes --]



On 15/09/2016 01:28, Alexei Starovoitov wrote:
> On Thu, Sep 15, 2016 at 01:22:49AM +0200, Mickaël Salaün wrote:
>>
>> On 14/09/2016 20:51, Alexei Starovoitov wrote:
>>> On Wed, Sep 14, 2016 at 09:23:56AM +0200, Mickaël Salaün wrote:
>>>> This new arraymap looks like a set and brings new properties:
>>>> * strong typing of entries: the eBPF functions get the array type of
>>>>   elements instead of CONST_PTR_TO_MAP (e.g.
>>>>   CONST_PTR_TO_LANDLOCK_HANDLE_FS);
>>>> * force sequential filling (i.e. replace or append-only update), which
>>>>   allow quick browsing of all entries.
>>>>
>>>> This strong typing is useful to statically check if the content of a map
>>>> can be passed to an eBPF function. For example, Landlock use it to store
>>>> and manage kernel objects (e.g. struct file) instead of dealing with
>>>> userland raw data. This improve efficiency and ensure that an eBPF
>>>> program can only call functions with the right high-level arguments.
>>>>
>>>> The enum bpf_map_handle_type list low-level types (e.g.
>>>> BPF_MAP_HANDLE_TYPE_LANDLOCK_FS_FD) which are identified when
>>>> updating a map entry (handle). This handle types are used to infer a
>>>> high-level arraymap type which are listed in enum bpf_map_array_type
>>>> (e.g. BPF_MAP_ARRAY_TYPE_LANDLOCK_FS).
>>>>
>>>> For now, this new arraymap is only used by Landlock LSM (cf. next
>>>> commits) but it could be useful for other needs.
>>>>
>>>> Changes since v2:
>>>> * add a RLIMIT_NOFILE-based limit to the maximum number of arraymap
>>>>   handle entries (suggested by Andy Lutomirski)
>>>> * remove useless checks
>>>>
>>>> Changes since v1:
>>>> * arraymap of handles replace custom checker groups
>>>> * simpler userland API
>>>>
>>>> Signed-off-by: Mickaël Salaün <mic@digikod.net>
>>>> Cc: Alexei Starovoitov <ast@kernel.org>
>>>> Cc: Andy Lutomirski <luto@amacapital.net>
>>>> Cc: Daniel Borkmann <daniel@iogearbox.net>
>>>> Cc: David S. Miller <davem@davemloft.net>
>>>> Cc: Kees Cook <keescook@chromium.org>
>>>> Link: https://lkml.kernel.org/r/CALCETrWwTiz3kZTkEgOW24-DvhQq6LftwEXh77FD2G5o71yD7g@mail.gmail.com
>>>> ---
>>>>  include/linux/bpf.h      |  14 ++++
>>>>  include/uapi/linux/bpf.h |  18 +++++
>>>>  kernel/bpf/arraymap.c    | 203 +++++++++++++++++++++++++++++++++++++++++++++++
>>>>  kernel/bpf/verifier.c    |  12 ++-
>>>>  4 files changed, 246 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>>>> index fa9a988400d9..eae4ce4542c1 100644
>>>> --- a/include/linux/bpf.h
>>>> +++ b/include/linux/bpf.h
>>>> @@ -13,6 +13,10 @@
>>>>  #include <linux/percpu.h>
>>>>  #include <linux/err.h>
>>>>  
>>>> +#ifdef CONFIG_SECURITY_LANDLOCK
>>>> +#include <linux/fs.h> /* struct file */
>>>> +#endif /* CONFIG_SECURITY_LANDLOCK */
>>>> +
>>>>  struct perf_event;
>>>>  struct bpf_map;
>>>>  
>>>> @@ -38,6 +42,7 @@ struct bpf_map_ops {
>>>>  struct bpf_map {
>>>>  	atomic_t refcnt;
>>>>  	enum bpf_map_type map_type;
>>>> +	enum bpf_map_array_type map_array_type;
>>>>  	u32 key_size;
>>>>  	u32 value_size;
>>>>  	u32 max_entries;
>>>> @@ -187,6 +192,9 @@ struct bpf_array {
>>>>  	 */
>>>>  	enum bpf_prog_type owner_prog_type;
>>>>  	bool owner_jited;
>>>> +#ifdef CONFIG_SECURITY_LANDLOCK
>>>> +	u32 n_entries;	/* number of entries in a handle array */
>>>> +#endif /* CONFIG_SECURITY_LANDLOCK */
>>>>  	union {
>>>>  		char value[0] __aligned(8);
>>>>  		void *ptrs[0] __aligned(8);
>>>> @@ -194,6 +202,12 @@ struct bpf_array {
>>>>  	};
>>>>  };
>>>>  
>>>> +#ifdef CONFIG_SECURITY_LANDLOCK
>>>> +struct map_landlock_handle {
>>>> +	u32 type; /* enum bpf_map_handle_type */
>>>> +};
>>>> +#endif /* CONFIG_SECURITY_LANDLOCK */
>>>> +
>>>>  #define MAX_TAIL_CALL_CNT 32
>>>>  
>>>>  struct bpf_event_entry {
>>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>>> index 7cd36166f9b7..b68de57f7ab8 100644
>>>> --- a/include/uapi/linux/bpf.h
>>>> +++ b/include/uapi/linux/bpf.h
>>>> @@ -87,6 +87,15 @@ enum bpf_map_type {
>>>>  	BPF_MAP_TYPE_PERCPU_ARRAY,
>>>>  	BPF_MAP_TYPE_STACK_TRACE,
>>>>  	BPF_MAP_TYPE_CGROUP_ARRAY,
>>>> +	BPF_MAP_TYPE_LANDLOCK_ARRAY,
>>>> +};
>>>> +
>>>> +enum bpf_map_array_type {
>>>> +	BPF_MAP_ARRAY_TYPE_UNSPEC,
>>>> +};
>>>> +
>>>> +enum bpf_map_handle_type {
>>>> +	BPF_MAP_HANDLE_TYPE_UNSPEC,
>>>>  };
>>>
>>> missing something. why it has to be special to have its own
>>> fd array implementation?
>>> Please take a look how BPF_MAP_TYPE_PERF_EVENT_ARRAY, 
>>> BPF_MAP_TYPE_CGROUP_ARRAY and BPF_MAP_TYPE_PROG_ARRAY are done.
>>> They all store objects into array map that user space passes via FD.
>>> I think the same model should apply here.
>>
>> The idea is to have multiple ways for userland to describe a resource
>> (e.g. an open file descriptor, a path or a glob pattern). The kernel
>> representation could then be a "struct path *" or dedicated types (e.g.
>> custom glob).
> 
> hmm. I think user space api should only deal with FD. Everything
> else is user space job to encapsulate/hide.

How would you create a FD referring to a glob, a user or port ranges for
example ?

> 
>> Another interesting point (that could replace
>> check_map_func_compatibility()) is that BPF_MAP_TYPE_LANDLOCK_ARRAY
>> translate to dedicated (abstract) types (instead of CONST_PTR_TO_MAP)
>> thanks to bpf_reg_type_from_map(). This is useful to abstract userland
>> (map) interface with kernel object(s) dealing with that type.
> 
> I probably missing something. If user space interface is FD,
> to the kernel they're different object types. Nothing else.

Yes but what if there is more than one way to express a resource (cf.
previous comment). A FD can refer to an *existing file* but a glob
pattern could match a bunch of files (existing or not). This was a
concern for Kees Cook and James Morris [1].

[1]
https://lkml.kernel.org/r/CAGXu5jK1U12vMk11HD_x_gNz3Rk4ZgEfdThY7DHvm4e4sPRh4g@mail.gmail.com

> 
>> A third point is that BPF_MAP_TYPE_LANDLOCK_ARRAY is a kind of set. It
>> is optimized to quickly walk through all the elements in a sequential way.
> 
> why set is any faster to walk vs array?

It is an array with only sequential entries (i.e. no hole in the array).


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 455 bytes --]

^ permalink raw reply

* Re: [RFC v3 07/22] landlock: Handle file comparisons
From: Mickaël Salaün @ 2016-09-15 21:25 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: linux-kernel, Alexei Starovoitov, Andy Lutomirski, Arnd Bergmann,
	Casey Schaufler, Daniel Borkmann, Daniel Mack, David Drysdale,
	David S . Miller, Elena Reshetova, Eric W . Biederman,
	James Morris, Kees Cook, Paul Moore, Sargun Dhillon,
	Serge E . Hallyn, Tejun Heo, Will Drewry, kernel-hardening,
	linux-api, linux-security-module, netdev
In-Reply-To: <20160914232418.GD60248@ast-mbp.thefacebook.com>


[-- Attachment #1.1: Type: text/plain, Size: 3218 bytes --]


On 15/09/2016 01:24, Alexei Starovoitov wrote:
> On Thu, Sep 15, 2016 at 01:02:22AM +0200, Mickaël Salaün wrote:
>>>
>>> I would suggest for the next RFC to do minimal 7 patches up to this point
>>> with simple example that demonstrates the use case.
>>> I would avoid all unpriv stuff and all of seccomp for the next RFC as well,
>>> otherwise I don't think we can realistically make forward progress, since
>>> there are too many issues raised in the subsequent patches.
>>
>> I hope we will find a common agreement about seccomp vs cgroup… I think
>> both approaches have their advantages, can be complementary and nicely
>> combined.
> 
> I don't mind having both task based lsm and cgroup based as long as
> infrastructure is not duplicated and scaling issues from earlier version
> are resolved.

It should be much better with this RFC.

> I'm proposing to do cgroup only for the next RFC, since mine and Sargun's
> use case for this bpf+lsm+cgroup is _not_ security or sandboxing.

Well, LSM purpose is to do security stuff. The main goal of Landlock is
to bring security features to userland, including unprivileged
processes, at least via the seccomp interface [1].

> No need for unpriv, no_new_priv to cgroups are other things that Andy
> is concerned about.

I'm concerned about security too! :)

> 
>> Unprivileged sandboxing is the main goal of Landlock. This should not be
>> a problem, even for privileged features, thanks to the new subtype/access.
> 
> yes. the point that unpriv stuff can come later after agreement is reached.
> If we keep arguing about seccomp details this set won't go anywhere.
> Even in basic part (which is cgroup+bpf+lsm) are plenty of questions
> to be still agreed.

Using the seccomp(2) (unpriv) *interface* is OK according to a more
recent thread [1].

[1]
https://lkml.kernel.org/r/20160915044852.GA66000@ast-mbp.thefacebook.com

> 
>> Agreed. With this RFC, the Checmate features (i.e. network helpers)
>> should be able to sit on top of Landlock.
> 
> I think neither of them should be called fancy names for no technical reason.
> We will have only one bpf based lsm. That's it and it doesn't
> need an obscure name. Directory name can be security/bpf/..stuff.c

I disagree on an LSM named "BPF". I first started with the "seccomp LSM"
name (first RFC) but I later realized that it is confusing because
seccomp is associated to its syscall and the underlying features. Same
thing goes for BPF. It is also needlessly hard to grep for a name that is
already so widely used in the kernel source tree.
Making an association between the generic eBPF mechanism and a security
centric approach (i.e. LSM) seems a bit reductive (for BPF). Moreover,
the seccomp interface [1] can still be used.

Landlock is a nice name to depict a sandbox as an enclave (i.e. a
landlocked country/state). I want to keep this name, which is simple,
expresses the goal of Landlock nicely and is comparable to other sandbox
mechanisms such as Seatbelt or Pledge.
Landlock should not be confused with the underlying eBPF implementation.
Landlock could use more than only eBPF in the future and eBPF could be
used in other LSM as well.

 Mickaël


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 455 bytes --]

^ permalink raw reply

* [PATCH v2 net-next 7/7] ila: Resolver mechanism
From: Tom Herbert @ 2016-09-15 21:19 UTC (permalink / raw)
  To: davem, netdev; +Cc: kernel-team, roopa, tgraf
In-Reply-To: <1473974361-2275254-1-git-send-email-tom@herbertland.com>

Implement an ILA resolver. This uses LWT to implement the hook to a
userspace resolver and tracks pending unresolved addresses using the
backend net resolver.

The idea is that the kernel sets an ILA resolver route to the
SIR prefix, something like:

ip route add 3333::/64 encap ila-resolve \
     via 2401:db00:20:911a::27:0 dev eth0

When a packet hits the route the address is looked up in a resolver
table. If the entry is created (no entry with the address already
exists) then an rtnl message is generated with group
RTNLGRP_ILA_NOTIFY and type RTM_ADDR_RESOLVE. A userspace
daemon can listen for such messages and perform an ILA resolution
protocol to determine the ILA mapping. If the mapping is resolved
then a /128 ILA encap route is set so that the host can perform
ILA translation and send directly to destination.

Signed-off-by: Tom Herbert <tom@herbertland.com>
---
 include/uapi/linux/ila.h       |   9 ++
 include/uapi/linux/lwtunnel.h  |   1 +
 include/uapi/linux/rtnetlink.h |   8 +-
 net/ipv6/Kconfig               |   1 +
 net/ipv6/ila/Makefile          |   2 +-
 net/ipv6/ila/ila.h             |  16 +++
 net/ipv6/ila/ila_common.c      |   7 ++
 net/ipv6/ila/ila_lwt.c         |   9 ++
 net/ipv6/ila/ila_resolver.c    | 249 +++++++++++++++++++++++++++++++++++++++++
 net/ipv6/ila/ila_xlat.c        |  15 ++-
 10 files changed, 307 insertions(+), 10 deletions(-)
 create mode 100644 net/ipv6/ila/ila_resolver.c

diff --git a/include/uapi/linux/ila.h b/include/uapi/linux/ila.h
index 948c0a9..f186f8b 100644
--- a/include/uapi/linux/ila.h
+++ b/include/uapi/linux/ila.h
@@ -42,4 +42,13 @@ enum {
 	ILA_CSUM_NO_ACTION,
 };
 
+enum {
+	ILA_NOTIFY_ATTR_UNSPEC,
+	ILA_NOTIFY_ATTR_TIMEOUT,		/* u32 */
+
+	__ILA_NOTIFY_ATTR_MAX,
+};
+
+#define ILA_NOTIFY_ATTR_MAX	(__ILA_NOTIFY_ATTR_MAX - 1)
+
 #endif /* _UAPI_LINUX_ILA_H */
diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h
index a478fe8..d880e49 100644
--- a/include/uapi/linux/lwtunnel.h
+++ b/include/uapi/linux/lwtunnel.h
@@ -9,6 +9,7 @@ enum lwtunnel_encap_types {
 	LWTUNNEL_ENCAP_IP,
 	LWTUNNEL_ENCAP_ILA,
 	LWTUNNEL_ENCAP_IP6,
+	LWTUNNEL_ENCAP_ILA_NOTIFY,
 	__LWTUNNEL_ENCAP_MAX,
 };
 
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 262f037..a775464 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -12,7 +12,8 @@
  */
 #define RTNL_FAMILY_IPMR		128
 #define RTNL_FAMILY_IP6MR		129
-#define RTNL_FAMILY_MAX			129
+#define RTNL_FAMILY_ILA			130
+#define RTNL_FAMILY_MAX			130
 
 /****
  *		Routing/neighbour discovery messages.
@@ -144,6 +145,9 @@ enum {
 	RTM_GETSTATS = 94,
 #define RTM_GETSTATS RTM_GETSTATS
 
+	RTM_ADDR_RESOLVE = 95,
+#define RTM_ADDR_RESOLVE RTM_ADDR_RESOLVE
+
 	__RTM_MAX,
 #define RTM_MAX		(((__RTM_MAX + 3) & ~3) - 1)
 };
@@ -656,6 +660,8 @@ enum rtnetlink_groups {
 #define RTNLGRP_MPLS_ROUTE	RTNLGRP_MPLS_ROUTE
 	RTNLGRP_NSID,
 #define RTNLGRP_NSID		RTNLGRP_NSID
+	RTNLGRP_ILA_NOTIFY,
+#define RTNLGRP_ILA_NOTIFY	RTNLGRP_ILA_NOTIFY
 	__RTNLGRP_MAX
 };
 #define RTNLGRP_MAX	(__RTNLGRP_MAX - 1)
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 2343e4f..cf3ea8e 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -97,6 +97,7 @@ config IPV6_ILA
 	tristate "IPv6: Identifier Locator Addressing (ILA)"
 	depends on NETFILTER
 	select LWTUNNEL
+	select NET_EXT_RESOLVER
 	---help---
 	  Support for IPv6 Identifier Locator Addressing (ILA).
 
diff --git a/net/ipv6/ila/Makefile b/net/ipv6/ila/Makefile
index 4b32e59..f2aadc3 100644
--- a/net/ipv6/ila/Makefile
+++ b/net/ipv6/ila/Makefile
@@ -4,4 +4,4 @@
 
 obj-$(CONFIG_IPV6_ILA) += ila.o
 
-ila-objs := ila_common.o ila_lwt.o ila_xlat.o
+ila-objs := ila_common.o ila_lwt.o ila_xlat.o ila_resolver.o
diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h
index e0170f6..e369611 100644
--- a/net/ipv6/ila/ila.h
+++ b/net/ipv6/ila/ila.h
@@ -15,6 +15,7 @@
 #include <linux/ip.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/rhashtable.h>
 #include <linux/socket.h>
 #include <linux/skbuff.h>
 #include <linux/types.h>
@@ -23,6 +24,16 @@
 #include <net/protocol.h>
 #include <uapi/linux/ila.h>
 
+extern unsigned int ila_net_id;
+
+struct ila_net {
+	struct rhashtable rhash_table;
+	spinlock_t *locks; /* Bucket locks for entry manipulation */
+	unsigned int locks_mask;
+	bool hooks_registered;
+	struct net_rslv *nrslv;
+};
+
 struct ila_locator {
 	union {
 		__u8            v8[8];
@@ -114,9 +125,14 @@ void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p,
 
 void ila_init_saved_csum(struct ila_params *p);
 
+void ila_rslv_resolved(struct ila_net *ilan, struct ila_addr *iaddr);
 int ila_lwt_init(void);
 void ila_lwt_fini(void);
 int ila_xlat_init(void);
 void ila_xlat_fini(void);
+int ila_rslv_init(void);
+void ila_rslv_fini(void);
+int ila_init_resolver_net(struct ila_net *ilan);
+void ila_exit_resolver_net(struct ila_net *ilan);
 
 #endif /* __ILA_H */
diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c
index aba0998..83c7d4a 100644
--- a/net/ipv6/ila/ila_common.c
+++ b/net/ipv6/ila/ila_common.c
@@ -157,7 +157,13 @@ static int __init ila_init(void)
 	if (ret)
 		goto fail_xlat;
 
+	ret = ila_rslv_init();
+	if (ret)
+		goto fail_rslv;
+
 	return 0;
+fail_rslv:
+	ila_xlat_fini();
 fail_xlat:
 	ila_lwt_fini();
 fail_lwt:
@@ -168,6 +174,7 @@ static void __exit ila_fini(void)
 {
 	ila_xlat_fini();
 	ila_lwt_fini();
+	ila_rslv_fini();
 }
 
 module_init(ila_init);
diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c
index 30a6920..70d8988 100644
--- a/net/ipv6/ila/ila_lwt.c
+++ b/net/ipv6/ila/ila_lwt.c
@@ -9,6 +9,7 @@
 #include <net/ip.h>
 #include <net/ip6_fib.h>
 #include <net/lwtunnel.h>
+#include <net/netns/generic.h>
 #include <net/protocol.h>
 #include <uapi/linux/ila.h>
 #include "ila.h"
@@ -122,6 +123,14 @@ static int ila_build_state(struct net *net, struct net_device *dev,
 
 	*ts = newts;
 
+	if (cfg6->fc_dst_len >= sizeof(struct ila_addr)) {
+		struct net *net = dev_net(dev);
+		struct ila_net *ilan = net_generic(net, ila_net_id);
+
+		/* Cancel any pending resolution on this address */
+		ila_rslv_resolved(ilan, iaddr);
+	}
+
 	return 0;
 }
 
diff --git a/net/ipv6/ila/ila_resolver.c b/net/ipv6/ila/ila_resolver.c
new file mode 100644
index 0000000..0f5a819
--- /dev/null
+++ b/net/ipv6/ila/ila_resolver.c
@@ -0,0 +1,249 @@
+#include <linux/errno.h>
+#include <linux/ip.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/types.h>
+#include <net/checksum.h>
+#include <net/ip.h>
+#include <net/ip6_fib.h>
+#include <net/lwtunnel.h>
+#include <net/netns/generic.h>
+#include <net/protocol.h>
+#include <net/resolver.h>
+#include <uapi/linux/ila.h>
+#include "ila.h"
+
+struct ila_notify_params {
+	unsigned int timeout;
+};
+
+static inline struct ila_notify_params *ila_notify_params_lwtunnel(
+	struct lwtunnel_state *lwstate)
+{
+	return (struct ila_notify_params *)lwstate->data;
+}
+
+static int ila_fill_notify(struct sk_buff *skb, struct in6_addr *addr,
+			   u32 pid, u32 seq, int event, int flags)
+{
+	struct nlmsghdr *nlh;
+	struct rtmsg *rtm;
+
+	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	rtm = nlmsg_data(nlh);
+	rtm->rtm_family   = RTNL_FAMILY_ILA;
+	rtm->rtm_dst_len  = 128;
+	rtm->rtm_src_len  = 0;
+	rtm->rtm_tos      = 0;
+	rtm->rtm_table    = RT6_TABLE_UNSPEC;
+	rtm->rtm_type     = RTN_UNICAST;
+	rtm->rtm_scope    = RT_SCOPE_UNIVERSE;
+
+	if (nla_put_in6_addr(skb, RTA_DST, addr)) {
+		nlmsg_cancel(skb, nlh);
+		return -EMSGSIZE;
+	}
+
+	nlmsg_end(skb, nlh);
+	return 0;
+}
+
+static size_t ila_rslv_msgsize(void)
+{
+	size_t len =
+		NLMSG_ALIGN(sizeof(struct rtmsg))
+		+ nla_total_size(16)     /* RTA_DST */
+		;
+
+	return len;
+}
+
+void ila_rslv_notify(struct net *net, struct sk_buff *skb)
+{
+	struct ipv6hdr *ip6h = ipv6_hdr(skb);
+	struct sk_buff *nlskb;
+	int err = 0;
+
+	/* Send ILA notification to user */
+	nlskb = nlmsg_new(ila_rslv_msgsize(), GFP_KERNEL);
+	if (!nlskb)
+		goto errout;
+
+	err = ila_fill_notify(nlskb, &ip6h->daddr, 0, 0, RTM_ADDR_RESOLVE,
+			      NLM_F_MULTI);
+	if (err < 0) {
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(nlskb);
+		goto errout;
+	}
+	rtnl_notify(nlskb, net, 0, RTNLGRP_ILA_NOTIFY, NULL, GFP_ATOMIC);
+	return;
+
+errout:
+	if (err < 0)
+		rtnl_set_sk_err(net, RTNLGRP_ILA_NOTIFY, err);
+}
+
+static int ila_rslv_output(struct net *net, struct sock *sk,
+			   struct sk_buff *skb)
+{
+	struct ila_net *ilan = net_generic(net, ila_net_id);
+	struct dst_entry *dst = skb_dst(skb);
+	struct ipv6hdr *ip6h = ipv6_hdr(skb);
+	struct ila_notify_params *p;
+	bool new;
+
+	p = ila_notify_params_lwtunnel(dst->lwtstate);
+
+	/* Don't bother taking rcu lock, we only want to know if the entry
+	 * exists or not.
+	 */
+	net_rslv_lookup_and_create(ilan->nrslv, &ip6h->daddr, &new,
+				   p->timeout);
+
+	if (new)
+		ila_rslv_notify(net, skb);
+
+	return dst->lwtstate->orig_output(net, sk, skb);
+}
+
+void ila_rslv_resolved(struct ila_net *ilan, struct ila_addr *iaddr)
+{
+	if (ilan->nrslv)
+		net_rslv_resolved(ilan->nrslv, iaddr);
+}
+
+static int ila_rslv_input(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+
+	return dst->lwtstate->orig_input(skb);
+}
+
+static const struct nla_policy ila_notify_nl_policy[ILA_NOTIFY_ATTR_MAX + 1] = {
+	[ILA_NOTIFY_ATTR_TIMEOUT] = { .type = NLA_U32, },
+};
+
+static int ila_rslv_build_state(struct net *net, struct net_device *dev,
+				struct nlattr *nla, unsigned int family,
+				const void *cfg, struct lwtunnel_state **ts)
+{
+	struct ila_notify_params *p;
+	struct nlattr *tb[ILA_NOTIFY_ATTR_MAX + 1];
+	struct lwtunnel_state *newts;
+	struct ila_net *ilan = net_generic(net, ila_net_id);
+	size_t encap_len = sizeof(*p);
+	int ret;
+
+	if (unlikely(!ilan->nrslv)) {
+		int err;
+
+		/* Only create net resolver on demand */
+		err = ila_init_resolver_net(ilan);
+		if (err)
+			return err;
+	}
+
+	if (family != AF_INET6)
+		return -EINVAL;
+
+	ret = nla_parse_nested(tb, ILA_NOTIFY_ATTR_MAX, nla,
+			       ila_notify_nl_policy);
+
+	if (ret < 0)
+		return ret;
+
+	newts = lwtunnel_state_alloc(encap_len);
+	if (!newts)
+		return -ENOMEM;
+
+	newts->len = 0;
+	newts->type = LWTUNNEL_ENCAP_ILA_NOTIFY;
+	newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT |
+			LWTUNNEL_STATE_INPUT_REDIRECT;
+
+	p = ila_notify_params_lwtunnel(newts);
+
+	if (tb[ILA_NOTIFY_ATTR_TIMEOUT])
+		p->timeout = msecs_to_jiffies(nla_get_u32(
+			tb[ILA_NOTIFY_ATTR_TIMEOUT]));
+
+	*ts = newts;
+
+	return 0;
+}
+
+static int ila_rslv_fill_encap_info(struct sk_buff *skb,
+				    struct lwtunnel_state *lwtstate)
+{
+	struct ila_notify_params *p = ila_notify_params_lwtunnel(lwtstate);
+
+	if (nla_put_u32(skb, ILA_NOTIFY_ATTR_TIMEOUT,
+			(__force u32)jiffies_to_msecs(p->timeout)))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+static int ila_rslv_nlsize(struct lwtunnel_state *lwtstate)
+{
+	return nla_total_size(sizeof(u32)) + /* ILA_NOTIFY_ATTR_TIMEOUT */
+	       0;
+}
+
+static int ila_rslv_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+	return 0;
+}
+
+static const struct lwtunnel_encap_ops ila_rslv_ops = {
+	.build_state = ila_rslv_build_state,
+	.output = ila_rslv_output,
+	.input = ila_rslv_input,
+	.fill_encap = ila_rslv_fill_encap_info,
+	.get_encap_size = ila_rslv_nlsize,
+	.cmp_encap = ila_rslv_cmp,
+};
+
+#define ILA_MAX_SIZE 8192
+
+int ila_init_resolver_net(struct ila_net *ilan)
+{
+	struct net_rslv *nrslv;
+
+	nrslv = net_rslv_create(sizeof(struct ila_addr),
+				sizeof(struct ila_addr), ILA_MAX_SIZE,
+				NULL, NULL, NULL);
+
+	if (IS_ERR(nrslv))
+		return PTR_ERR(nrslv);
+
+	ilan->nrslv = nrslv;
+
+	return 0;
+}
+
+void ila_exit_resolver_net(struct ila_net *ilan)
+{
+	if (ilan->nrslv)
+		net_rslv_destroy(ilan->nrslv);
+}
+
+int ila_rslv_init(void)
+{
+	return lwtunnel_encap_add_ops(&ila_rslv_ops, LWTUNNEL_ENCAP_ILA_NOTIFY);
+}
+
+void ila_rslv_fini(void)
+{
+	lwtunnel_encap_del_ops(&ila_rslv_ops, LWTUNNEL_ENCAP_ILA_NOTIFY);
+}
diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index 7d1c34b..857f8b5 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -21,14 +21,7 @@ struct ila_map {
 	struct rcu_head rcu;
 };
 
-static unsigned int ila_net_id;
-
-struct ila_net {
-	struct rhashtable rhash_table;
-	spinlock_t *locks; /* Bucket locks for entry manipulation */
-	unsigned int locks_mask;
-	bool hooks_registered;
-};
+unsigned int ila_net_id;
 
 static u32 hashrnd __read_mostly;
 static __always_inline void __ila_hash_secret_init(void)
@@ -546,6 +539,10 @@ static __net_init int ila_init_net(struct net *net)
 	if (err)
 		return err;
 
+	/* Resolver net is created on demand when LWT ILA resolver route
+	 * is made.
+	 */
+
 	rhashtable_init(&ilan->rhash_table, &rht_params);
 
 	return 0;
@@ -557,6 +554,8 @@ static __net_exit void ila_exit_net(struct net *net)
 
 	rhashtable_free_and_destroy(&ilan->rhash_table, ila_free_cb, NULL);
 
+	ila_exit_resolver_net(ilan);
+
 	free_bucket_spinlocks(ilan->locks);
 
 	if (ilan->hooks_registered)
-- 
2.8.0.rc2

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox