Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [PATCH v2 net-next 1/2] flow dissector: ICMP support
From: Tom Herbert @ 2016-12-03 18:52 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Simon Horman, David Miller, Linux Kernel Network Developers,
	Jay Vosburgh, Veaceslav Falico, Andy Gospodarek, Jamal Hadi Salim,
	Jiri Pirko
In-Reply-To: <20161203104918.GA1880@nanopsycho.orion>

On Sat, Dec 3, 2016 at 2:49 AM, Jiri Pirko <jiri@resnulli.us> wrote:
> Fri, Dec 02, 2016 at 09:31:41PM CET, simon.horman@netronome.com wrote:
>>Allow dissection of ICMP(V6) type and code. This re-uses transport layer
>>port dissection code as although ICMP is not a transport protocol and their
>>type and code are not ports this allows sharing of both code and storage.
>>
>>Signed-off-by: Simon Horman <simon.horman@netronome.com>
>>---
>> drivers/net/bonding/bond_main.c |  6 +++--
>> include/linux/skbuff.h          |  5 +++++
>> include/net/flow_dissector.h    | 50 ++++++++++++++++++++++++++++++++++++++---
>> net/core/flow_dissector.c       | 34 +++++++++++++++++++++++++---
>> net/sched/cls_flow.c            |  4 ++--
>> 5 files changed, 89 insertions(+), 10 deletions(-)
>>
>>diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
>>index 8029dd4912b6..a6f75cfb2bf7 100644
>>--- a/drivers/net/bonding/bond_main.c
>>+++ b/drivers/net/bonding/bond_main.c
>>@@ -3181,7 +3181,8 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb,
>>       } else {
>>               return false;
>>       }
>>-      if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34 && proto >= 0)
>>+      if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34 &&
>>+          proto >= 0 && !skb_flow_is_icmp_any(skb, proto))
>>               fk->ports.ports = skb_flow_get_ports(skb, noff, proto);
>>
>>       return true;
>>@@ -3209,7 +3210,8 @@ u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb)
>>               return bond_eth_hash(skb);
>>
>>       if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER23 ||
>>-          bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP23)
>>+          bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP23 ||
>>+          flow_keys_are_icmp_any(&flow))
>>               hash = bond_eth_hash(skb);
>>       else
>>               hash = (__force u32)flow.ports.ports;
>>diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
>>index 9c535fbccf2c..44a8f69a9198 100644
>>--- a/include/linux/skbuff.h
>>+++ b/include/linux/skbuff.h
>>@@ -1094,6 +1094,11 @@ u32 __skb_get_poff(const struct sk_buff *skb, void *data,
>> __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
>>                           void *data, int hlen_proto);
>>
>>+static inline bool skb_flow_is_icmp_any(const struct sk_buff *skb, u8 ip_proto)
>>+{
>>+      return flow_protos_are_icmp_any(skb->protocol, ip_proto);
>>+}
>>+
>> static inline __be32 skb_flow_get_ports(const struct sk_buff *skb,
>>                                       int thoff, u8 ip_proto)
>> {
>>diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
>>index c4f31666afd2..5540dfa18872 100644
>>--- a/include/net/flow_dissector.h
>>+++ b/include/net/flow_dissector.h
>>@@ -2,6 +2,7 @@
>> #define _NET_FLOW_DISSECTOR_H
>>
>> #include <linux/types.h>
>>+#include <linux/in.h>
>> #include <linux/in6.h>
>> #include <uapi/linux/if_ether.h>
>>
>>@@ -89,10 +90,15 @@ struct flow_dissector_key_addrs {
>> };
>>
>> /**
>>- * flow_dissector_key_tp_ports:
>>- *    @ports: port numbers of Transport header
>>+ * flow_dissector_key_ports:
>>+ *    @ports: port numbers of Transport header or
>>+ *            type and code of ICMP header
>>+ *            ports: source (high) and destination (low) port numbers
>>  *            src: source port number
>>  *            dst: destination port number
>>+ *            icmp: ICMP type (high) and code (low)
>>+ *            type: ICMP type
>>+ *            type: ICMP code
>>  */
>> struct flow_dissector_key_ports {
>>       union {
>>@@ -101,6 +107,11 @@ struct flow_dissector_key_ports {
>>                       __be16 src;
>>                       __be16 dst;
>>               };
>>+              __be16 icmp;
>>+              struct {
>>+                      u8 type;
>>+                      u8 code;
>>+              };
>
> Digging into this a bit more. I think it would be much nice not to mix
> up l4 ports and icmp stuff.
>
> How about to have FLOW_DISSECTOR_KEY_ICMP
> and
> struct flow_dissector_key_icmp {
>         u8 type;
>         u8 code;
> };
>
> The you can make this structure and struct flow_dissector_key_ports into
> an union in struct flow_keys.
>
> Looks much cleaner to me.
>
I agree, this patch adds to many conditionals into the fast path for
ICMP handling. Neither is there much point in using type and code as
input to the packet hash.

Tom

>
>
>>       };
>> };
>>
>>@@ -188,9 +199,42 @@ struct flow_keys_digest {
>> void make_flow_keys_digest(struct flow_keys_digest *digest,
>>                          const struct flow_keys *flow);
>>
>>+static inline bool flow_protos_are_icmpv4(__be16 n_proto, u8 ip_proto)
>>+{
>>+      return n_proto == htons(ETH_P_IP) && ip_proto == IPPROTO_ICMP;
>>+}
>>+
>>+static inline bool flow_protos_are_icmpv6(__be16 n_proto, u8 ip_proto)
>>+{
>>+      return n_proto == htons(ETH_P_IPV6) && ip_proto == IPPROTO_ICMPV6;
>>+}
>>+
>>+static inline bool flow_protos_are_icmp_any(__be16 n_proto, u8 ip_proto)
>>+{
>>+      return flow_protos_are_icmpv4(n_proto, ip_proto) ||
>>+              flow_protos_are_icmpv6(n_proto, ip_proto);
>>+}
>>+
>>+static inline bool flow_basic_key_is_icmpv4(const struct flow_dissector_key_basic *basic)
>>+{
>>+      return flow_protos_are_icmpv4(basic->n_proto, basic->ip_proto);
>>+}
>>+
>>+static inline bool flow_basic_key_is_icmpv6(const struct flow_dissector_key_basic *basic)
>>+{
>>+      return flow_protos_are_icmpv6(basic->n_proto, basic->ip_proto);
>>+}
>>+
>>+static inline bool flow_keys_are_icmp_any(const struct flow_keys *keys)
>>+{
>>+      return flow_protos_are_icmp_any(keys->basic.n_proto,
>>+                                      keys->basic.ip_proto);
>>+}
>>+
>> static inline bool flow_keys_have_l4(const struct flow_keys *keys)
>> {
>>-      return (keys->ports.ports || keys->tags.flow_label);
>>+      return (!flow_keys_are_icmp_any(keys) && keys->ports.ports) ||
>>+              keys->tags.flow_label;
>> }
>>
>> u32 flow_hash_from_keys(struct flow_keys *keys);
>>diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
>>index 1eb6f949e5b2..0584b4bb4390 100644
>>--- a/net/core/flow_dissector.c
>>+++ b/net/core/flow_dissector.c
>>@@ -58,6 +58,28 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
>> EXPORT_SYMBOL(skb_flow_dissector_init);
>>
>> /**
>>+ * skb_flow_get_be16 - extract be16 entity
>>+ * @skb: sk_buff to extract from
>>+ * @poff: offset to extract at
>>+ * @data: raw buffer pointer to the packet
>>+ * @hlen: packet header length
>>+ *
>>+ * The function will try to retrieve a be32 entity at
>>+ * offset poff
>>+ */
>>+__be16 skb_flow_get_be16(const struct sk_buff *skb, int poff, void *data,
>>+                       int hlen)
>>+{
>>+      __be16 *u, _u;
>>+
>>+      u = __skb_header_pointer(skb, poff, sizeof(_u), data, hlen, &_u);
>>+      if (u)
>>+              return *u;
>>+
>>+      return 0;
>>+}
>>+
>>+/**
>>  * __skb_flow_get_ports - extract the upper layer ports and return them
>>  * @skb: sk_buff to extract the ports from
>>  * @thoff: transport header offset
>>@@ -542,8 +564,13 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
>>               key_ports = skb_flow_dissector_target(flow_dissector,
>>                                                     FLOW_DISSECTOR_KEY_PORTS,
>>                                                     target_container);
>>-              key_ports->ports = __skb_flow_get_ports(skb, nhoff, ip_proto,
>>-                                                      data, hlen);
>>+              if (flow_protos_are_icmp_any(proto, ip_proto))
>>+                      key_ports->icmp = skb_flow_get_be16(skb, nhoff, data,
>>+                                                          hlen);
>>+              else
>>+                      key_ports->ports = __skb_flow_get_ports(skb, nhoff,
>>+                                                              ip_proto, data,
>>+                                                              hlen);
>>       }
>>
>> out_good:
>>@@ -718,7 +745,8 @@ void make_flow_keys_digest(struct flow_keys_digest *digest,
>>
>>       data->n_proto = flow->basic.n_proto;
>>       data->ip_proto = flow->basic.ip_proto;
>>-      data->ports = flow->ports.ports;
>>+      if (flow_keys_have_l4(flow))
>>+              data->ports = flow->ports.ports;
>>       data->src = flow->addrs.v4addrs.src;
>>       data->dst = flow->addrs.v4addrs.dst;
>> }
>>diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
>>index e39672394c7b..a1a7ae71aa62 100644
>>--- a/net/sched/cls_flow.c
>>+++ b/net/sched/cls_flow.c
>>@@ -96,7 +96,7 @@ static u32 flow_get_proto(const struct sk_buff *skb,
>> static u32 flow_get_proto_src(const struct sk_buff *skb,
>>                             const struct flow_keys *flow)
>> {
>>-      if (flow->ports.ports)
>>+      if (!flow_keys_are_icmp_any(flow) && flow->ports.ports)
>>               return ntohs(flow->ports.src);
>>
>>       return addr_fold(skb->sk);
>>@@ -105,7 +105,7 @@ static u32 flow_get_proto_src(const struct sk_buff *skb,
>> static u32 flow_get_proto_dst(const struct sk_buff *skb,
>>                             const struct flow_keys *flow)
>> {
>>-      if (flow->ports.ports)
>>+      if (!flow_keys_are_icmp_any(flow) && flow->ports.ports)
>>               return ntohs(flow->ports.dst);
>>
>>       return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb);
>>--
>>2.7.0.rc3.207.g0ac5344
>>

^ permalink raw reply

* Re: [PATCH 1/1] netdev: broadcom: propagate error code
From: Michael Chan @ 2016-12-03 18:50 UTC (permalink / raw)
  To: Pan Bian
  Cc: David S. Miller, Prashant Sreedharan, Satish Baddipadige, Netdev,
	open list
In-Reply-To: <1480758977-4234-1-git-send-email-bianpan2016@163.com>

On Sat, Dec 3, 2016 at 1:56 AM, Pan Bian <bianpan2016@163.com> wrote:
> Function bnxt_hwrm_stat_ctx_alloc() always returns 0, even if the call
> to _hwrm_send_message() fails. It may be better to propagate the errors
> to the caller of bnxt_hwrm_stat_ctx_alloc().
>
> Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=188661
>
> Signed-off-by: Pan Bian <bianpan2016@163.com>

Acked-by: Michael Chan <michael.chan@broadcom.com>

^ permalink raw reply

* [Patch net-next] act_mirred: fix a typo in get_dev
From: Cong Wang @ 2016-12-03 18:36 UTC (permalink / raw)
  To: netdev; +Cc: Cong Wang, Hadar Hen Zion, Jiri Pirko

Cc: Hadar Hen Zion <hadarh@mellanox.com>
Cc: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
---
 net/sched/act_mirred.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index bb09ba3..2d9fa6e 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -321,7 +321,7 @@ static int tcf_mirred_device(const struct tc_action *a, struct net *net,
 	int ifindex = tcf_mirred_ifindex(a);
 
 	*mirred_dev = __dev_get_by_index(net, ifindex);
-	if (!mirred_dev)
+	if (!*mirred_dev)
 		return -EINVAL;
 	return 0;
 }
-- 
2.1.4

^ permalink raw reply related

* Re: net: use-after-free in worker_thread
From: Cong Wang @ 2016-12-03 18:14 UTC (permalink / raw)
  To: Andrey Konovalov
  Cc: David S. Miller, Johannes Berg, Florian Westphal, Herbert Xu,
	Eric Dumazet, Bob Copeland, Tom Herbert, David Decotigny, netdev,
	LKML
In-Reply-To: <CAM_iQpUSSc6xvAd3u5C3d=xk_yL4LUBgui7pgK5AHxJ8LP363A@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 2026 bytes --]

On Sat, Dec 3, 2016 at 9:41 AM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> On Sat, Dec 3, 2016 at 4:56 AM, Andrey Konovalov <andreyknvl@google.com> wrote:
>> Hi!
>>
>> I'm seeing lots of the following error reports while running the
>> syzkaller fuzzer.
>>
>> Reports appeared when I updated to 3c49de52 (Dec 2) from 2caceb32 (Dec 1).
>>
>> ==================================================================
>> BUG: KASAN: use-after-free in worker_thread+0x17d8/0x18a0
>> Read of size 8 at addr ffff880067f3ecd8 by task kworker/3:1/774
>>
>> page:ffffea00019fce00 count:1 mapcount:0 mapping:          (null)
>> index:0xffff880067f39c10 compound_mapcount: 0
>> flags: 0x500000000004080(slab|head)
>> page dumped because: kasan: bad access detected
>>
>> CPU: 3 PID: 774 Comm: kworker/3:1 Not tainted 4.9.0-rc7+ #66
>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
>>  ffff88006c267838 ffffffff81f882da ffffffff6c25e338 1ffff1000d84ce9a
>>  ffffed000d84ce92 ffff88006c25e340 0000000041b58ab3 ffffffff8541e198
>>  ffffffff81f88048 0000000100000000 0000000041b58ab3 ffffffff853d3ee8
>> Call Trace:
>>  [<     inline     >] __dump_stack lib/dump_stack.c:15
>>  [<ffffffff81f882da>] dump_stack+0x292/0x398 lib/dump_stack.c:51
>>  [<     inline     >] describe_address mm/kasan/report.c:262
>>  [<ffffffff817e50d1>] kasan_report_error+0x121/0x560 mm/kasan/report.c:368
>>  [<     inline     >] kasan_report mm/kasan/report.c:390
>>  [<ffffffff817e560e>] __asan_report_load8_noabort+0x3e/0x40
>> mm/kasan/report.c:411
>>  [<ffffffff81329b88>] worker_thread+0x17d8/0x18a0 kernel/workqueue.c:2228
>>  [<ffffffff8133ebf3>] kthread+0x323/0x3e0 kernel/kthread.c:209
>>  [<ffffffff84a2a22a>] ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:433
>
> Heck... this is the pending work vs. sk_destruct() race. :-/
> We can't wait for the work in RCU callback, let me think about it...

Please try the attached patch, I only did compile test, I can't access
my desktop now, so can't do further tests.

Thanks!

[-- Attachment #2: netlink.diff --]
[-- Type: text/plain, Size: 2005 bytes --]

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 602e5eb..6f33013 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -322,11 +322,13 @@ static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
 	sk_mem_charge(sk, skb->truesize);
 }
 
-static void __netlink_sock_destruct(struct sock *sk)
+static void netlink_sock_destruct(struct sock *sk)
 {
 	struct netlink_sock *nlk = nlk_sk(sk);
 
 	if (nlk->cb_running) {
+		if (nlk->cb.done)
+			nlk->cb.done(&nlk->cb);
 		module_put(nlk->cb.module);
 		kfree_skb(nlk->cb.skb);
 	}
@@ -343,28 +345,6 @@ static void __netlink_sock_destruct(struct sock *sk)
 	WARN_ON(nlk_sk(sk)->groups);
 }
 
-static void netlink_sock_destruct_work(struct work_struct *work)
-{
-	struct netlink_sock *nlk = container_of(work, struct netlink_sock,
-						work);
-
-	nlk->cb.done(&nlk->cb);
-	__netlink_sock_destruct(&nlk->sk);
-}
-
-static void netlink_sock_destruct(struct sock *sk)
-{
-	struct netlink_sock *nlk = nlk_sk(sk);
-
-	if (nlk->cb_running && nlk->cb.done) {
-		INIT_WORK(&nlk->work, netlink_sock_destruct_work);
-		schedule_work(&nlk->work);
-		return;
-	}
-
-	__netlink_sock_destruct(sk);
-}
-
 /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on
  * SMP. Look, when several writers sleep and reader wakes them up, all but one
  * immediately hit write lock and grab all the cpus. Exclusive sleep solves
@@ -664,11 +644,19 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol,
 	goto out;
 }
 
+static void netlink_sock_put_work(struct work_struct *work)
+{
+	struct netlink_sock *nlk = container_of(work, struct netlink_sock,
+						work);
+	sock_put(&nlk->sk);
+}
+
 static void deferred_put_nlk_sk(struct rcu_head *head)
 {
 	struct netlink_sock *nlk = container_of(head, struct netlink_sock, rcu);
 
-	sock_put(&nlk->sk);
+	INIT_WORK(&nlk->work, netlink_sock_put_work);
+	schedule_work(&nlk->work);
 }
 
 static int netlink_release(struct socket *sock)

^ permalink raw reply related

* Re: Possible regression due to "net/sched: cls_flower: Add offload support using egress Hardware device"
From: Simon Horman @ 2016-12-03 17:54 UTC (permalink / raw)
  To: Or Gerlitz; +Cc: Hadar Hen Zion, Linux Netdev List, Roi Dayan, Amir Vadai
In-Reply-To: <CAJ3xEMiTgDq-qE_HEdGc-F7U24Be+XS6RtsG-22zUzrV1Z_neA@mail.gmail.com>

On Sat, Dec 03, 2016 at 06:06:08PM +0200, Or Gerlitz wrote:
> On Sat, Dec 3, 2016 at 3:17 PM, Simon Horman <horms@verge.net.au> wrote:
> 
> > in net-next I am observing what appears to be an regression in net-next due to:
> > 7091d8c7055d ("net/sched: cls_flower: Add offload support using egress Hardware device")
> >
> > The problem occurs when adding a flower filter (without offload to a virtio device).
> 
> > # ethtool -d eth0
> > ethtool -i eth0
> > driver: virtio_net
> 
> > # tc qdisc add dev eth0 ingress
> > # tc filter add dev eth0 protocol ip parent ffff: flower indev eth0
> > [  104.302779] BUG: unable to handle kernel NULL pointer dereference at 00000000000000d5
> 
> Simon, I don't see an action here, is that missing in purpose? wasn't
> sure what such filter does, but ofcourse
> if it was supported by the patches it has to be so after changing
> things as well, we will check and fix.
> 

Hi Or,

sorry, I trimmed the action by mistake when trying to make a minimal test
case. Originally I noticed this bug with something like the following:

tc filter add dev eth0 protocol ip parent ffff: flower indev eth0 ip_proto udp dst_port 53 action drop

I don't think the fields flower is matching on or the action are
particularly important when trying to reproduce the problem. But I could be
wrong.

^ permalink raw reply

* Re: net: use-after-free in worker_thread
From: Cong Wang @ 2016-12-03 17:41 UTC (permalink / raw)
  To: Andrey Konovalov
  Cc: David S. Miller, Johannes Berg, Florian Westphal, Herbert Xu,
	Eric Dumazet, Bob Copeland, Tom Herbert, David Decotigny, netdev,
	LKML
In-Reply-To: <CAAeHK+x95-n__YSbzebp51ez78yjqjK4CJL=tgOgPuBuGh+q1A@mail.gmail.com>

On Sat, Dec 3, 2016 at 4:56 AM, Andrey Konovalov <andreyknvl@google.com> wrote:
> Hi!
>
> I'm seeing lots of the following error reports while running the
> syzkaller fuzzer.
>
> Reports appeared when I updated to 3c49de52 (Dec 2) from 2caceb32 (Dec 1).
>
> ==================================================================
> BUG: KASAN: use-after-free in worker_thread+0x17d8/0x18a0
> Read of size 8 at addr ffff880067f3ecd8 by task kworker/3:1/774
>
> page:ffffea00019fce00 count:1 mapcount:0 mapping:          (null)
> index:0xffff880067f39c10 compound_mapcount: 0
> flags: 0x500000000004080(slab|head)
> page dumped because: kasan: bad access detected
>
> CPU: 3 PID: 774 Comm: kworker/3:1 Not tainted 4.9.0-rc7+ #66
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
>  ffff88006c267838 ffffffff81f882da ffffffff6c25e338 1ffff1000d84ce9a
>  ffffed000d84ce92 ffff88006c25e340 0000000041b58ab3 ffffffff8541e198
>  ffffffff81f88048 0000000100000000 0000000041b58ab3 ffffffff853d3ee8
> Call Trace:
>  [<     inline     >] __dump_stack lib/dump_stack.c:15
>  [<ffffffff81f882da>] dump_stack+0x292/0x398 lib/dump_stack.c:51
>  [<     inline     >] describe_address mm/kasan/report.c:262
>  [<ffffffff817e50d1>] kasan_report_error+0x121/0x560 mm/kasan/report.c:368
>  [<     inline     >] kasan_report mm/kasan/report.c:390
>  [<ffffffff817e560e>] __asan_report_load8_noabort+0x3e/0x40
> mm/kasan/report.c:411
>  [<ffffffff81329b88>] worker_thread+0x17d8/0x18a0 kernel/workqueue.c:2228
>  [<ffffffff8133ebf3>] kthread+0x323/0x3e0 kernel/kthread.c:209
>  [<ffffffff84a2a22a>] ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:433

Heck... this is the pending work vs. sk_destruct() race. :-/
We can't wait for the work in RCU callback, let me think about it...

^ permalink raw reply

* Re: [PATCH v2 net] ixgbevf: fix invalid uses of napi_hash_del()
From: Jeff Kirsher @ 2016-12-03 16:46 UTC (permalink / raw)
  To: Eric Dumazet, David Miller; +Cc: netdev
In-Reply-To: <1480777216.18162.404.camel@edumazet-glaptop3.roam.corp.google.com>

[-- Attachment #1: Type: text/plain, Size: 2231 bytes --]

On Sat, 2016-12-03 at 07:00 -0800, Eric Dumazet wrote:
> On Wed, 2016-11-16 at 07:26 -0800, Eric Dumazet wrote:
> > From: Eric Dumazet <edumazet@google.com>
> > 
> > Calling napi_hash_del() before netif_napi_del() is dangerous
> > if a synchronize_rcu() is not enforced before NAPI struct freeing.
> > 
> > Lets leave this detail to core networking stack to get it right.
> > 
> > Signed-off-by: Eric Dumazet <edumazet@google.com>
> > Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> > ---
> >   drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c |    6 ------
> >   1 file changed, 6 deletions(-)
> > 
> > diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
> > b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
> > index 7eaac3234049..bf4d7efc7dbd 100644
> > --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
> > +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
> > @@ -2511,9 +2511,6 @@ static int ixgbevf_alloc_q_vectors(struct
> > ixgbevf_adapter *adapter)
> >        while (q_idx) {
> >                q_idx--;
> >                q_vector = adapter->q_vector[q_idx];
> > -#ifdef CONFIG_NET_RX_BUSY_POLL
> > -             napi_hash_del(&q_vector->napi);
> > -#endif
> >                netif_napi_del(&q_vector->napi);
> >                kfree(q_vector);
> >                adapter->q_vector[q_idx] = NULL;
> > @@ -2537,9 +2534,6 @@ static void ixgbevf_free_q_vectors(struct
> > ixgbevf_adapter *adapter)
> >                struct ixgbevf_q_vector *q_vector = adapter-
> > >q_vector[q_idx];
> >   
> >                adapter->q_vector[q_idx] = NULL;
> > -#ifdef CONFIG_NET_RX_BUSY_POLL
> > -             napi_hash_del(&q_vector->napi);
> > -#endif
> >                netif_napi_del(&q_vector->napi);
> >                kfree(q_vector);
> >        }
> > 
> > 
> 
> It looks this patch was not picked up ?

Yeah, sorry I missed it since it was not sent to intel-wired-lan mailing
list.  Dave I am fine if you want to pick this up for your net tree (if it
is not too late).

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply

* Re: [flamebait] xdp, well meaning but pointless
From: Willem de Bruijn @ 2016-12-03 16:19 UTC (permalink / raw)
  To: Jesper Dangaard Brouer; +Cc: Florian Westphal, Network Development
In-Reply-To: <20161202182233.7d78459f@redhat.com>

On Fri, Dec 2, 2016 at 12:22 PM, Jesper Dangaard Brouer
<brouer@redhat.com> wrote:
>
> On Thu, 1 Dec 2016 10:11:08 +0100 Florian Westphal <fw@strlen.de> wrote:
>
>> In light of DPDKs existence it make a lot more sense to me to provide
>> a). a faster mmap based interface (possibly AF_PACKET based) that allows
>> to map nic directly into userspace, detaching tx/rx queue from kernel.
>>
>> John Fastabend sent something like this last year as a proof of
>> concept, iirc it was rejected because register space got exposed directly
>> to userspace.  I think we should re-consider merging netmap
>> (or something conceptually close to its design).
>
> I'm actually working in this direction, of zero-copy RX mapping packets
> into userspace.  This work is mostly related to page_pool, and I only
> plan to use XDP as a filter for selecting packets going to userspace,
> as this choice need to be taken very early.
>
> My design is here:
>  https://prototype-kernel.readthedocs.io/en/latest/vm/page_pool/design/memory_model_nic.html
>
> This is mostly about changing the memory model in the drivers, to allow
> for safely mapping pages to userspace.  (An efficient queue mechanism is
> not covered).

Virtio virtqueues are used in various other locations in the stack.
With separate memory pools and send + completion descriptor rings,
signal moderation, careful avoidance of cacheline bouncing, etc. these
seem like a good opportunity for a TPACKET_V4 format.

^ permalink raw reply

* Re: Possible regression due to "net/sched: cls_flower: Add offload support using egress Hardware device"
From: Or Gerlitz @ 2016-12-03 16:06 UTC (permalink / raw)
  To: Simon Horman; +Cc: Hadar Hen Zion, Linux Netdev List, Roi Dayan, Amir Vadai
In-Reply-To: <20161203131758.GA4695@verge.net.au>

On Sat, Dec 3, 2016 at 3:17 PM, Simon Horman <horms@verge.net.au> wrote:

> in net-next I am observing what appears to be an regression in net-next due to:
> 7091d8c7055d ("net/sched: cls_flower: Add offload support using egress Hardware device")
>
> The problem occurs when adding a flower filter (without offload to a virtio device).

> # ethtool -d eth0
> ethtool -i eth0
> driver: virtio_net

> # tc qdisc add dev eth0 ingress
> # tc filter add dev eth0 protocol ip parent ffff: flower indev eth0
> [  104.302779] BUG: unable to handle kernel NULL pointer dereference at 00000000000000d5

Simon, I don't see an action here, is that missing in purpose? wasn't
sure what such filter does, but ofcourse
if it was supported by the patches it has to be so after changing
things as well, we will check and fix.

Or.

^ permalink raw reply

* [patch net-next v4 10/10] ipv4: fib: Replay events when registering FIB notifier
From: Jiri Pirko @ 2016-12-03 15:45 UTC (permalink / raw)
  To: netdev
  Cc: davem, idosch, eladr, yotamg, nogahf, arkadis, ogerlitz, roopa,
	dsa, nikolay, andy, vivien.didelot, andrew, f.fainelli,
	alexander.h.duyck, hannes, kaber
In-Reply-To: <1480779907-32535-1-git-send-email-jiri@resnulli.us>

From: Ido Schimmel <idosch@mellanox.com>

Commit b90eb7549499 ("fib: introduce FIB notification infrastructure")
introduced a new notification chain to notify listeners (f.e., switchdev
drivers) about addition and deletion of routes.

However, upon registration to the chain the FIB tables can already be
populated, which means potential listeners will have an incomplete view
of the tables.

Solve that by dumping the FIB tables and replaying the events to the
passed notification block. The dump itself is done using RCU in order
not to starve consumers that need RTNL to make progress.

The integrity of the dump is ensured by reading the FIB change sequence
counter before and after the dump under RTNL. This allows us to avoid
the problematic situation in which the dumping process sends a ENTRY_ADD
notification following ENTRY_DEL generated by another process holding
RTNL.

Callers of the registration function may pass a callback that is
executed in case the dump was inconsistent with current FIB tables.

The number of retries until a consistent dump is achieved is set to a
fixed number to prevent callers from looping for long periods of time.
In case current limit proves to be problematic in the future, it can be
easily converted to be configurable using a sysctl.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 .../net/ethernet/mellanox/mlxsw/spectrum_router.c  |  20 ++-
 drivers/net/ethernet/rocker/rocker_main.c          |   8 +-
 include/net/ip_fib.h                               |   3 +-
 net/ipv4/fib_trie.c                                | 148 ++++++++++++++++++++-
 4 files changed, 174 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 14bed1d..53126bf 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -2027,6 +2027,18 @@ static int mlxsw_sp_router_fib_event(struct notifier_block *nb,
 	return NOTIFY_DONE;
 }
 
+static void mlxsw_sp_router_fib_dump_flush(struct notifier_block *nb)
+{
+	struct mlxsw_sp *mlxsw_sp = container_of(nb, struct mlxsw_sp, fib_nb);
+
+	/* Flush pending FIB notifications and then flush the device's
+	 * table before requesting another dump. The FIB notification
+	 * block is unregistered, so no need to take RTNL.
+	 */
+	mlxsw_core_flush_owq();
+	mlxsw_sp_router_fib_flush(mlxsw_sp);
+}
+
 int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp)
 {
 	int err;
@@ -2047,9 +2059,15 @@ int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp)
 		goto err_neigh_init;
 
 	mlxsw_sp->fib_nb.notifier_call = mlxsw_sp_router_fib_event;
-	register_fib_notifier(&mlxsw_sp->fib_nb);
+	err = register_fib_notifier(&mlxsw_sp->fib_nb,
+				    mlxsw_sp_router_fib_dump_flush);
+	if (err)
+		goto err_register_fib_notifier;
+
 	return 0;
 
+err_register_fib_notifier:
+	mlxsw_sp_neigh_fini(mlxsw_sp);
 err_neigh_init:
 	mlxsw_sp_vrs_fini(mlxsw_sp);
 err_vrs_init:
diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c
index 8c9c90a..7c450b5 100644
--- a/drivers/net/ethernet/rocker/rocker_main.c
+++ b/drivers/net/ethernet/rocker/rocker_main.c
@@ -2804,8 +2804,13 @@ static int rocker_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 		goto err_alloc_ordered_workqueue;
 	}
 
+	/* Only FIBs pointing to our own netdevs are programmed into
+	 * the device, so no need to pass a callback.
+	 */
 	rocker->fib_nb.notifier_call = rocker_router_fib_event;
-	register_fib_notifier(&rocker->fib_nb);
+	err = register_fib_notifier(&rocker->fib_nb, NULL);
+	if (err)
+		goto err_register_fib_notifier;
 
 	rocker->hw.id = rocker_read64(rocker, SWITCH_ID);
 
@@ -2822,6 +2827,7 @@ static int rocker_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
 err_probe_ports:
 	unregister_fib_notifier(&rocker->fib_nb);
+err_register_fib_notifier:
 	destroy_workqueue(rocker->rocker_owq);
 err_alloc_ordered_workqueue:
 	free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_EVENT), rocker);
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 6c67b93..5f376af 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -221,7 +221,8 @@ enum fib_event_type {
 	FIB_EVENT_RULE_DEL,
 };
 
-int register_fib_notifier(struct notifier_block *nb);
+int register_fib_notifier(struct notifier_block *nb,
+			  void (*cb)(struct notifier_block *nb));
 int unregister_fib_notifier(struct notifier_block *nb);
 int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
 		       struct fib_notifier_info *info);
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 2891356..73a6270 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -84,11 +84,99 @@
 #include <trace/events/fib.h>
 #include "fib_lookup.h"
 
+static unsigned int fib_seq_sum(void)
+{
+	unsigned int fib_seq = 0;
+	struct net *net;
+
+	rtnl_lock();
+	for_each_net(net)
+		fib_seq += net->ipv4.fib_seq;
+	rtnl_unlock();
+
+	return fib_seq;
+}
+
 static ATOMIC_NOTIFIER_HEAD(fib_chain);
 
-int register_fib_notifier(struct notifier_block *nb)
+static int call_fib_notifier(struct notifier_block *nb, struct net *net,
+			     enum fib_event_type event_type,
+			     struct fib_notifier_info *info)
 {
-	return atomic_notifier_chain_register(&fib_chain, nb);
+	info->net = net;
+	return nb->notifier_call(nb, event_type, info);
+}
+
+static void fib_rules_notify(struct net *net, struct notifier_block *nb,
+			     enum fib_event_type event_type)
+{
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+	struct fib_notifier_info info;
+
+	if (net->ipv4.fib_has_custom_rules)
+		call_fib_notifier(nb, net, event_type, &info);
+#endif
+}
+
+static void fib_notify(struct net *net, struct notifier_block *nb,
+		       enum fib_event_type event_type);
+
+static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
+				   enum fib_event_type event_type, u32 dst,
+				   int dst_len, struct fib_info *fi,
+				   u8 tos, u8 type, u32 tb_id, u32 nlflags)
+{
+	struct fib_entry_notifier_info info = {
+		.dst = dst,
+		.dst_len = dst_len,
+		.fi = fi,
+		.tos = tos,
+		.type = type,
+		.tb_id = tb_id,
+		.nlflags = nlflags,
+	};
+	return call_fib_notifier(nb, net, event_type, &info.info);
+}
+
+static bool fib_dump_is_consistent(struct notifier_block *nb,
+				   void (*cb)(struct notifier_block *nb),
+				   unsigned int fib_seq)
+{
+	atomic_notifier_chain_register(&fib_chain, nb);
+	if (fib_seq == fib_seq_sum())
+		return true;
+	atomic_notifier_chain_unregister(&fib_chain, nb);
+	if (cb)
+		cb(nb);
+	return false;
+}
+
+#define FIB_DUMP_MAX_RETRIES 5
+int register_fib_notifier(struct notifier_block *nb,
+			  void (*cb)(struct notifier_block *nb))
+{
+	int retries = 0;
+
+	do {
+		unsigned int fib_seq = fib_seq_sum();
+		struct net *net;
+
+		/* Mutex semantics guarantee that every change done to
+		 * FIB tries before we read the change sequence counter
+		 * is now visible to us.
+		 */
+		rcu_read_lock();
+		for_each_net_rcu(net) {
+			fib_rules_notify(net, nb, FIB_EVENT_RULE_ADD);
+			fib_notify(net, nb, FIB_EVENT_ENTRY_ADD);
+		}
+		rcu_read_unlock();
+
+		if (fib_dump_is_consistent(nb, cb, fib_seq))
+			return 0;
+	} while (++retries < FIB_DUMP_MAX_RETRIES);
+
+	return -EBUSY;
 }
 EXPORT_SYMBOL(register_fib_notifier);
 
@@ -1902,6 +1990,62 @@ int fib_table_flush(struct net *net, struct fib_table *tb)
 	return found;
 }
 
+static void fib_leaf_notify(struct net *net, struct key_vector *l,
+			    struct fib_table *tb, struct notifier_block *nb,
+			    enum fib_event_type event_type)
+{
+	struct fib_alias *fa;
+
+	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
+		struct fib_info *fi = fa->fa_info;
+
+		if (!fi)
+			continue;
+
+		/* local and main table can share the same trie,
+		 * so don't notify twice for the same entry.
+		 */
+		if (tb->tb_id != fa->tb_id)
+			continue;
+
+		call_fib_entry_notifier(nb, net, event_type, l->key,
+					KEYLENGTH - fa->fa_slen, fi, fa->fa_tos,
+					fa->fa_type, fa->tb_id, 0);
+	}
+}
+
+static void fib_table_notify(struct net *net, struct fib_table *tb,
+			     struct notifier_block *nb,
+			     enum fib_event_type event_type)
+{
+	struct trie *t = (struct trie *)tb->tb_data;
+	struct key_vector *l, *tp = t->kv;
+	t_key key = 0;
+
+	while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
+		fib_leaf_notify(net, l, tb, nb, event_type);
+
+		key = l->key + 1;
+		/* stop in case of wrap around */
+		if (key < l->key)
+			break;
+	}
+}
+
+static void fib_notify(struct net *net, struct notifier_block *nb,
+		       enum fib_event_type event_type)
+{
+	unsigned int h;
+
+	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
+		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+		struct fib_table *tb;
+
+		hlist_for_each_entry_rcu(tb, head, tb_hlist)
+			fib_table_notify(net, tb, nb, event_type);
+	}
+}
+
 static void __trie_free_rcu(struct rcu_head *head)
 {
 	struct fib_table *tb = container_of(head, struct fib_table, rcu);
-- 
2.7.4

^ permalink raw reply related

* [patch net-next v4 09/10] ipv4: fib: Allow for consistent FIB dumping
From: Jiri Pirko @ 2016-12-03 15:45 UTC (permalink / raw)
  To: netdev
  Cc: davem, idosch, eladr, yotamg, nogahf, arkadis, ogerlitz, roopa,
	dsa, nikolay, andy, vivien.didelot, andrew, f.fainelli,
	alexander.h.duyck, hannes, kaber
In-Reply-To: <1480779907-32535-1-git-send-email-jiri@resnulli.us>

From: Ido Schimmel <idosch@mellanox.com>

The next patch will enable listeners of the FIB notification chain to
request a dump of the FIB tables. However, since RTNL isn't taken during
the dump, it's possible for the FIB tables to change mid-dump, which
will result in inconsistency between the listener's table and the
kernel's.

Allow listeners to know about changes that occurred mid-dump, by adding
a change sequence counter to each net namespace. The counter is
incremented just before a notification is sent in the FIB chain.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 include/net/netns/ipv4.h | 3 +++
 net/ipv4/fib_frontend.c  | 2 ++
 net/ipv4/fib_trie.c      | 1 +
 3 files changed, 6 insertions(+)

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 7adf438..f0cf5a1 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -135,6 +135,9 @@ struct netns_ipv4 {
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	int sysctl_fib_multipath_use_neigh;
 #endif
+
+	unsigned int	fib_seq;	/* protected by rtnl_mutex */
+
 	atomic_t	rt_genid;
 };
 #endif
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 121384b..dbad5a1 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1219,6 +1219,8 @@ static int __net_init ip_fib_net_init(struct net *net)
 	int err;
 	size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
 
+	net->ipv4.fib_seq = 0;
+
 	/* Avoid false sharing : Use at least a full cache line */
 	size = max_t(size_t, size, L1_CACHE_BYTES);
 
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 9bfce0d..2891356 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -101,6 +101,7 @@ EXPORT_SYMBOL(unregister_fib_notifier);
 int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
 		       struct fib_notifier_info *info)
 {
+	net->ipv4.fib_seq++;
 	info->net = net;
 	return atomic_notifier_call_chain(&fib_chain, event_type, info);
 }
-- 
2.7.4

^ permalink raw reply related

* [patch net-next v4 04/10] mlxsw: spectrum_router: Implement FIB offload in deferred work
From: Jiri Pirko @ 2016-12-03 15:45 UTC (permalink / raw)
  To: netdev
  Cc: davem, idosch, eladr, yotamg, nogahf, arkadis, ogerlitz, roopa,
	dsa, nikolay, andy, vivien.didelot, andrew, f.fainelli,
	alexander.h.duyck, hannes, kaber
In-Reply-To: <1480779907-32535-1-git-send-email-jiri@resnulli.us>

From: Ido Schimmel <idosch@mellanox.com>

FIB offload is currently done in process context with RTNL held, but
we're about to dump the FIB tables in RCU critical section, so we can no
longer sleep.

Instead, defer the operation to process context using deferred work. Make
sure fib info isn't freed while the work is queued by taking a reference
on it and releasing it after the operation is done.

Deferring the operation is valid because the upper layers always assume
the operation was successful. If it's not, then the driver-specific
abort mechanism is called and all routed traffic is directed to slow
path.

The work items are submitted to an ordered workqueue to prevent a
mismatch between the kernel's FIB table and the device's.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 .../net/ethernet/mellanox/mlxsw/spectrum_router.c  | 72 +++++++++++++++++++---
 1 file changed, 62 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 683f045..14bed1d 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -593,6 +593,14 @@ static void mlxsw_sp_router_fib_flush(struct mlxsw_sp *mlxsw_sp);
 
 static void mlxsw_sp_vrs_fini(struct mlxsw_sp *mlxsw_sp)
 {
+	/* At this stage we're guaranteed not to have new incoming
+	 * FIB notifications and the work queue is free from FIBs
+	 * sitting on top of mlxsw netdevs. However, we can still
+	 * have other FIBs queued. Flush the queue before flushing
+	 * the device's tables. No need for locks, as we're the only
+	 * writer.
+	 */
+	mlxsw_core_flush_owq();
 	mlxsw_sp_router_fib_flush(mlxsw_sp);
 	kfree(mlxsw_sp->router.vrs);
 }
@@ -1948,30 +1956,74 @@ static void __mlxsw_sp_router_fini(struct mlxsw_sp *mlxsw_sp)
 	kfree(mlxsw_sp->rifs);
 }
 
-static int mlxsw_sp_router_fib_event(struct notifier_block *nb,
-				     unsigned long event, void *ptr)
+struct mlxsw_sp_fib_event_work {
+	struct delayed_work dw;
+	struct fib_entry_notifier_info fen_info;
+	struct mlxsw_sp *mlxsw_sp;
+	unsigned long event;
+};
+
+static void mlxsw_sp_router_fib_event_work(struct work_struct *work)
 {
-	struct mlxsw_sp *mlxsw_sp = container_of(nb, struct mlxsw_sp, fib_nb);
-	struct fib_entry_notifier_info *fen_info = ptr;
+	struct mlxsw_sp_fib_event_work *fib_work =
+		container_of(work, struct mlxsw_sp_fib_event_work, dw.work);
+	struct mlxsw_sp *mlxsw_sp = fib_work->mlxsw_sp;
 	int err;
 
-	if (!net_eq(fen_info->info.net, &init_net))
-		return NOTIFY_DONE;
-
-	switch (event) {
+	/* Protect internal structures from changes */
+	rtnl_lock();
+	switch (fib_work->event) {
 	case FIB_EVENT_ENTRY_ADD:
-		err = mlxsw_sp_router_fib4_add(mlxsw_sp, fen_info);
+		err = mlxsw_sp_router_fib4_add(mlxsw_sp, &fib_work->fen_info);
 		if (err)
 			mlxsw_sp_router_fib4_abort(mlxsw_sp);
+		fib_info_put(fib_work->fen_info.fi);
 		break;
 	case FIB_EVENT_ENTRY_DEL:
-		mlxsw_sp_router_fib4_del(mlxsw_sp, fen_info);
+		mlxsw_sp_router_fib4_del(mlxsw_sp, &fib_work->fen_info);
+		fib_info_put(fib_work->fen_info.fi);
 		break;
 	case FIB_EVENT_RULE_ADD: /* fall through */
 	case FIB_EVENT_RULE_DEL:
 		mlxsw_sp_router_fib4_abort(mlxsw_sp);
 		break;
 	}
+	rtnl_unlock();
+	kfree(fib_work);
+}
+
+/* Called with rcu_read_lock() */
+static int mlxsw_sp_router_fib_event(struct notifier_block *nb,
+				     unsigned long event, void *ptr)
+{
+	struct mlxsw_sp *mlxsw_sp = container_of(nb, struct mlxsw_sp, fib_nb);
+	struct mlxsw_sp_fib_event_work *fib_work;
+	struct fib_notifier_info *info = ptr;
+
+	if (!net_eq(info->net, &init_net))
+		return NOTIFY_DONE;
+
+	fib_work = kzalloc(sizeof(*fib_work), GFP_ATOMIC);
+	if (WARN_ON(!fib_work))
+		return NOTIFY_BAD;
+
+	INIT_DELAYED_WORK(&fib_work->dw, mlxsw_sp_router_fib_event_work);
+	fib_work->mlxsw_sp = mlxsw_sp;
+	fib_work->event = event;
+
+	switch (event) {
+	case FIB_EVENT_ENTRY_ADD: /* fall through */
+	case FIB_EVENT_ENTRY_DEL:
+		memcpy(&fib_work->fen_info, ptr, sizeof(fib_work->fen_info));
+		/* Take referece on fib_info to prevent it from being
+		 * freed while work is queued. Release it afterwards.
+		 */
+		fib_info_hold(fib_work->fen_info.fi);
+		break;
+	}
+
+	mlxsw_core_schedule_odw(&fib_work->dw, 0);
+
 	return NOTIFY_DONE;
 }
 
-- 
2.7.4

^ permalink raw reply related

* [patch net-next v4 08/10] ipv4: fib: Convert FIB notification chain to be atomic
From: Jiri Pirko @ 2016-12-03 15:45 UTC (permalink / raw)
  To: netdev
  Cc: davem, idosch, eladr, yotamg, nogahf, arkadis, ogerlitz, roopa,
	dsa, nikolay, andy, vivien.didelot, andrew, f.fainelli,
	alexander.h.duyck, hannes, kaber
In-Reply-To: <1480779907-32535-1-git-send-email-jiri@resnulli.us>

From: Ido Schimmel <idosch@mellanox.com>

In order not to hold RTNL for long periods of time we're going to dump
the FIB tables using RCU.

Convert the FIB notification chain to be atomic, as we can't block in
RCU critical sections.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 net/ipv4/fib_trie.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 026f309..9bfce0d 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -84,17 +84,17 @@
 #include <trace/events/fib.h>
 #include "fib_lookup.h"
 
-static BLOCKING_NOTIFIER_HEAD(fib_chain);
+static ATOMIC_NOTIFIER_HEAD(fib_chain);
 
 int register_fib_notifier(struct notifier_block *nb)
 {
-	return blocking_notifier_chain_register(&fib_chain, nb);
+	return atomic_notifier_chain_register(&fib_chain, nb);
 }
 EXPORT_SYMBOL(register_fib_notifier);
 
 int unregister_fib_notifier(struct notifier_block *nb)
 {
-	return blocking_notifier_chain_unregister(&fib_chain, nb);
+	return atomic_notifier_chain_unregister(&fib_chain, nb);
 }
 EXPORT_SYMBOL(unregister_fib_notifier);
 
@@ -102,7 +102,7 @@ int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
 		       struct fib_notifier_info *info)
 {
 	info->net = net;
-	return blocking_notifier_call_chain(&fib_chain, event_type, info);
+	return atomic_notifier_call_chain(&fib_chain, event_type, info);
 }
 
 static int call_fib_entry_notifiers(struct net *net,
-- 
2.7.4

^ permalink raw reply related

* [patch net-next v4 07/10] rocker: Register FIB notifier before creating ports
From: Jiri Pirko @ 2016-12-03 15:45 UTC (permalink / raw)
  To: netdev
  Cc: davem, idosch, eladr, yotamg, nogahf, arkadis, ogerlitz, roopa,
	dsa, nikolay, andy, vivien.didelot, andrew, f.fainelli,
	alexander.h.duyck, hannes, kaber
In-Reply-To: <1480779907-32535-1-git-send-email-jiri@resnulli.us>

From: Ido Schimmel <idosch@mellanox.com>

We can miss FIB notifications sent between the time the ports were
created and the FIB notification block registered.

Instead of receiving these notifications only when they are replayed for
the FIB notification block during registration, just register the
notification block before the ports are created.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/rocker/rocker_main.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c
index 914e9e1..8c9c90a 100644
--- a/drivers/net/ethernet/rocker/rocker_main.c
+++ b/drivers/net/ethernet/rocker/rocker_main.c
@@ -2804,6 +2804,9 @@ static int rocker_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 		goto err_alloc_ordered_workqueue;
 	}
 
+	rocker->fib_nb.notifier_call = rocker_router_fib_event;
+	register_fib_notifier(&rocker->fib_nb);
+
 	rocker->hw.id = rocker_read64(rocker, SWITCH_ID);
 
 	err = rocker_probe_ports(rocker);
@@ -2812,15 +2815,13 @@ static int rocker_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 		goto err_probe_ports;
 	}
 
-	rocker->fib_nb.notifier_call = rocker_router_fib_event;
-	register_fib_notifier(&rocker->fib_nb);
-
 	dev_info(&pdev->dev, "Rocker switch with id %*phN\n",
 		 (int)sizeof(rocker->hw.id), &rocker->hw.id);
 
 	return 0;
 
 err_probe_ports:
+	unregister_fib_notifier(&rocker->fib_nb);
 	destroy_workqueue(rocker->rocker_owq);
 err_alloc_ordered_workqueue:
 	free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_EVENT), rocker);
@@ -2848,9 +2849,9 @@ static void rocker_remove(struct pci_dev *pdev)
 {
 	struct rocker *rocker = pci_get_drvdata(pdev);
 
+	rocker_remove_ports(rocker);
 	unregister_fib_notifier(&rocker->fib_nb);
 	rocker_write32(rocker, CONTROL, ROCKER_CONTROL_RESET);
-	rocker_remove_ports(rocker);
 	destroy_workqueue(rocker->rocker_owq);
 	free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_EVENT), rocker);
 	free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_CMD), rocker);
-- 
2.7.4

^ permalink raw reply related

* [patch net-next v4 06/10] rocker: Implement FIB offload in deferred work
From: Jiri Pirko @ 2016-12-03 15:45 UTC (permalink / raw)
  To: netdev
  Cc: davem, idosch, eladr, yotamg, nogahf, arkadis, ogerlitz, roopa,
	dsa, nikolay, andy, vivien.didelot, andrew, f.fainelli,
	alexander.h.duyck, hannes, kaber
In-Reply-To: <1480779907-32535-1-git-send-email-jiri@resnulli.us>

From: Ido Schimmel <idosch@mellanox.com>

Convert rocker to offload FIBs in deferred work in a similar fashion to
mlxsw, which was converted in the previous commits.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/rocker/rocker_main.c  | 58 +++++++++++++++++++++++++-----
 drivers/net/ethernet/rocker/rocker_ofdpa.c |  1 +
 2 files changed, 51 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c
index 424be96..914e9e1 100644
--- a/drivers/net/ethernet/rocker/rocker_main.c
+++ b/drivers/net/ethernet/rocker/rocker_main.c
@@ -2166,28 +2166,70 @@ static const struct switchdev_ops rocker_port_switchdev_ops = {
 	.switchdev_port_obj_dump	= rocker_port_obj_dump,
 };
 
-static int rocker_router_fib_event(struct notifier_block *nb,
-				   unsigned long event, void *ptr)
+struct rocker_fib_event_work {
+	struct work_struct work;
+	struct fib_entry_notifier_info fen_info;
+	struct rocker *rocker;
+	unsigned long event;
+};
+
+static void rocker_router_fib_event_work(struct work_struct *work)
 {
-	struct rocker *rocker = container_of(nb, struct rocker, fib_nb);
-	struct fib_entry_notifier_info *fen_info = ptr;
+	struct rocker_fib_event_work *fib_work =
+		container_of(work, struct rocker_fib_event_work, work);
+	struct rocker *rocker = fib_work->rocker;
 	int err;
 
-	switch (event) {
+	/* Protect internal structures from changes */
+	rtnl_lock();
+	switch (fib_work->event) {
 	case FIB_EVENT_ENTRY_ADD:
-		err = rocker_world_fib4_add(rocker, fen_info);
+		err = rocker_world_fib4_add(rocker, &fib_work->fen_info);
 		if (err)
 			rocker_world_fib4_abort(rocker);
-		else
+		fib_info_put(fib_work->fen_info.fi);
 		break;
 	case FIB_EVENT_ENTRY_DEL:
-		rocker_world_fib4_del(rocker, fen_info);
+		rocker_world_fib4_del(rocker, &fib_work->fen_info);
+		fib_info_put(fib_work->fen_info.fi);
 		break;
 	case FIB_EVENT_RULE_ADD: /* fall through */
 	case FIB_EVENT_RULE_DEL:
 		rocker_world_fib4_abort(rocker);
 		break;
 	}
+	rtnl_unlock();
+	kfree(fib_work);
+}
+
+/* Called with rcu_read_lock() */
+static int rocker_router_fib_event(struct notifier_block *nb,
+				   unsigned long event, void *ptr)
+{
+	struct rocker *rocker = container_of(nb, struct rocker, fib_nb);
+	struct rocker_fib_event_work *fib_work;
+
+	fib_work = kzalloc(sizeof(*fib_work), GFP_ATOMIC);
+	if (WARN_ON(!fib_work))
+		return NOTIFY_BAD;
+
+	INIT_WORK(&fib_work->work, rocker_router_fib_event_work);
+	fib_work->rocker = rocker;
+	fib_work->event = event;
+
+	switch (event) {
+	case FIB_EVENT_ENTRY_ADD: /* fall through */
+	case FIB_EVENT_ENTRY_DEL:
+		memcpy(&fib_work->fen_info, ptr, sizeof(fib_work->fen_info));
+		/* Take referece on fib_info to prevent it from being
+		 * freed while work is queued. Release it afterwards.
+		 */
+		fib_info_hold(fib_work->fen_info.fi);
+		break;
+	}
+
+	queue_work(rocker->rocker_owq, &fib_work->work);
+
 	return NOTIFY_DONE;
 }
 
diff --git a/drivers/net/ethernet/rocker/rocker_ofdpa.c b/drivers/net/ethernet/rocker/rocker_ofdpa.c
index 4ca4613..7cd76b6 100644
--- a/drivers/net/ethernet/rocker/rocker_ofdpa.c
+++ b/drivers/net/ethernet/rocker/rocker_ofdpa.c
@@ -2516,6 +2516,7 @@ static void ofdpa_fini(struct rocker *rocker)
 	int bkt;
 
 	del_timer_sync(&ofdpa->fdb_cleanup_timer);
+	flush_workqueue(rocker->rocker_owq);
 
 	spin_lock_irqsave(&ofdpa->flow_tbl_lock, flags);
 	hash_for_each_safe(ofdpa->flow_tbl, bkt, tmp, flow_entry, entry)
-- 
2.7.4

^ permalink raw reply related

* [patch net-next v4 05/10] rocker: Create an ordered workqueue for FIB offload
From: Jiri Pirko @ 2016-12-03 15:45 UTC (permalink / raw)
  To: netdev
  Cc: davem, idosch, eladr, yotamg, nogahf, arkadis, ogerlitz, roopa,
	dsa, nikolay, andy, vivien.didelot, andrew, f.fainelli,
	alexander.h.duyck, hannes, kaber
In-Reply-To: <1480779907-32535-1-git-send-email-jiri@resnulli.us>

From: Ido Schimmel <idosch@mellanox.com>

As explained in the previous commits, we need to process FIB entries
addition / deletion events in FIFO order or otherwise we can have a
mismatch between the kernel's FIB table and the device's.

Create an ordered workqueue for rocker to which these work items will be
submitted to.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/rocker/rocker.h      |  1 +
 drivers/net/ethernet/rocker/rocker_main.c | 11 +++++++++++
 2 files changed, 12 insertions(+)

diff --git a/drivers/net/ethernet/rocker/rocker.h b/drivers/net/ethernet/rocker/rocker.h
index 2eb9b49..ee9675d 100644
--- a/drivers/net/ethernet/rocker/rocker.h
+++ b/drivers/net/ethernet/rocker/rocker.h
@@ -72,6 +72,7 @@ struct rocker {
 	struct rocker_dma_ring_info event_ring;
 	struct notifier_block fib_nb;
 	struct rocker_world_ops *wops;
+	struct workqueue_struct *rocker_owq;
 	void *wpriv;
 };
 
diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c
index 67df4cf..424be96 100644
--- a/drivers/net/ethernet/rocker/rocker_main.c
+++ b/drivers/net/ethernet/rocker/rocker_main.c
@@ -28,6 +28,7 @@
 #include <linux/if_bridge.h>
 #include <linux/bitops.h>
 #include <linux/ctype.h>
+#include <linux/workqueue.h>
 #include <net/switchdev.h>
 #include <net/rtnetlink.h>
 #include <net/netevent.h>
@@ -2754,6 +2755,13 @@ static int rocker_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 		goto err_request_event_irq;
 	}
 
+	rocker->rocker_owq = alloc_ordered_workqueue(rocker_driver_name,
+						     WQ_MEM_RECLAIM);
+	if (!rocker->rocker_owq) {
+		err = -ENOMEM;
+		goto err_alloc_ordered_workqueue;
+	}
+
 	rocker->hw.id = rocker_read64(rocker, SWITCH_ID);
 
 	err = rocker_probe_ports(rocker);
@@ -2771,6 +2779,8 @@ static int rocker_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	return 0;
 
 err_probe_ports:
+	destroy_workqueue(rocker->rocker_owq);
+err_alloc_ordered_workqueue:
 	free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_EVENT), rocker);
 err_request_event_irq:
 	free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_CMD), rocker);
@@ -2799,6 +2809,7 @@ static void rocker_remove(struct pci_dev *pdev)
 	unregister_fib_notifier(&rocker->fib_nb);
 	rocker_write32(rocker, CONTROL, ROCKER_CONTROL_RESET);
 	rocker_remove_ports(rocker);
+	destroy_workqueue(rocker->rocker_owq);
 	free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_EVENT), rocker);
 	free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_CMD), rocker);
 	rocker_dma_rings_fini(rocker);
-- 
2.7.4

^ permalink raw reply related

* [patch net-next v4 03/10] mlxsw: core: Create an ordered workqueue for FIB offload
From: Jiri Pirko @ 2016-12-03 15:45 UTC (permalink / raw)
  To: netdev
  Cc: davem, idosch, eladr, yotamg, nogahf, arkadis, ogerlitz, roopa,
	dsa, nikolay, andy, vivien.didelot, andrew, f.fainelli,
	alexander.h.duyck, hannes, kaber
In-Reply-To: <1480779907-32535-1-git-send-email-jiri@resnulli.us>

From: Ido Schimmel <idosch@mellanox.com>

We're going to start processing FIB entries addition / deletion events
in deferred work. These work items must be processed in the order they
were submitted or otherwise we can have differences between the kernel's
FIB table and the device's.

Solve this by creating an ordered workqueue to which these work items
will be submitted to. Note that we can't simply convert the current
workqueue to be ordered, as EMADs re-transmissions are also processed in
deferred work.

Later on, we can migrate other work items to this workqueue, such as FDB
notification processing and nexthop resolution, since they all take the
same lock anyway.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/core.c | 22 ++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlxsw/core.h |  2 ++
 2 files changed, 24 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index 4dc028b..57a9884 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -77,6 +77,7 @@ static const char mlxsw_core_driver_name[] = "mlxsw_core";
 static struct dentry *mlxsw_core_dbg_root;
 
 static struct workqueue_struct *mlxsw_wq;
+static struct workqueue_struct *mlxsw_owq;
 
 struct mlxsw_core_pcpu_stats {
 	u64			trap_rx_packets[MLXSW_TRAP_ID_MAX];
@@ -1900,6 +1901,18 @@ int mlxsw_core_schedule_dw(struct delayed_work *dwork, unsigned long delay)
 }
 EXPORT_SYMBOL(mlxsw_core_schedule_dw);
 
+int mlxsw_core_schedule_odw(struct delayed_work *dwork, unsigned long delay)
+{
+	return queue_delayed_work(mlxsw_owq, dwork, delay);
+}
+EXPORT_SYMBOL(mlxsw_core_schedule_odw);
+
+void mlxsw_core_flush_owq(void)
+{
+	flush_workqueue(mlxsw_owq);
+}
+EXPORT_SYMBOL(mlxsw_core_flush_owq);
+
 static int __init mlxsw_core_module_init(void)
 {
 	int err;
@@ -1907,6 +1920,12 @@ static int __init mlxsw_core_module_init(void)
 	mlxsw_wq = alloc_workqueue(mlxsw_core_driver_name, WQ_MEM_RECLAIM, 0);
 	if (!mlxsw_wq)
 		return -ENOMEM;
+	mlxsw_owq = alloc_ordered_workqueue("%s_ordered", WQ_MEM_RECLAIM,
+					    mlxsw_core_driver_name);
+	if (!mlxsw_owq) {
+		err = -ENOMEM;
+		goto err_alloc_ordered_workqueue;
+	}
 	mlxsw_core_dbg_root = debugfs_create_dir(mlxsw_core_driver_name, NULL);
 	if (!mlxsw_core_dbg_root) {
 		err = -ENOMEM;
@@ -1915,6 +1934,8 @@ static int __init mlxsw_core_module_init(void)
 	return 0;
 
 err_debugfs_create_dir:
+	destroy_workqueue(mlxsw_owq);
+err_alloc_ordered_workqueue:
 	destroy_workqueue(mlxsw_wq);
 	return err;
 }
@@ -1922,6 +1943,7 @@ static int __init mlxsw_core_module_init(void)
 static void __exit mlxsw_core_module_exit(void)
 {
 	debugfs_remove_recursive(mlxsw_core_dbg_root);
+	destroy_workqueue(mlxsw_owq);
 	destroy_workqueue(mlxsw_wq);
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h
index e856b49..a7f94fb 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.h
@@ -207,6 +207,8 @@ enum devlink_port_type mlxsw_core_port_type_get(struct mlxsw_core *mlxsw_core,
 						u8 local_port);
 
 int mlxsw_core_schedule_dw(struct delayed_work *dwork, unsigned long delay);
+int mlxsw_core_schedule_odw(struct delayed_work *dwork, unsigned long delay);
+void mlxsw_core_flush_owq(void);
 
 #define MLXSW_CONFIG_PROFILE_SWID_COUNT 8
 
-- 
2.7.4

^ permalink raw reply related

* [patch net-next v4 02/10] ipv4: fib: Add fib_info_hold() helper
From: Jiri Pirko @ 2016-12-03 15:44 UTC (permalink / raw)
  To: netdev
  Cc: davem, idosch, eladr, yotamg, nogahf, arkadis, ogerlitz, roopa,
	dsa, nikolay, andy, vivien.didelot, andrew, f.fainelli,
	alexander.h.duyck, hannes, kaber
In-Reply-To: <1480779907-32535-1-git-send-email-jiri@resnulli.us>

From: Ido Schimmel <idosch@mellanox.com>

As explained in the previous commit, modules are going to need to take a
reference on fib info and then drop it using fib_info_put().

Add the fib_info_hold() helper to make the code more readable and also
symmetric with fib_info_put().

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Suggested-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 include/net/ip_fib.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index f390c3b..6c67b93 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -397,6 +397,11 @@ static inline void fib_combine_itag(u32 *itag, const struct fib_result *res)
 
 void free_fib_info(struct fib_info *fi);
 
+static inline void fib_info_hold(struct fib_info *fi)
+{
+	atomic_inc(&fi->fib_clntref);
+}
+
 static inline void fib_info_put(struct fib_info *fi)
 {
 	if (atomic_dec_and_test(&fi->fib_clntref))
-- 
2.7.4

^ permalink raw reply related

* [patch net-next v4 01/10] ipv4: fib: Export free_fib_info()
From: Jiri Pirko @ 2016-12-03 15:44 UTC (permalink / raw)
  To: netdev
  Cc: davem, idosch, eladr, yotamg, nogahf, arkadis, ogerlitz, roopa,
	dsa, nikolay, andy, vivien.didelot, andrew, f.fainelli,
	alexander.h.duyck, hannes, kaber
In-Reply-To: <1480779907-32535-1-git-send-email-jiri@resnulli.us>

From: Ido Schimmel <idosch@mellanox.com>

The FIB notification chain is going to be converted to an atomic chain,
which means switchdev drivers will have to offload FIB entries in
deferred work, as hardware operations entail sleeping.

However, while the work is queued fib info might be freed, so a
reference must be taken. To release the reference (and potentially free
the fib info) fib_info_put() will be called, which in turn calls
free_fib_info().

Export free_fib_info() so that modules will be able to invoke
fib_info_put().

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 net/ipv4/fib_semantics.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 388d3e2..c1bc1e9 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -234,6 +234,7 @@ void free_fib_info(struct fib_info *fi)
 #endif
 	call_rcu(&fi->rcu, free_fib_info_rcu);
 }
+EXPORT_SYMBOL_GPL(free_fib_info);

 void fib_release_info(struct fib_info *fi)
 {
-- 
2.7.4

^ permalink raw reply related

* [patch net-next v4 00/10] ipv4: fib: Replay events when registering FIB notifier
From: Jiri Pirko @ 2016-12-03 15:44 UTC (permalink / raw)
  To: netdev
  Cc: davem, idosch, eladr, yotamg, nogahf, arkadis, ogerlitz, roopa,
	dsa, nikolay, andy, vivien.didelot, andrew, f.fainelli,
	alexander.h.duyck, hannes, kaber

From: Jiri Pirko <jiri@mellanox.com>

Ido says:

In kernel 4.9 the switchdev-specific FIB offload mechanism was replaced
by a new FIB notification chain to which modules could register in order
to be notified about the addition and deletion of FIB entries. The
motivation for this change was that switchdev drivers need to be able to
reflect the entire FIB table and not only FIBs configured on top of the
port netdevs themselves. This is useful in case of in-band management.

The fundamental problem with this approach is that upon registration
listeners lose all the information previously sent in the chain and
thus have an incomplete view of the FIB tables, which can result in
packet loss. This patchset fixes that by dumping the FIB tables and
replaying notifications previously sent in the chain for the registered
notification block.

The entire dump process is done under RCU and thus the FIB notification
chain is converted to be atomic. The listeners are modified accordingly.
This is done in the first eight patches.

The ninth patch adds a change sequence counter to ensure the integrity
of the FIB dump. The last patch adds the dump itself to the FIB chain
registration function and modifies existing listeners to pass a callback
to be executed in case dump was inconsistent.

---
v3->v4:
- Register the notification block after the dump and protect it using
  the change sequence counter (Hannes Frederic Sowa).
- Since we now integrate the dump into the registration function, drop
  the sysctl to set maximum number of retries and instead set it to a
  fixed number. Lets see if it's really a problem before adding something
  we can never remove.
- For the same reason, dump FIB tables for all net namespaces.
- Add a comment regarding guarantees provided by mutex semantics.

v2->v3:
- Add sysctl to set the number of FIB dump retries (Hannes Frederic Sowa).
- Read the sequence counter under RTNL to ensure synchronization
  between the dump process and other processes changing the routing
  tables (Hannes Frederic Sowa).
- Pass a callback to the dump function to be executed prior to a retry.
- Limit the dump to a single net namespace.

v1->v2:
- Add a sequence counter to ensure the integrity of the FIB dump
  (David S. Miller, Hannes Frederic Sowa).
- Protect notifications from re-ordering in listeners by using an
  ordered workqueue (Hannes Frederic Sowa).
- Introduce fib_info_hold() (Jiri Pirko).
- Relieve rocker from the need to invoke the FIB dump by registering
  to the FIB notification chain prior to ports creation.

Ido Schimmel (10):
  ipv4: fib: Export free_fib_info()
  ipv4: fib: Add fib_info_hold() helper
  mlxsw: core: Create an ordered workqueue for FIB offload
  mlxsw: spectrum_router: Implement FIB offload in deferred work
  rocker: Create an ordered workqueue for FIB offload
  rocker: Implement FIB offload in deferred work
  rocker: Register FIB notifier before creating ports
  ipv4: fib: Convert FIB notification chain to be atomic
  ipv4: fib: Allow for consistent FIB dumping
  ipv4: fib: Replay events when registering FIB notifier

 drivers/net/ethernet/mellanox/mlxsw/core.c         |  22 +++
 drivers/net/ethernet/mellanox/mlxsw/core.h         |   2 +
 .../net/ethernet/mellanox/mlxsw/spectrum_router.c  |  92 ++++++++++--
 drivers/net/ethernet/rocker/rocker.h               |   1 +
 drivers/net/ethernet/rocker/rocker_main.c          |  84 +++++++++--
 drivers/net/ethernet/rocker/rocker_ofdpa.c         |   1 +
 include/net/ip_fib.h                               |   8 +-
 include/net/netns/ipv4.h                           |   3 +
 net/ipv4/fib_frontend.c                            |   2 +
 net/ipv4/fib_semantics.c                           |   1 +
 net/ipv4/fib_trie.c                                | 155 ++++++++++++++++++++-
 11 files changed, 342 insertions(+), 29 deletions(-)

-- 
2.7.4

^ permalink raw reply

* Re: net: use-after-free in worker_thread
From: Andrey Konovalov @ 2016-12-03 15:39 UTC (permalink / raw)
  To: syzkaller
  Cc: David S. Miller, Cong Wang, Johannes Berg, Florian Westphal,
	Herbert Xu, Eric Dumazet, Bob Copeland, Tom Herbert,
	David Decotigny, netdev, LKML, Kostya Serebryany, Dmitry Vyukov
In-Reply-To: <1480772947.18162.402.camel@edumazet-glaptop3.roam.corp.google.com>

[-- Attachment #1: Type: text/plain, Size: 14438 bytes --]

On Sat, Dec 3, 2016 at 2:49 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Sat, 2016-12-03 at 14:05 +0100, Andrey Konovalov wrote:
>> On Sat, Dec 3, 2016 at 1:58 PM, Andrey Konovalov <andreyknvl@google.com> wrote:
>> > +syzkaller@googlegroups.com
>> >
>> > On Sat, Dec 3, 2016 at 1:56 PM, Andrey Konovalov <andreyknvl@google.com> wrote:
>> >> Hi!
>> >>
>> >> I'm seeing lots of the following error reports while running the
>> >> syzkaller fuzzer.
>> >>
>> >> Reports appeared when I updated to 3c49de52 (Dec 2) from 2caceb32 (Dec 1).
>> >>
>> >> ==================================================================
>> >> BUG: KASAN: use-after-free in worker_thread+0x17d8/0x18a0
>> >> Read of size 8 at addr ffff880067f3ecd8 by task kworker/3:1/774
>> >>
>> >> page:ffffea00019fce00 count:1 mapcount:0 mapping:          (null)
>> >> index:0xffff880067f39c10 compound_mapcount: 0
>> >> flags: 0x500000000004080(slab|head)
>> >> page dumped because: kasan: bad access detected
>> >>
>> >> CPU: 3 PID: 774 Comm: kworker/3:1 Not tainted 4.9.0-rc7+ #66
>> >> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
>> >>  ffff88006c267838 ffffffff81f882da ffffffff6c25e338 1ffff1000d84ce9a
>> >>  ffffed000d84ce92 ffff88006c25e340 0000000041b58ab3 ffffffff8541e198
>> >>  ffffffff81f88048 0000000100000000 0000000041b58ab3 ffffffff853d3ee8
>> >> Call Trace:
>> >>  [<     inline     >] __dump_stack lib/dump_stack.c:15
>> >>  [<ffffffff81f882da>] dump_stack+0x292/0x398 lib/dump_stack.c:51
>> >>  [<     inline     >] describe_address mm/kasan/report.c:262
>> >>  [<ffffffff817e50d1>] kasan_report_error+0x121/0x560 mm/kasan/report.c:368
>> >>  [<     inline     >] kasan_report mm/kasan/report.c:390
>> >>  [<ffffffff817e560e>] __asan_report_load8_noabort+0x3e/0x40
>> >> mm/kasan/report.c:411
>> >>  [<ffffffff81329b88>] worker_thread+0x17d8/0x18a0 kernel/workqueue.c:2228
>> >>  [<ffffffff8133ebf3>] kthread+0x323/0x3e0 kernel/kthread.c:209
>> >>  [<ffffffff84a2a22a>] ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:433
>> >>
>> >> The buggy address belongs to the object at ffff880067f3e6d0
>> >>  which belongs to the cache kmalloc-2048 of size 2048
>> >> The buggy address ffff880067f3ecd8 is located 1544 bytes inside
>> >>  of 2048-byte region [ffff880067f3e6d0, ffff880067f3eed0)
>> >>
>> >> Freed by task 0:
>> >>  [<ffffffff81203526>] save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:57
>> >>  [<ffffffff817e4173>] save_stack+0x43/0xd0 mm/kasan/kasan.c:495
>> >>  [<     inline     >] set_track mm/kasan/kasan.c:507
>> >>  [<ffffffff817e4a53>] kasan_slab_free+0x73/0xc0 mm/kasan/kasan.c:571
>> >>  [<     inline     >] slab_free_hook mm/slub.c:1352
>> >>  [<     inline     >] slab_free_freelist_hook mm/slub.c:1374
>> >>  [<     inline     >] slab_free mm/slub.c:2951
>> >>  [<ffffffff817e0eb7>] kfree+0xe7/0x2b0 mm/slub.c:3871
>> >>  [<     inline     >] sk_prot_free net/core/sock.c:1372
>> >>  [<ffffffff831ea1c7>] __sk_destruct+0x5c7/0x6e0 net/core/sock.c:1445
>> >>  [<ffffffff831f3517>] sk_destruct+0x47/0x80 net/core/sock.c:1453
>> >>  [<ffffffff831f35a7>] __sk_free+0x57/0x230 net/core/sock.c:1461
>> >>  [<ffffffff831f37a3>] sk_free+0x23/0x30 net/core/sock.c:1472
>> >>  [<     inline     >] sock_put include/net/sock.h:1591
>> >>  [<ffffffff8348ca9c>] deferred_put_nlk_sk+0x2c/0x40 net/netlink/af_netlink.c:671
>> >>  [<     inline     >] __rcu_reclaim kernel/rcu/rcu.h:118
>> >>  [<ffffffff8146d42f>] rcu_do_batch.isra.67+0x8ff/0xc50 kernel/rcu/tree.c:2776
>> >>  [<     inline     >] invoke_rcu_callbacks kernel/rcu/tree.c:3040
>> >>  [<     inline     >] __rcu_process_callbacks kernel/rcu/tree.c:3007
>> >>  [<ffffffff8146e097>] rcu_process_callbacks+0x2b7/0xba0 kernel/rcu/tree.c:3024
>> >>  [<ffffffff84a2d08b>] __do_softirq+0x2fb/0xb63 kernel/softirq.c:284
>> >>
>> >> Allocated by task 10748:
>> >>  [<ffffffff81203526>] save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:57
>> >>  [<ffffffff817e4173>] save_stack+0x43/0xd0 mm/kasan/kasan.c:495
>> >>  [<     inline     >] set_track mm/kasan/kasan.c:507
>> >>  [<ffffffff817e43fd>] kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:598
>> >>  [<ffffffff817e0050>] __kmalloc+0xa0/0x2d0 mm/slub.c:3734
>> >>  [<     inline     >] kmalloc include/linux/slab.h:495
>> >>  [<ffffffff831e4c01>] sk_prot_alloc+0x101/0x2a0 net/core/sock.c:1333
>> >>  [<ffffffff831efd15>] sk_alloc+0x105/0x1000 net/core/sock.c:1389
>> >>  [<ffffffff8348ad46>] __netlink_create+0x66/0x1d0 net/netlink/af_netlink.c:588
>> >>  [<ffffffff8348cdab>] netlink_create+0x2fb/0x500 net/netlink/af_netlink.c:647
>> >>  [<ffffffff831dd1d6>] __sock_create+0x4f6/0x880 net/socket.c:1168
>> >>  [<     inline     >] sock_create net/socket.c:1208
>> >>  [<     inline     >] SYSC_socket net/socket.c:1238
>> >>  [<ffffffff831dd799>] SyS_socket+0xf9/0x230 net/socket.c:1218
>> >>  [<ffffffff84a29fc1>] entry_SYSCALL_64_fastpath+0x1f/0xc2
>> >>
>> >> Memory state around the buggy address:
>> >>  ffff880067f3eb80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>> >>  ffff880067f3ec00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>> >>>ffff880067f3ec80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>> >>                                                     ^
>> >>  ffff880067f3ed00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>> >>  ffff880067f3ed80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>> >> ==================================================================
>>
>> Here is another report that looks related:
>>
>> ==================================================================
>> BUG: KASAN: use-after-free in __list_add+0x236/0x2c0
>> Read of size 8 at addr ffff880068854780 by task ksoftirqd/2/20
>>
>> page:ffffea0001a21400 count:1 mapcount:0 mapping:          (null)
>> index:0x0 compound_mapcount: 0
>> flags: 0x500000000004080(slab|head)
>> page dumped because: kasan: bad access detected
>>
>> CPU: 2 PID: 20 Comm: ksoftirqd/2 Not tainted 4.9.0-rc7+ #66
>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
>>  ffff88006daf6578 ffffffff81f882da ffffffff6daf62a0 1ffff1000db5ec42
>>  ffffed000db5ec3a dffffc0000000000 0000000041b58ab3 ffffffff8541e198
>>  ffffffff81f88048 ffff88006dac3610 ffff88006daf6300 0000000000000802
>> Call Trace:
>>  [<     inline     >] __dump_stack lib/dump_stack.c:15
>>  [<ffffffff81f882da>] dump_stack+0x292/0x398 lib/dump_stack.c:51
>>  [<     inline     >] describe_address mm/kasan/report.c:262
>>  [<ffffffff817e50d1>] kasan_report_error+0x121/0x560 mm/kasan/report.c:368
>>  [<     inline     >] kasan_report mm/kasan/report.c:390
>>  [<ffffffff817e560e>] __asan_report_load8_noabort+0x3e/0x40
>> mm/kasan/report.c:411
>>  [<ffffffff8200c166>] __list_add+0x236/0x2c0 lib/list_debug.c:30
>>  [<     inline     >] list_add_tail include/linux/list.h:77
>>  [<ffffffff8131e295>] insert_work+0x175/0x4b0 kernel/workqueue.c:1298
>>  [<ffffffff8131eb52>] __queue_work+0x582/0x11e0 kernel/workqueue.c:1459
>>  [<ffffffff81320c21>] queue_work_on+0x231/0x240 kernel/workqueue.c:1484
>>  [<     inline     >] queue_work include/linux/workqueue.h:474
>>  [<     inline     >] schedule_work include/linux/workqueue.h:532
>>  [<ffffffff8348c8cc>] netlink_sock_destruct+0x23c/0x2d0
>> net/netlink/af_netlink.c:361
>>  [<ffffffff831e9ce1>] __sk_destruct+0xe1/0x6e0 net/core/sock.c:1423
>>  [<ffffffff831f3517>] sk_destruct+0x47/0x80 net/core/sock.c:1453
>>  [<ffffffff831f35a7>] __sk_free+0x57/0x230 net/core/sock.c:1461
>>  [<ffffffff831f37a3>] sk_free+0x23/0x30 net/core/sock.c:1472
>>  [<     inline     >] sock_put include/net/sock.h:1591
>>  [<ffffffff8348ca9c>] deferred_put_nlk_sk+0x2c/0x40 net/netlink/af_netlink.c:671
>>  [<     inline     >] __rcu_reclaim kernel/rcu/rcu.h:118
>>  [<ffffffff8146d42f>] rcu_do_batch.isra.67+0x8ff/0xc50 kernel/rcu/tree.c:2776
>>  [<     inline     >] invoke_rcu_callbacks kernel/rcu/tree.c:3040
>>  [<     inline     >] __rcu_process_callbacks kernel/rcu/tree.c:3007
>>  [<ffffffff8146e097>] rcu_process_callbacks+0x2b7/0xba0 kernel/rcu/tree.c:3024
>>  [<ffffffff84a2d08b>] __do_softirq+0x2fb/0xb63 kernel/softirq.c:284
>>  [<ffffffff812d38c0>] run_ksoftirqd+0x20/0x60 kernel/softirq.c:676
>>  [<ffffffff81350132>] smpboot_thread_fn+0x562/0x860 kernel/smpboot.c:163
>>  [<ffffffff8133ebf3>] kthread+0x323/0x3e0 kernel/kthread.c:209
>>  [<ffffffff84a2a22a>] ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:433
>>
>> The buggy address belongs to the object at ffff880068854170
>>  which belongs to the cache kmalloc-2048 of size 2048
>> The buggy address ffff880068854780 is located 1552 bytes inside
>>  of 2048-byte region [ffff880068854170, ffff880068854970)
>>
>> Freed by task 20:
>>  [<ffffffff81203526>] save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:57
>>  [<ffffffff817e4173>] save_stack+0x43/0xd0 mm/kasan/kasan.c:495
>>  [<     inline     >] set_track mm/kasan/kasan.c:507
>>  [<ffffffff817e4a53>] kasan_slab_free+0x73/0xc0 mm/kasan/kasan.c:571
>>  [<     inline     >] slab_free_hook mm/slub.c:1352
>>  [<     inline     >] slab_free_freelist_hook mm/slub.c:1374
>>  [<     inline     >] slab_free mm/slub.c:2951
>>  [<ffffffff817e0eb7>] kfree+0xe7/0x2b0 mm/slub.c:3871
>>  [<     inline     >] sk_prot_free net/core/sock.c:1372
>>  [<ffffffff831ea1c7>] __sk_destruct+0x5c7/0x6e0 net/core/sock.c:1445
>>  [<ffffffff831f3517>] sk_destruct+0x47/0x80 net/core/sock.c:1453
>>  [<ffffffff831f35a7>] __sk_free+0x57/0x230 net/core/sock.c:1461
>>  [<ffffffff831f37a3>] sk_free+0x23/0x30 net/core/sock.c:1472
>>  [<     inline     >] sock_put include/net/sock.h:1591
>>  [<ffffffff8348ca9c>] deferred_put_nlk_sk+0x2c/0x40 net/netlink/af_netlink.c:671
>>  [<     inline     >] __rcu_reclaim kernel/rcu/rcu.h:118
>>  [<ffffffff8146d42f>] rcu_do_batch.isra.67+0x8ff/0xc50 kernel/rcu/tree.c:2776
>>  [<     inline     >] invoke_rcu_callbacks kernel/rcu/tree.c:3040
>>  [<     inline     >] __rcu_process_callbacks kernel/rcu/tree.c:3007
>>  [<ffffffff8146e097>] rcu_process_callbacks+0x2b7/0xba0 kernel/rcu/tree.c:3024
>>  [<ffffffff84a2d08b>] __do_softirq+0x2fb/0xb63 kernel/softirq.c:284
>>
>> Allocated by task 9480:
>>  [<ffffffff81203526>] save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:57
>>  [<ffffffff817e4173>] save_stack+0x43/0xd0 mm/kasan/kasan.c:495
>>  [<     inline     >] set_track mm/kasan/kasan.c:507
>>  [<ffffffff817e43fd>] kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:598
>>  [<ffffffff817e0050>] __kmalloc+0xa0/0x2d0 mm/slub.c:3734
>>  [<     inline     >] kmalloc include/linux/slab.h:495
>>  [<ffffffff831e4c01>] sk_prot_alloc+0x101/0x2a0 net/core/sock.c:1333
>>  [<ffffffff831efd15>] sk_alloc+0x105/0x1000 net/core/sock.c:1389
>>  [<ffffffff8348ad46>] __netlink_create+0x66/0x1d0 net/netlink/af_netlink.c:588
>>  [<ffffffff8348cdab>] netlink_create+0x2fb/0x500 net/netlink/af_netlink.c:647
>>  [<ffffffff831dd1d6>] __sock_create+0x4f6/0x880 net/socket.c:1168
>>  [<     inline     >] sock_create net/socket.c:1208
>>  [<     inline     >] SYSC_socket net/socket.c:1238
>>  [<ffffffff831dd799>] SyS_socket+0xf9/0x230 net/socket.c:1218
>>  [<ffffffff84a29fc1>] entry_SYSCALL_64_fastpath+0x1f/0xc2
>>
>> Memory state around the buggy address:
>>  ffff880068854680: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>>  ffff880068854700: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>> >ffff880068854780: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>>                    ^
>>  ffff880068854800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>>  ffff880068854880: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>> ==================================================================
>
>
> Hi Andrey. Please give us some rest during the week end ;)

Hi Eric,

Sorry, wanted to restart fuzzer on newer kernel and immediately
started getting enormous amount of crashes :)

>
> This looks like the bug I mentioned earlier for which I have a pending
> patch ? Can you try it ?

No, it seems that your patch doesn't help, this is apparently something else.

I've attached a reproducer.

Thanks!

>
> The RCU conversion done by Thomas was quite buggy.
>
> Thanks.
>
>
> diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
> index 602e5ebe9db39ec6c72708628bc48efad9f0e680..c348c4a5ea4ecc05dcc9e2afbc069ab65a1a57fe 100644
> --- a/net/netlink/af_netlink.c
> +++ b/net/netlink/af_netlink.c
> @@ -475,8 +475,8 @@ static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid)
>
>         rcu_read_lock();
>         sk = __netlink_lookup(table, portid, net);
> -       if (sk)
> -               sock_hold(sk);
> +       if (sk && !atomic_inc_not_zero(&sk->sk_refcnt))
> +               sk = NULL;
>         rcu_read_unlock();
>
>         return sk;
> @@ -600,6 +600,7 @@ static int __netlink_create(struct net *net, struct socket *sock,
>         }
>         init_waitqueue_head(&nlk->wait);
>
> +       sock_set_flag(sk, SOCK_RCU_FREE);
>         sk->sk_destruct = netlink_sock_destruct;
>         sk->sk_protocol = protocol;
>         return 0;
> @@ -664,13 +665,6 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol,
>         goto out;
>  }
>
> -static void deferred_put_nlk_sk(struct rcu_head *head)
> -{
> -       struct netlink_sock *nlk = container_of(head, struct netlink_sock, rcu);
> -
> -       sock_put(&nlk->sk);
> -}
> -
>  static int netlink_release(struct socket *sock)
>  {
>         struct sock *sk = sock->sk;
> @@ -743,7 +737,7 @@ static int netlink_release(struct socket *sock)
>         local_bh_disable();
>         sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1);
>         local_bh_enable();
> -       call_rcu(&nlk->rcu, deferred_put_nlk_sk);
> +       sock_put(sk);
>         return 0;
>  }
>
> diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h
> index 4fdb3831897775547f77c069a8018c0d2a253c8c..988d1a02487e37b7efd4872dd0ab6d230e5a2021 100644
> --- a/net/netlink/af_netlink.h
> +++ b/net/netlink/af_netlink.h
> @@ -33,7 +33,6 @@ struct netlink_sock {
>         struct module           *module;
>
>         struct rhash_head       node;
> -       struct rcu_head         rcu;
>         struct work_struct      work;
>  };
>
>
>
>
> --
> You received this message because you are subscribed to the Google Groups "syzkaller" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to syzkaller+unsubscribe@googlegroups.com.
> For more options, visit https://groups.google.com/d/optout.

[-- Attachment #2: worker-crash-poc.c --]
[-- Type: text/x-csrc, Size: 8352 bytes --]

// autogenerated by syzkaller (http://github.com/google/syzkaller)

#ifndef __NR_socket
#define __NR_socket 41
#endif
#ifndef __NR_write
#define __NR_write 1
#endif
#ifndef __NR_readv
#define __NR_readv 19
#endif
#ifndef __NR_mmap
#define __NR_mmap 9
#endif

#define _GNU_SOURCE

#include <sys/ioctl.h>
#include <sys/mount.h>
#include <sys/prctl.h>
#include <sys/resource.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/wait.h>

#include <linux/capability.h>
#include <linux/if.h>
#include <linux/if_tun.h>
#include <linux/sched.h>
#include <net/if_arp.h>

#include <assert.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <grp.h>
#include <pthread.h>
#include <setjmp.h>
#include <signal.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

const int kFailStatus = 67;
const int kErrorStatus = 68;
const int kRetryStatus = 69;

__attribute__((noreturn)) void fail(const char* msg, ...)
{
  int e = errno;
  fflush(stdout);
  va_list args;
  va_start(args, msg);
  vfprintf(stderr, msg, args);
  va_end(args);
  fprintf(stderr, " (errno %d)\n", e);
  exit(kFailStatus);
}

__attribute__((noreturn)) void exitf(const char* msg, ...)
{
  int e = errno;
  fflush(stdout);
  va_list args;
  va_start(args, msg);
  vfprintf(stderr, msg, args);
  va_end(args);
  fprintf(stderr, " (errno %d)\n", e);
  exit(kRetryStatus);
}

static int flag_debug;

void debug(const char* msg, ...)
{
  if (!flag_debug)
    return;
  va_list args;
  va_start(args, msg);
  vfprintf(stdout, msg, args);
  va_end(args);
  fflush(stdout);
}

__thread int skip_segv;
__thread jmp_buf segv_env;

static void segv_handler(int sig, siginfo_t* info, void* uctx)
{
  if (__atomic_load_n(&skip_segv, __ATOMIC_RELAXED))
    _longjmp(segv_env, 1);
  exit(sig);
}

static void install_segv_handler()
{
  struct sigaction sa;
  memset(&sa, 0, sizeof(sa));
  sa.sa_sigaction = segv_handler;
  sa.sa_flags = SA_NODEFER | SA_SIGINFO;
  sigaction(SIGSEGV, &sa, NULL);
  sigaction(SIGBUS, &sa, NULL);
}

#define NONFAILING(...)                                                \
  {                                                                    \
    __atomic_fetch_add(&skip_segv, 1, __ATOMIC_SEQ_CST);               \
    if (_setjmp(segv_env) == 0) {                                      \
      __VA_ARGS__;                                                     \
    }                                                                  \
    __atomic_fetch_sub(&skip_segv, 1, __ATOMIC_SEQ_CST);               \
  }

static uintptr_t execute_syscall(int nr, uintptr_t a0, uintptr_t a1,
                                 uintptr_t a2, uintptr_t a3,
                                 uintptr_t a4, uintptr_t a5,
                                 uintptr_t a6, uintptr_t a7,
                                 uintptr_t a8)
{
  switch (nr) {
  default:
    return syscall(nr, a0, a1, a2, a3, a4, a5);
  }
}

static void setup_main_process(uint64_t pid, bool enable_tun)
{
  struct sigaction sa;
  memset(&sa, 0, sizeof(sa));
  sa.sa_handler = SIG_IGN;
  syscall(SYS_rt_sigaction, 0x20, &sa, NULL, 8);
  syscall(SYS_rt_sigaction, 0x21, &sa, NULL, 8);
  install_segv_handler();

  char tmpdir_template[] = "./syzkaller.XXXXXX";
  char* tmpdir = mkdtemp(tmpdir_template);
  if (!tmpdir)
    fail("failed to mkdtemp");
  if (chmod(tmpdir, 0777))
    fail("failed to chmod");
  if (chdir(tmpdir))
    fail("failed to chdir");
}

static void loop();

static void sandbox_common()
{
  prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
  setpgrp();
  setsid();

  struct rlimit rlim;
  rlim.rlim_cur = rlim.rlim_max = 128 << 20;
  setrlimit(RLIMIT_AS, &rlim);
  rlim.rlim_cur = rlim.rlim_max = 1 << 20;
  setrlimit(RLIMIT_FSIZE, &rlim);
  rlim.rlim_cur = rlim.rlim_max = 1 << 20;
  setrlimit(RLIMIT_STACK, &rlim);
  rlim.rlim_cur = rlim.rlim_max = 0;
  setrlimit(RLIMIT_CORE, &rlim);

  unshare(CLONE_NEWNS);
  unshare(CLONE_NEWIPC);
  unshare(CLONE_IO);
}

static int do_sandbox_none()
{
  int pid = fork();
  if (pid)
    return pid;
  sandbox_common();
  loop();
  exit(1);
}

static void remove_dir(const char* dir)
{
  DIR* dp;
  struct dirent* ep;
  int iter = 0;
retry:
  dp = opendir(dir);
  if (dp == NULL) {
    if (errno == EMFILE) {
      exitf("opendir(%s) failed due to NOFILE, exiting");
    }
    exitf("opendir(%s) failed", dir);
  }
  while ((ep = readdir(dp))) {
    if (strcmp(ep->d_name, ".") == 0 || strcmp(ep->d_name, "..") == 0)
      continue;
    char filename[FILENAME_MAX];
    snprintf(filename, sizeof(filename), "%s/%s", dir, ep->d_name);
    struct stat st;
    if (lstat(filename, &st))
      exitf("lstat(%s) failed", filename);
    if (S_ISDIR(st.st_mode)) {
      remove_dir(filename);
      continue;
    }
    int i;
    for (i = 0;; i++) {
      debug("unlink(%s)\n", filename);
      if (unlink(filename) == 0)
        break;
      if (errno == EROFS) {
        debug("ignoring EROFS\n");
        break;
      }
      if (errno != EBUSY || i > 100)
        exitf("unlink(%s) failed", filename);
      debug("umount(%s)\n", filename);
      if (umount2(filename, MNT_DETACH))
        exitf("umount(%s) failed", filename);
    }
  }
  closedir(dp);
  int i;
  for (i = 0;; i++) {
    debug("rmdir(%s)\n", dir);
    if (rmdir(dir) == 0)
      break;
    if (i < 100) {
      if (errno == EROFS) {
        debug("ignoring EROFS\n");
        break;
      }
      if (errno == EBUSY) {
        debug("umount(%s)\n", dir);
        if (umount2(dir, MNT_DETACH))
          exitf("umount(%s) failed", dir);
        continue;
      }
      if (errno == ENOTEMPTY) {
        if (iter < 100) {
          iter++;
          goto retry;
        }
      }
    }
    exitf("rmdir(%s) failed", dir);
  }
}

static uint64_t current_time_ms()
{
  struct timespec ts;

  if (clock_gettime(CLOCK_MONOTONIC, &ts))
    fail("clock_gettime failed");
  return (uint64_t)ts.tv_sec * 1000 + (uint64_t)ts.tv_nsec / 1000000;
}

static void test();

void loop()
{
  int iter;
  for (iter = 0;; iter++) {
    char cwdbuf[256];
    sprintf(cwdbuf, "./%d", iter);
    if (mkdir(cwdbuf, 0777))
      fail("failed to mkdir");
    int pid = fork();
    if (pid < 0)
      fail("clone failed");
    if (pid == 0) {
      prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
      setpgrp();
      if (chdir(cwdbuf))
        fail("failed to chdir");
      test();
      exit(0);
    }
    int status = 0;
    uint64_t start = current_time_ms();
    for (;;) {
      int res = waitpid(pid, &status, __WALL | WNOHANG);
      int errno0 = errno;
      if (res == pid)
        break;
      usleep(1000);
      if (current_time_ms() - start > 5 * 1000) {
        kill(-pid, SIGKILL);
        kill(pid, SIGKILL);
        waitpid(pid, &status, __WALL);
        break;
      }
    }
    remove_dir(cwdbuf);
  }
}

long r[7];
void* thr(void* arg)
{
  switch ((long)arg) {
  case 0:
    r[0] =
        execute_syscall(__NR_mmap, 0x20000000ul, 0xe43000ul, 0x3ul,
                        0x32ul, 0xfffffffffffffffful, 0x0ul, 0, 0, 0);
    break;
  case 1:
    r[1] = execute_syscall(__NR_socket, 0x10ul, 0x3ul, 0x0ul, 0, 0, 0,
                           0, 0, 0);
    break;
  case 2:
    NONFAILING(memcpy((void*)0x20e42fe1,
                      "\x1f\x00\x00\x00\x1a\x00\x03\xf2\x00\x00\x13\xff"
                      "\x07\x00\x00\x77\x00\x00\xff\xff\xff\x7f\xd8\x00"
                      "\x00\x00\x00\x00\x00\x00\x34",
                      31));
    r[3] = execute_syscall(__NR_write, r[1], 0x20e42fe1ul, 0x1ful, 0, 0,
                           0, 0, 0, 0);
    break;
  case 3:
    NONFAILING(*(uint64_t*)0x20e41000 = (uint64_t)0x20e3ef11);
    NONFAILING(*(uint64_t*)0x20e41008 = (uint64_t)0x1);
    r[6] = execute_syscall(__NR_readv, r[1], 0x20e41000ul, 0x1ul, 0, 0,
                           0, 0, 0, 0);
    break;
  }
  return 0;
}

void test()
{
  long i;
  pthread_t th[8];

  memset(r, -1, sizeof(r));
  srand(getpid());
  for (i = 0; i < 4; i++) {
    pthread_create(&th[i], 0, thr, (void*)i);
    usleep(10000);
  }
  usleep(100000);
}

int main()
{
  setup_main_process(0, false);
  int pid = do_sandbox_none();
  int status = 0;
  while (waitpid(pid, &status, __WALL) != pid) {
  }
  return 0;
}

^ permalink raw reply

* Re: [PATCH 1/1] net: caif: fix ineffective error check
From: Pan Bian @ 2016-12-03 15:38 UTC (permalink / raw)
  To: Sergei Shtylyov, Dmitry Tarnyagin, David S. Miller
  Cc: PanBian, netdev, linux-kernel
In-Reply-To: <f214a441-9e4b-6c40-f09d-32a6d1ea4dd0@cogentembedded.com>

From: PanBian <bianpan2016@163.com>

Hello Sergei,

On Sat, Dec 03, 2016 at 04:17:51PM +0300, Sergei Shtylyov wrote:
> Hello.
> 
> On 12/3/2016 2:18 PM, Pan Bian wrote:
> 
> >In function caif_sktinit_module(), the check of the return value of
> >sock_register() seems ineffective. This patch fixes it.
> >
> >Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=188751
> >
> >Signed-off-by: Pan Bian <bianpan2016@163.com>
> >---
> > net/caif/caif_socket.c | 2 +-
> > 1 file changed, 1 insertion(+), 1 deletion(-)
> >
> >diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
> >index aa209b1..2a689a3 100644
> >--- a/net/caif/caif_socket.c
> >+++ b/net/caif/caif_socket.c
> >@@ -1108,7 +1108,7 @@ static int caif_create(struct net *net, struct socket *sock, int protocol,
> > static int __init caif_sktinit_module(void)
> > {
> > 	int err = sock_register(&caif_family_ops);
> >-	if (!err)
> >+	if (err)
> > 		return err;
> 
>    Why not just:
> 
> 	return sock_register(&caif_family_ops);
>
	Your solution looks much cleaner.

	But I am not really sure whether it is the author's intention to
	return 0 anyway. Do you have any idea?

	Thanks!
> > 	return 0;
> > }
> 
> MBR, Sergei
> 

Best regards,
Pan

^ permalink raw reply

* Re: [PATCH net-next 1/4] bpf: xdp: Allow head adjustment in XDP prog
From: Jesper Dangaard Brouer @ 2016-12-03 15:24 UTC (permalink / raw)
  To: Martin KaFai Lau
  Cc: brouer, netdev, Alexei Starovoitov, Brenden Blanco,
	Daniel Borkmann, David Miller, Saeed Mahameed, Tariq Toukan,
	Kernel Team
In-Reply-To: <1480721013-1047541-2-git-send-email-kafai@fb.com>

On Fri, 2 Dec 2016 15:23:30 -0800
Martin KaFai Lau <kafai@fb.com> wrote:

> -bool bpf_helper_changes_skb_data(void *func)
> +BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
> +{
> +	/* Both mlx4 and mlx5 driver align each packet to PAGE_SIZE when
> +	 * XDP prog is set.
> +	 * If the above is not true for the other drivers to support
> +	 * bpf_xdp_adjust_head, struct xdp_buff can be extended.
> +	 */
> +	void *head = (void *)((unsigned long)xdp->data & PAGE_MASK);
> +	void *new_data = xdp->data + offset;
> +
> +	if (new_data < head || new_data >= xdp->data_end)
> +		/* The packet length must be >=1 */
> +		return -EINVAL;
> +
> +	xdp->data = new_data;
> +
> +	return 0;
> +}

First time I read this code, I was about to complain about you didn't
use XDP_PACKET_HEADROOM in your boundary check.  But then I noticed the
PAGE_MASK.  If you rename "head" to "page_boundary" or "page_start"
then IMHO the code would be more readable.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [PATCH 1/1] net: ethernet: 3com: set error code on failures
From: Pan Bian @ 2016-12-03 15:23 UTC (permalink / raw)
  To: Lino Sanfilippo, David Dillow, netdev; +Cc: PanBian, linux-kernel
In-Reply-To: <06dd8fc8-0c39-e79d-7e34-22bfab17d8a3@gmx.de>

From: PanBian <bianpan2016@163.com>

Hi, Lino,

There is no special reason to map different errors to -EIO. In the
original source code, err is set to -EIO on most error paths. I copied 
that. I guess it has no difference with the error value provided by the 
called functions in the view of callers of typhoon_init_one().

Thanks!

Best regards,
Pan

On Sat, Dec 03, 2016 at 02:53:07PM +0100, Lino Sanfilippo wrote:
> Hi,
> 
> On 03.12.2016 14:24, Pan Bian wrote:
> > From: Pan Bian <bianpan2016@163.com>
> > 
> > In function typhoon_init_one(), returns the value of variable err on
> > errors. However, on some error paths, variable err is not set to a
> > negative errno. This patch assigns "-EIO" to err on those paths.
> > 
> > Signed-off-by: Pan Bian <bianpan2016@163.com>
> 
> >  
> > @@ -2409,6 +2410,7 @@ enum state_values {
> >  	INIT_COMMAND_WITH_RESPONSE(&xp_cmd, TYPHOON_CMD_READ_VERSIONS);
> >  	if(typhoon_issue_command(tp, 1, &xp_cmd, 3, xp_resp) < 0) {
> >  		err_msg = "Could not get Sleep Image version";
> > +		err = -EIO;
> >  		goto error_out_reset;
> >  	}
> >  
> > @@ -2451,6 +2453,7 @@ enum state_values {
> >  
> >  	if(register_netdev(dev) < 0) {
> >  		err_msg = "unable to register netdev";
> > +		err = -EIO;
> >  		goto error_out_reset;
> >  	}
> >  
> > 
> 
> Why not return the error value provided by the called functions? Is there a reason
> to map different errors to -EIO?
> 
> Regards,
> Lino

^ permalink raw reply

* Re: [PATCH net-next 0/4]: Allow head adjustment in XDP prog
From: Jesper Dangaard Brouer @ 2016-12-03 15:17 UTC (permalink / raw)
  To: Martin KaFai Lau
  Cc: brouer, netdev, Alexei Starovoitov, Brenden Blanco,
	Daniel Borkmann, David Miller, Saeed Mahameed, Tariq Toukan,
	Kernel Team
In-Reply-To: <1480721013-1047541-1-git-send-email-kafai@fb.com>

On Fri, 2 Dec 2016 15:23:29 -0800
Martin KaFai Lau <kafai@fb.com> wrote:

> This series adds a helper to allow head adjustment in XDP prog.  mlx4
> driver has been modified to support this feature.  An example is written
> to encapsulate a packet with an IPv4/v6 header and then XDP_TX it
> out.

Hi Martin,

It is great to see work in this area.

Push/pop of headers is listed as on of the missing features here:
 https://prototype-kernel.readthedocs.io/en/latest/networking/XDP/implementation/missing_features.html
We can hopefully soon cross that of the list :-)

Header push and pop desc:
 https://prototype-kernel.readthedocs.io/en/latest/networking/XDP/design/requirements.html#header-push-and-pop

Thanks for working on this! :-)))
-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox