Netdev List
 help / color / mirror / Atom feed
* [bpf-next v2 3/9] net/ipv6: Extract table lookup from ip6_pol_route
From: David Ahern @ 2018-05-04  2:54 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180504025432.23451-1-dsahern@gmail.com>

ip6_pol_route is used for ingress and egress FIB lookups. Refactor it
moving the table lookup into a separate fib6_table_lookup that can be
invoked separately and export the new function.

ip6_pol_route now calls fib6_table_lookup and uses the result to generate
a dst based rt6_info.

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h |  4 ++++
 net/ipv6/route.c      | 39 +++++++++++++++++++++++++--------------
 2 files changed, 29 insertions(+), 14 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 80d76d8dc683..4f7b8f59ea6d 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -376,6 +376,10 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 				   const struct sk_buff *skb,
 				   int flags, pol_lookup_t lookup);
 
+/* called with rcu lock held; caller needs to select path */
+struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
+				    int oif, struct flowi6 *fl6, int strict);
+
 struct fib6_info *fib6_multipath_select(const struct net *net,
 					struct fib6_info *match,
 					struct flowi6 *fl6, int oif,
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 58af969f3a2c..d0ace0c5c3e9 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1800,21 +1800,12 @@ void rt6_age_exceptions(struct fib6_info *rt,
 	rcu_read_unlock_bh();
 }
 
-struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
-			       int oif, struct flowi6 *fl6,
-			       const struct sk_buff *skb, int flags)
+/* must be called with rcu lock held */
+struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
+				    int oif, struct flowi6 *fl6, int strict)
 {
 	struct fib6_node *fn, *saved_fn;
 	struct fib6_info *f6i;
-	struct rt6_info *rt;
-	int strict = 0;
-
-	strict |= flags & RT6_LOOKUP_F_IFACE;
-	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
-	if (net->ipv6.devconf_all->forwarding == 0)
-		strict |= RT6_LOOKUP_F_REACHABLE;
-
-	rcu_read_lock();
 
 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 	saved_fn = fn;
@@ -1824,8 +1815,6 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 
 redo_rt6_select:
 	f6i = rt6_select(net, fn, oif, strict);
-	if (f6i->fib6_nsiblings)
-		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
 	if (f6i == net->ipv6.fib6_null_entry) {
 		fn = fib6_backtrack(fn, &fl6->saddr);
 		if (fn)
@@ -1838,6 +1827,28 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 		}
 	}
 
+	return f6i;
+}
+
+struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
+			       int oif, struct flowi6 *fl6,
+			       const struct sk_buff *skb, int flags)
+{
+	struct fib6_info *f6i;
+	struct rt6_info *rt;
+	int strict = 0;
+
+	strict |= flags & RT6_LOOKUP_F_IFACE;
+	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
+	if (net->ipv6.devconf_all->forwarding == 0)
+		strict |= RT6_LOOKUP_F_REACHABLE;
+
+	rcu_read_lock();
+
+	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
+	if (f6i->fib6_nsiblings)
+		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
+
 	if (f6i == net->ipv6.fib6_null_entry) {
 		rt = net->ipv6.ip6_null_entry;
 		rcu_read_unlock();
-- 
2.11.0

^ permalink raw reply related

* [bpf-next v2 4/9] net/ipv6: Refactor fib6_rule_action
From: David Ahern @ 2018-05-04  2:54 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180504025432.23451-1-dsahern@gmail.com>

Move source address lookup from fib6_rule_action to a helper. It will be
used in a later patch by a second variant for fib6_rule_action.

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/fib6_rules.c | 52 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 31 insertions(+), 21 deletions(-)

diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index 6547fc6491a6..d040c4bff3a0 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -96,6 +96,31 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 	return &net->ipv6.ip6_null_entry->dst;
 }
 
+static int fib6_rule_saddr(struct net *net, struct fib_rule *rule, int flags,
+			   struct flowi6 *flp6, const struct net_device *dev)
+{
+	struct fib6_rule *r = (struct fib6_rule *)rule;
+
+	/* If we need to find a source address for this traffic,
+	 * we check the result if it meets requirement of the rule.
+	 */
+	if ((rule->flags & FIB_RULE_FIND_SADDR) &&
+	    r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) {
+		struct in6_addr saddr;
+
+		if (ipv6_dev_get_saddr(net, dev, &flp6->daddr,
+				       rt6_flags2srcprefs(flags), &saddr))
+			return -EAGAIN;
+
+		if (!ipv6_prefix_equal(&saddr, &r->src.addr, r->src.plen))
+			return -EAGAIN;
+
+		flp6->saddr = saddr;
+	}
+
+	return 0;
+}
+
 static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
 			    int flags, struct fib_lookup_arg *arg)
 {
@@ -134,27 +159,12 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
 
 	rt = lookup(net, table, flp6, arg->lookup_data, flags);
 	if (rt != net->ipv6.ip6_null_entry) {
-		struct fib6_rule *r = (struct fib6_rule *)rule;
-
-		/*
-		 * If we need to find a source address for this traffic,
-		 * we check the result if it meets requirement of the rule.
-		 */
-		if ((rule->flags & FIB_RULE_FIND_SADDR) &&
-		    r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) {
-			struct in6_addr saddr;
-
-			if (ipv6_dev_get_saddr(net,
-					       ip6_dst_idev(&rt->dst)->dev,
-					       &flp6->daddr,
-					       rt6_flags2srcprefs(flags),
-					       &saddr))
-				goto again;
-			if (!ipv6_prefix_equal(&saddr, &r->src.addr,
-					       r->src.plen))
-				goto again;
-			flp6->saddr = saddr;
-		}
+		err = fib6_rule_saddr(net, rule, flags, flp6,
+				      ip6_dst_idev(&rt->dst)->dev);
+
+		if (err == -EAGAIN)
+			goto again;
+
 		err = rt->dst.error;
 		if (err != -EAGAIN)
 			goto out;
-- 
2.11.0

^ permalink raw reply related

* [bpf-next v2 5/9] net/ipv6: Add fib6_lookup
From: David Ahern @ 2018-05-04  2:54 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180504025432.23451-1-dsahern@gmail.com>

Add IPv6 equivalent to fib_lookup. Does a fib lookup, including rules,
but returns a FIB entry, fib6_info, rather than a dst based rt6_info.
fib6_lookup is any where from 140% (MULTIPLE_TABLES config disabled)
to 60% faster than any of the dst based lookup methods (without custom
rules) and 25% faster with custom rules (e.g., l3mdev rule).

Since the lookup function has a completely different signature,
fib6_rule_action is split into 2 paths: the existing one is
renamed __fib6_rule_action and a new one for the fib6_info path
is added. fib6_rule_action decides which to call based on the
lookup_ptr. If it is fib6_table_lookup then the new path is taken.

Caller must hold rcu lock as no reference is taken on the returned
fib entry.

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h |  6 ++++
 net/ipv6/fib6_rules.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++++--
 net/ipv6/ip6_fib.c    |  7 +++++
 3 files changed, 97 insertions(+), 2 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 4f7b8f59ea6d..d920dd00139b 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -376,6 +376,12 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 				   const struct sk_buff *skb,
 				   int flags, pol_lookup_t lookup);
 
+/* called with rcu lock held; can return error pointer
+ * caller needs to select path
+ */
+struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+			      int flags);
+
 /* called with rcu lock held; caller needs to select path */
 struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
 				    int oif, struct flowi6 *fl6, int strict);
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index d040c4bff3a0..f590446595d8 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -60,6 +60,39 @@ unsigned int fib6_rules_seq_read(struct net *net)
 	return fib_rules_seq_read(net, AF_INET6);
 }
 
+/* called with rcu lock held; no reference taken on fib6_info */
+struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+			      int flags)
+{
+	struct fib6_info *f6i;
+	int err;
+
+	if (net->ipv6.fib6_has_custom_rules) {
+		struct fib_lookup_arg arg = {
+			.lookup_ptr = fib6_table_lookup,
+			.lookup_data = &oif,
+			.flags = FIB_LOOKUP_NOREF,
+		};
+
+		l3mdev_update_flow(net, flowi6_to_flowi(fl6));
+
+		err = fib_rules_lookup(net->ipv6.fib6_rules_ops,
+				       flowi6_to_flowi(fl6), flags, &arg);
+		if (err)
+			return ERR_PTR(err);
+
+		f6i = arg.result ? : net->ipv6.fib6_null_entry;
+	} else {
+		f6i = fib6_table_lookup(net, net->ipv6.fib6_local_tbl,
+					oif, fl6, flags);
+		if (!f6i || f6i == net->ipv6.fib6_null_entry)
+			f6i = fib6_table_lookup(net, net->ipv6.fib6_main_tbl,
+						oif, fl6, flags);
+	}
+
+	return f6i;
+}
+
 struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 				   const struct sk_buff *skb,
 				   int flags, pol_lookup_t lookup)
@@ -121,8 +154,48 @@ static int fib6_rule_saddr(struct net *net, struct fib_rule *rule, int flags,
 	return 0;
 }
 
-static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
-			    int flags, struct fib_lookup_arg *arg)
+static int fib6_rule_action_alt(struct fib_rule *rule, struct flowi *flp,
+				int flags, struct fib_lookup_arg *arg)
+{
+	struct flowi6 *flp6 = &flp->u.ip6;
+	struct net *net = rule->fr_net;
+	struct fib6_table *table;
+	struct fib6_info *f6i;
+	int err = -EAGAIN, *oif;
+	u32 tb_id;
+
+	switch (rule->action) {
+	case FR_ACT_TO_TBL:
+		break;
+	case FR_ACT_UNREACHABLE:
+		return -ENETUNREACH;
+	case FR_ACT_PROHIBIT:
+		return -EACCES;
+	case FR_ACT_BLACKHOLE:
+	default:
+		return -EINVAL;
+	}
+
+	tb_id = fib_rule_get_table(rule, arg);
+	table = fib6_get_table(net, tb_id);
+	if (!table)
+		return -EAGAIN;
+
+	oif = (int *)arg->lookup_data;
+	f6i = fib6_table_lookup(net, table, *oif, flp6, flags);
+	if (f6i != net->ipv6.fib6_null_entry) {
+		err = fib6_rule_saddr(net, rule, flags, flp6,
+				      fib6_info_nh_dev(f6i));
+
+		if (likely(!err))
+			arg->result = f6i;
+	}
+
+	return err;
+}
+
+static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
+			      int flags, struct fib_lookup_arg *arg)
 {
 	struct flowi6 *flp6 = &flp->u.ip6;
 	struct rt6_info *rt = NULL;
@@ -182,6 +255,15 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
 	return err;
 }
 
+static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
+			    int flags, struct fib_lookup_arg *arg)
+{
+	if (arg->lookup_ptr == fib6_table_lookup)
+		return fib6_rule_action_alt(rule, flp, flags, arg);
+
+	return __fib6_rule_action(rule, flp, flags, arg);
+}
+
 static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
 {
 	struct rt6_info *rt = (struct rt6_info *) arg->result;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 4cfffa0f676e..0b94c0a631cb 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -354,6 +354,13 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 	return &rt->dst;
 }
 
+/* called with rcu lock held; no reference taken on fib6_info */
+struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+			      int flags)
+{
+	return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, flags);
+}
+
 static void __net_init fib6_tables_init(struct net *net)
 {
 	fib6_link_table(net, net->ipv6.fib6_main_tbl);
-- 
2.11.0

^ permalink raw reply related

* [bpf-next v2 6/9] net/ipv6: Update fib6 tracepoint to take fib6_info
From: David Ahern @ 2018-05-04  2:54 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180504025432.23451-1-dsahern@gmail.com>

Similar to IPv4, IPv6 should use the FIB lookup result in the
tracepoint.

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/fib6.h | 14 +++++++-------
 net/ipv6/route.c            | 14 ++++++--------
 2 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/include/trace/events/fib6.h b/include/trace/events/fib6.h
index 7e8d48a81b91..1b8d951e3c12 100644
--- a/include/trace/events/fib6.h
+++ b/include/trace/events/fib6.h
@@ -12,10 +12,10 @@
 
 TRACE_EVENT(fib6_table_lookup,
 
-	TP_PROTO(const struct net *net, const struct rt6_info *rt,
+	TP_PROTO(const struct net *net, const struct fib6_info *f6i,
 		 struct fib6_table *table, const struct flowi6 *flp),
 
-	TP_ARGS(net, rt, table, flp),
+	TP_ARGS(net, f6i, table, flp),
 
 	TP_STRUCT__entry(
 		__field(	u32,	tb_id		)
@@ -48,20 +48,20 @@ TRACE_EVENT(fib6_table_lookup,
 		in6 = (struct in6_addr *)__entry->dst;
 		*in6 = flp->daddr;
 
-		if (rt->rt6i_idev) {
-			__assign_str(name, rt->rt6i_idev->dev->name);
+		if (f6i->fib6_nh.nh_dev) {
+			__assign_str(name, f6i->fib6_nh.nh_dev);
 		} else {
 			__assign_str(name, "");
 		}
-		if (rt == net->ipv6.ip6_null_entry) {
+		if (f6i == net->ipv6.fib6_null_entry) {
 			struct in6_addr in6_zero = {};
 
 			in6 = (struct in6_addr *)__entry->gw;
 			*in6 = in6_zero;
 
-		} else if (rt) {
+		} else if (f6i) {
 			in6 = (struct in6_addr *)__entry->gw;
-			*in6 = rt->rt6i_gateway;
+			*in6 = f6i->fib6_nh.nh_gw;
 		}
 	),
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index d0ace0c5c3e9..cf8de6899581 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1078,6 +1078,8 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 			goto restart;
 	}
 
+	trace_fib6_table_lookup(net, f6i, table, fl6);
+
 	/* Search through exception table */
 	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
 	if (rt) {
@@ -1096,8 +1098,6 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 
 	rcu_read_unlock();
 
-	trace_fib6_table_lookup(net, rt, table, fl6);
-
 	return rt;
 }
 
@@ -1827,6 +1827,8 @@ struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
 		}
 	}
 
+	trace_fib6_table_lookup(net, f6i, table, fl6);
+
 	return f6i;
 }
 
@@ -1853,7 +1855,6 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 		rt = net->ipv6.ip6_null_entry;
 		rcu_read_unlock();
 		dst_hold(&rt->dst);
-		trace_fib6_table_lookup(net, rt, table, fl6);
 		return rt;
 	}
 
@@ -1864,7 +1865,6 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 			dst_use_noref(&rt->dst, jiffies);
 
 		rcu_read_unlock();
-		trace_fib6_table_lookup(net, rt, table, fl6);
 		return rt;
 	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
 			    !(f6i->fib6_flags & RTF_GATEWAY))) {
@@ -1890,9 +1890,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 			dst_hold(&uncached_rt->dst);
 		}
 
-		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
 		return uncached_rt;
-
 	} else {
 		/* Get a percpu copy */
 
@@ -1906,7 +1904,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 
 		local_bh_enable();
 		rcu_read_unlock();
-		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
+
 		return pcpu_rt;
 	}
 }
@@ -2486,7 +2484,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
 
 	rcu_read_unlock();
 
-	trace_fib6_table_lookup(net, ret, table, fl6);
+	trace_fib6_table_lookup(net, rt, table, fl6);
 	return ret;
 };
 
-- 
2.11.0

^ permalink raw reply related

* [bpf-next v2 7/9] net/ipv6: Add fib lookup stubs for use in bpf helper
From: David Ahern @ 2018-05-04  2:54 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180504025432.23451-1-dsahern@gmail.com>

Add stubs to retrieve a handle to an IPv6 FIB table, fib6_get_table,
a stub to do a lookup in a specific table, fib6_table_lookup, and
a stub for a full route lookup.

The stubs are needed for core bpf code to handle the case when the
IPv6 module is not builtin.

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
---
 include/net/addrconf.h   | 14 ++++++++++++++
 net/ipv6/addrconf_core.c | 33 ++++++++++++++++++++++++++++++++-
 net/ipv6/af_inet6.c      |  6 +++++-
 3 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 8312cc25a3af..ff766ab207e0 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -223,6 +223,20 @@ struct ipv6_stub {
 				 const struct in6_addr *addr);
 	int (*ipv6_dst_lookup)(struct net *net, struct sock *sk,
 			       struct dst_entry **dst, struct flowi6 *fl6);
+
+	struct fib6_table *(*fib6_get_table)(struct net *net, u32 id);
+	struct fib6_info *(*fib6_lookup)(struct net *net, int oif,
+					 struct flowi6 *fl6, int flags);
+	struct fib6_info *(*fib6_table_lookup)(struct net *net,
+					      struct fib6_table *table,
+					      int oif, struct flowi6 *fl6,
+					      int flags);
+	struct fib6_info *(*fib6_multipath_select)(const struct net *net,
+						   struct fib6_info *f6i,
+						   struct flowi6 *fl6, int oif,
+						   const struct sk_buff *skb,
+						   int strict);
+
 	void (*udpv6_encap_enable)(void);
 	void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr,
 			      const struct in6_addr *solicited_addr,
diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c
index 32b564dfd02a..2fe754fd4f5e 100644
--- a/net/ipv6/addrconf_core.c
+++ b/net/ipv6/addrconf_core.c
@@ -134,8 +134,39 @@ static int eafnosupport_ipv6_dst_lookup(struct net *net, struct sock *u1,
 	return -EAFNOSUPPORT;
 }
 
+static struct fib6_table *eafnosupport_fib6_get_table(struct net *net, u32 id)
+{
+	return NULL;
+}
+
+static struct fib6_info *
+eafnosupport_fib6_table_lookup(struct net *net, struct fib6_table *table,
+			       int oif, struct flowi6 *fl6, int flags)
+{
+	return NULL;
+}
+
+static struct fib6_info *
+eafnosupport_fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+			 int flags)
+{
+	return NULL;
+}
+
+static struct fib6_info *
+eafnosupport_fib6_multipath_select(const struct net *net, struct fib6_info *f6i,
+				   struct flowi6 *fl6, int oif,
+				   const struct sk_buff *skb, int strict)
+{
+	return f6i;
+}
+
 const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
-	.ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup,
+	.ipv6_dst_lookup   = eafnosupport_ipv6_dst_lookup,
+	.fib6_get_table    = eafnosupport_fib6_get_table,
+	.fib6_table_lookup = eafnosupport_fib6_table_lookup,
+	.fib6_lookup       = eafnosupport_fib6_lookup,
+	.fib6_multipath_select = eafnosupport_fib6_multipath_select,
 };
 EXPORT_SYMBOL_GPL(ipv6_stub);
 
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 36d622c477b1..c0e8255d50bb 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -887,7 +887,11 @@ static struct pernet_operations inet6_net_ops = {
 static const struct ipv6_stub ipv6_stub_impl = {
 	.ipv6_sock_mc_join = ipv6_sock_mc_join,
 	.ipv6_sock_mc_drop = ipv6_sock_mc_drop,
-	.ipv6_dst_lookup = ip6_dst_lookup,
+	.ipv6_dst_lookup   = ip6_dst_lookup,
+	.fib6_get_table	   = fib6_get_table,
+	.fib6_table_lookup = fib6_table_lookup,
+	.fib6_lookup       = fib6_lookup,
+	.fib6_multipath_select = fib6_multipath_select,
 	.udpv6_encap_enable = udpv6_encap_enable,
 	.ndisc_send_na = ndisc_send_na,
 	.nd_tbl	= &nd_tbl,
-- 
2.11.0

^ permalink raw reply related

* [bpf-next v2 8/9] bpf: Provide helper to do forwarding lookups in kernel FIB table
From: David Ahern @ 2018-05-04  2:54 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180504025432.23451-1-dsahern@gmail.com>

Provide a helper for doing a FIB and neighbor lookup in the kernel
tables from an XDP program. The helper provides a fastpath for forwarding
packets. If the packet is a local delivery or for any reason is not a
simple lookup and forward, the packet continues up the stack.

If it is to be forwarded, the forwarding can be done directly if the
neighbor is already known. If the neighbor does not exist, the first
few packets go up the stack for neighbor resolution. Once resolved, the
xdp program provides the fast path.

On successful lookup the nexthop dmac, current device smac and egress
device index are returned.

The API supports IPv4, IPv6 and MPLS protocols, but only IPv4 and IPv6
are implemented in this patch. The API includes layer 4 parameters if
the XDP program chooses to do deep packet inspection to allow compare
against ACLs implemented as FIB rules.

Header rewrite is left to the XDP program.

The lookup takes 2 flags:
- BPF_FIB_LOOKUP_DIRECT to do a lookup that bypasses FIB rules and goes
  straight to the table associated with the device (expert setting for
  those looking to maximize throughput)

- BPF_FIB_LOOKUP_OUTPUT to do a lookup from the egress perspective.
  Default is an ingress lookup.

Initial performance numbers collected by Jesper, forwarded packets/sec:

       Full stack    XDP FIB lookup    XDP Direct lookup
IPv4   1,947,969       7,074,156          7,415,333
IPv6   1,728,000       6,165,504          7,262,720

These number are single CPU core forwarding on a Broadwell
E5-1650 v4 @ 3.60GHz.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/uapi/linux/bpf.h |  83 ++++++++++++++-
 net/core/filter.c        | 266 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 348 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 93d5a4eeec2a..ddc566cb7492 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -10,6 +10,8 @@
 
 #include <linux/types.h>
 #include <linux/bpf_common.h>
+#include <linux/if_ether.h>
+#include <linux/in6.h>
 
 /* Extended instruction set based on top of classic BPF */
 
@@ -1826,6 +1828,33 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
+ *
+ * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags)
+ *	Description
+ *		Do FIB lookup in kernel tables using parameters in *params*.
+ *		If lookup is successful and result shows packet is to be
+ *		forwarded, the neighbor tables are searched for the nexthop.
+ *		If successful (ie., FIB lookup shows forwarding and nexthop
+ *		is resolved), the nexthop address is returned in ipv4_dst,
+ *		ipv6_dst or mpls_out based on family, smac is set to mac
+ *		address of egress device, dmac is set to nexthop mac address,
+ *		rt_metric is set to metric from route.
+ *
+ *             *plen* argument is the size of the passed in struct.
+ *             *flags* argument can be one or more BPF_FIB_LOOKUP_ flags:
+ *
+ *             **BPF_FIB_LOOKUP_DIRECT** means do a direct table lookup vs
+ *             full lookup using FIB rules
+ *             **BPF_FIB_LOOKUP_OUTPUT** means do lookup from an egress
+ *             perspective (default is ingress)
+ *
+ *             *ctx* is either **struct xdp_md** for XDP programs or
+ *             **struct sk_buff** tc cls_act programs.
+ *
+ *     Return
+ *             Egress device index on success, 0 if packet needs to continue
+ *             up the stack for further processing or a negative error in case
+ *             of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -1896,7 +1925,8 @@ union bpf_attr {
 	FN(xdp_adjust_tail),		\
 	FN(skb_get_xfrm_state),		\
 	FN(get_stack),			\
-	FN(skb_load_bytes_relative),
+	FN(skb_load_bytes_relative),	\
+	FN(fib_lookup),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2310,4 +2340,55 @@ struct bpf_raw_tracepoint_args {
 	__u64 args[0];
 };
 
+/* DIRECT:  Skip the FIB rules and go to FIB table associated with device
+ * OUTPUT:  Do lookup from egress perspective; default is ingress
+ */
+#define BPF_FIB_LOOKUP_DIRECT  BIT(0)
+#define BPF_FIB_LOOKUP_OUTPUT  BIT(1)
+
+struct bpf_fib_lookup {
+	/* input */
+	__u8	family;   /* network family, AF_INET, AF_INET6, AF_MPLS */
+
+	/* set if lookup is to consider L4 data - e.g., FIB rules */
+	__u8	l4_protocol;
+	__be16	sport;
+	__be16	dport;
+
+	/* total length of packet from network header - used for MTU check */
+	__u16	tot_len;
+	__u32	ifindex;  /* L3 device index for lookup */
+
+	union {
+		/* inputs to lookup */
+		__u8	tos;		/* AF_INET  */
+		__be32	flowlabel;	/* AF_INET6 */
+
+		/* output: metric of fib result */
+		__u32 rt_metric;
+	};
+
+	union {
+		__be32		mpls_in;
+		__be32		ipv4_src;
+		struct in6_addr	ipv6_src;
+	};
+
+	/* input to bpf_fib_lookup, *dst is destination address.
+	 * output: bpf_fib_lookup sets to gateway address
+	 */
+	union {
+		/* return for MPLS lookups */
+		__be32		mpls_out[4];  /* support up to 4 labels */
+		__be32		ipv4_dst;
+		struct in6_addr	ipv6_dst;
+	};
+
+	/* output */
+	__be16	h_vlan_proto;
+	__be16	h_vlan_TCI;
+	__u8	smac[ETH_ALEN];
+	__u8	dmac[ETH_ALEN];
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/net/core/filter.c b/net/core/filter.c
index 6877426c23a6..cf0d27acf1d1 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -60,6 +60,10 @@
 #include <net/xfrm.h>
 #include <linux/bpf_trace.h>
 #include <net/xdp_sock.h>
+#include <linux/inetdevice.h>
+#include <net/ip_fib.h>
+#include <net/flow.h>
+#include <net/arp.h>
 
 /**
  *	sk_filter_trim_cap - run a packet through a socket filter
@@ -4032,6 +4036,264 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
 };
 #endif
 
+#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
+static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
+				  const struct neighbour *neigh,
+				  const struct net_device *dev)
+{
+	memcpy(params->dmac, neigh->ha, ETH_ALEN);
+	memcpy(params->smac, dev->dev_addr, ETH_ALEN);
+	params->h_vlan_TCI = 0;
+	params->h_vlan_proto = 0;
+
+	return dev->ifindex;
+}
+#endif
+
+#if IS_ENABLED(CONFIG_INET)
+static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
+			       u32 flags)
+{
+	struct in_device *in_dev;
+	struct neighbour *neigh;
+	struct net_device *dev;
+	struct fib_result res;
+	struct fib_nh *nh;
+	struct flowi4 fl4;
+	int err;
+
+	dev = dev_get_by_index_rcu(net, params->ifindex);
+	if (unlikely(!dev))
+		return -ENODEV;
+
+	/* verify forwarding is enabled on this interface */
+	in_dev = __in_dev_get_rcu(dev);
+	if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
+		return 0;
+
+	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
+		fl4.flowi4_iif = 1;
+		fl4.flowi4_oif = params->ifindex;
+	} else {
+		fl4.flowi4_iif = params->ifindex;
+		fl4.flowi4_oif = 0;
+	}
+	fl4.flowi4_tos = params->tos & IPTOS_RT_MASK;
+	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+	fl4.flowi4_flags = 0;
+
+	fl4.flowi4_proto = params->l4_protocol;
+	fl4.daddr = params->ipv4_dst;
+	fl4.saddr = params->ipv4_src;
+	fl4.fl4_sport = params->sport;
+	fl4.fl4_dport = params->dport;
+
+	if (flags & BPF_FIB_LOOKUP_DIRECT) {
+		u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
+		struct fib_table *tb;
+
+		tb = fib_get_table(net, tbid);
+		if (unlikely(!tb))
+			return 0;
+
+		err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
+	} else {
+		fl4.flowi4_mark = 0;
+		fl4.flowi4_secid = 0;
+		fl4.flowi4_tun_key.tun_id = 0;
+		fl4.flowi4_uid = sock_net_uid(net, NULL);
+
+		err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
+	}
+
+	if (err || res.type != RTN_UNICAST)
+		return 0;
+
+	if (res.fi->fib_nhs > 1)
+		fib_select_path(net, &res, &fl4, NULL);
+
+	nh = &res.fi->fib_nh[res.nh_sel];
+
+	/* do not handle lwt encaps right now */
+	if (nh->nh_lwtstate)
+		return 0;
+
+	dev = nh->nh_dev;
+	if (unlikely(!dev))
+		return 0;
+
+	if (nh->nh_gw)
+		params->ipv4_dst = nh->nh_gw;
+
+	params->rt_metric = res.fi->fib_priority;
+
+	/* xdp and cls_bpf programs are run in RCU-bh so
+	 * rcu_read_lock_bh is not needed here
+	 */
+	neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst);
+	if (neigh)
+		return bpf_fib_set_fwd_params(params, neigh, dev);
+
+	return 0;
+}
+#endif
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
+			       u32 flags)
+{
+	struct neighbour *neigh;
+	struct net_device *dev;
+	struct inet6_dev *idev;
+	struct fib6_info *f6i;
+	struct flowi6 fl6;
+	int strict = 0;
+	int oif;
+
+	/* link local addresses are never forwarded */
+	if (rt6_need_strict(&params->ipv6_dst) ||
+	    rt6_need_strict(&params->ipv6_src))
+		return 0;
+
+	dev = dev_get_by_index_rcu(net, params->ifindex);
+	if (unlikely(!dev))
+		return -ENODEV;
+
+	idev = __in6_dev_get_safely(dev);
+	if (unlikely(!idev || !net->ipv6.devconf_all->forwarding))
+		return 0;
+
+	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
+		fl6.flowi6_iif = 1;
+		oif = fl6.flowi6_oif = params->ifindex;
+	} else {
+		oif = fl6.flowi6_iif = params->ifindex;
+		fl6.flowi6_oif = 0;
+		strict = RT6_LOOKUP_F_HAS_SADDR;
+	}
+	fl6.flowlabel = params->flowlabel;
+	fl6.flowi6_scope = 0;
+	fl6.flowi6_flags = 0;
+	fl6.mp_hash = 0;
+
+	fl6.flowi6_proto = params->l4_protocol;
+	fl6.daddr = params->ipv6_dst;
+	fl6.saddr = params->ipv6_src;
+	fl6.fl6_sport = params->sport;
+	fl6.fl6_dport = params->dport;
+
+	if (flags & BPF_FIB_LOOKUP_DIRECT) {
+		u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
+		struct fib6_table *tb;
+
+		tb = ipv6_stub->fib6_get_table(net, tbid);
+		if (unlikely(!tb))
+			return 0;
+
+		f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict);
+	} else {
+		fl6.flowi6_mark = 0;
+		fl6.flowi6_secid = 0;
+		fl6.flowi6_tun_key.tun_id = 0;
+		fl6.flowi6_uid = sock_net_uid(net, NULL);
+
+		f6i = ipv6_stub->fib6_lookup(net, oif, &fl6, strict);
+	}
+
+	if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry))
+		return 0;
+
+	if (unlikely(f6i->fib6_flags & RTF_REJECT ||
+	    f6i->fib6_type != RTN_UNICAST))
+		return 0;
+
+	if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0)
+		f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6,
+						       fl6.flowi6_oif, NULL,
+						       strict);
+
+	if (f6i->fib6_nh.nh_lwtstate)
+		return 0;
+
+	if (f6i->fib6_flags & RTF_GATEWAY)
+		params->ipv6_dst = f6i->fib6_nh.nh_gw;
+
+	dev = f6i->fib6_nh.nh_dev;
+	params->rt_metric = f6i->fib6_metric;
+
+	/* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
+	 * not needed here. Can not use __ipv6_neigh_lookup_noref here
+	 * because we need to get nd_tbl via the stub
+	 */
+	neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
+				      ndisc_hashfn, &params->ipv6_dst, dev);
+	if (neigh)
+		return bpf_fib_set_fwd_params(params, neigh, dev);
+
+	return 0;
+}
+#endif
+
+BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
+	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
+{
+	if (plen < sizeof(*params))
+		return -EINVAL;
+
+	switch (params->family) {
+#if IS_ENABLED(CONFIG_INET)
+	case AF_INET:
+		return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params,
+					   flags);
+#endif
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params,
+					   flags);
+#endif
+	}
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = {
+	.func		= bpf_xdp_fib_lookup,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type      = ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_PTR_TO_MEM,
+	.arg3_type      = ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
+	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
+{
+	if (plen < sizeof(*params))
+		return -EINVAL;
+
+	switch (params->family) {
+#if IS_ENABLED(CONFIG_INET)
+	case AF_INET:
+		return bpf_ipv4_fib_lookup(dev_net(skb->dev), params, flags);
+#endif
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		return bpf_ipv6_fib_lookup(dev_net(skb->dev), params, flags);
+#endif
+	}
+	return -ENOTSUPP;
+}
+
+static const struct bpf_func_proto bpf_skb_fib_lookup_proto = {
+	.func		= bpf_skb_fib_lookup,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type      = ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_PTR_TO_MEM,
+	.arg3_type      = ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 bpf_base_func_proto(enum bpf_func_id func_id)
 {
@@ -4181,6 +4443,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_skb_get_xfrm_state:
 		return &bpf_skb_get_xfrm_state_proto;
 #endif
+	case BPF_FUNC_fib_lookup:
+		return &bpf_skb_fib_lookup_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -4206,6 +4470,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_xdp_redirect_map_proto;
 	case BPF_FUNC_xdp_adjust_tail:
 		return &bpf_xdp_adjust_tail_proto;
+	case BPF_FUNC_fib_lookup:
+		return &bpf_xdp_fib_lookup_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
-- 
2.11.0

^ permalink raw reply related

* [bpf-next v2 9/9] samples/bpf: Add example of ipv4 and ipv6 forwarding in XDP
From: David Ahern @ 2018-05-04  2:54 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180504025432.23451-1-dsahern@gmail.com>

Simple example of fast-path forwarding. It has a serious flaw
in not verifying the egress device index supports XDP forwarding.
If the egress device does not packets are dropped.

Take this only as a simple example of fast-path forwarding.

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
---
 samples/bpf/Makefile                      |   4 +
 samples/bpf/xdp_fwd_kern.c                | 113 +++++++++++++++++++++++++
 samples/bpf/xdp_fwd_user.c                | 136 ++++++++++++++++++++++++++++++
 tools/testing/selftests/bpf/bpf_helpers.h |   3 +
 4 files changed, 256 insertions(+)
 create mode 100644 samples/bpf/xdp_fwd_kern.c
 create mode 100644 samples/bpf/xdp_fwd_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 8e0c7fb6d7cc..28513d6be1bf 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -46,6 +46,7 @@ hostprogs-y += syscall_tp
 hostprogs-y += cpustat
 hostprogs-y += xdp_adjust_tail
 hostprogs-y += xdpsock
+hostprogs-y += xdp_fwd
 
 # Libbpf dependencies
 LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o
@@ -100,6 +101,7 @@ syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o
 cpustat-objs := bpf_load.o $(LIBBPF) cpustat_user.o
 xdp_adjust_tail-objs := bpf_load.o $(LIBBPF) xdp_adjust_tail_user.o
 xdpsock-objs := bpf_load.o $(LIBBPF) xdpsock_user.o
+xdp_fwd-objs := bpf_load.o $(LIBBPF) xdp_fwd_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -154,6 +156,7 @@ always += syscall_tp_kern.o
 always += cpustat_kern.o
 always += xdp_adjust_tail_kern.o
 always += xdpsock_kern.o
+always += xdp_fwd_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -201,6 +204,7 @@ HOSTLOADLIBES_syscall_tp += -lelf
 HOSTLOADLIBES_cpustat += -lelf
 HOSTLOADLIBES_xdp_adjust_tail += -lelf
 HOSTLOADLIBES_xdpsock += -lelf -pthread
+HOSTLOADLIBES_xdp_fwd += -lelf
 
 # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
 #  make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
diff --git a/samples/bpf/xdp_fwd_kern.c b/samples/bpf/xdp_fwd_kern.c
new file mode 100644
index 000000000000..7eeaa32538b1
--- /dev/null
+++ b/samples/bpf/xdp_fwd_kern.c
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017-18 David Ahern <dsahern@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+
+#include "bpf_helpers.h"
+
+#define IPV6_FLOWINFO_MASK              cpu_to_be32(0x0FFFFFFF)
+
+struct bpf_map_def SEC("maps") tx_port = {
+	.type = BPF_MAP_TYPE_DEVMAP,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 64,
+};
+
+static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
+{
+	void *data_end = (void *)(long)ctx->data_end;
+	void *data = (void *)(long)ctx->data;
+	struct bpf_fib_lookup fib_params;
+	struct ethhdr *eth = data;
+	int out_index;
+	u16 h_proto;
+	u64 nh_off;
+
+	nh_off = sizeof(*eth);
+	if (data + nh_off > data_end)
+		return XDP_DROP;
+
+	__builtin_memset(&fib_params, 0, sizeof(fib_params));
+
+	h_proto = eth->h_proto;
+	if (h_proto == htons(ETH_P_IP)) {
+		struct iphdr *iph = data + nh_off;
+
+		if (iph + 1 > data_end)
+			return XDP_DROP;
+
+		fib_params.family	= AF_INET;
+		fib_params.tos		= iph->tos;
+		fib_params.l4_protocol	= iph->protocol;
+		fib_params.sport	= 0;
+		fib_params.dport	= 0;
+		fib_params.tot_len	= ntohs(iph->tot_len);
+		fib_params.ipv4_src	= iph->saddr;
+		fib_params.ipv4_dst	= iph->daddr;
+	} else if (h_proto == htons(ETH_P_IPV6)) {
+		struct ipv6hdr *iph = data + nh_off;
+
+		if (iph + 1 > data_end)
+			return XDP_DROP;
+
+		fib_params.family	= AF_INET6;
+		fib_params.flowlabel	= *(__be32 *)iph & IPV6_FLOWINFO_MASK;
+		fib_params.l4_protocol	= iph->nexthdr;
+		fib_params.sport	= 0;
+		fib_params.dport	= 0;
+		fib_params.tot_len	= ntohs(iph->payload_len);
+		fib_params.ipv6_src	= iph->saddr;
+		fib_params.ipv6_dst	= iph->daddr;
+	} else {
+		return XDP_PASS;
+	}
+
+	fib_params.ifindex = ctx->ingress_ifindex;
+
+	out_index = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags);
+
+	/* verify egress index has xdp support
+	 * TO-DO bpf_map_lookup_elem(&tx_port, &key) fails with
+	 *       cannot pass map_type 14 into func bpf_map_lookup_elem#1:
+	 * NOTE: without verification that egress index supports XDP
+	 *       forwarding packets are dropped.
+	 */
+	if (out_index > 0) {
+		memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN);
+		memcpy(eth->h_source, fib_params.smac, ETH_ALEN);
+		return bpf_redirect_map(&tx_port, out_index, 0);
+	}
+
+	return XDP_PASS;
+}
+
+SEC("xdp_fwd")
+int xdp_fwd_prog(struct xdp_md *ctx)
+{
+	return xdp_fwd_flags(ctx, 0);
+}
+
+SEC("xdp_fwd_direct")
+int xdp_fwd_direct_prog(struct xdp_md *ctx)
+{
+	return xdp_fwd_flags(ctx, BPF_FIB_LOOKUP_DIRECT);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdp_fwd_user.c b/samples/bpf/xdp_fwd_user.c
new file mode 100644
index 000000000000..9c6606f57126
--- /dev/null
+++ b/samples/bpf/xdp_fwd_user.c
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017-18 David Ahern <dsahern@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/bpf.h>
+#include <linux/if_link.h>
+#include <linux/limits.h>
+#include <net/if.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <libgen.h>
+
+#include "bpf_load.h"
+#include "bpf_util.h"
+#include "libbpf.h"
+
+
+static int do_attach(int idx, int fd, const char *name)
+{
+	int err;
+
+	err = bpf_set_link_xdp_fd(idx, fd, 0);
+	if (err < 0)
+		printf("ERROR: failed to attach program to %s\n", name);
+
+	return err;
+}
+
+static int do_detach(int idx, const char *name)
+{
+	int err;
+
+	err = bpf_set_link_xdp_fd(idx, -1, 0);
+	if (err < 0)
+		printf("ERROR: failed to detach program from %s\n", name);
+
+	return err;
+}
+
+static void usage(const char *prog)
+{
+	fprintf(stderr,
+		"usage: %s [OPTS] interface-list\n"
+		"\nOPTS:\n"
+		"    -d    detach program\n"
+		"    -D    direct table lookups (skip fib rules)\n",
+		prog);
+}
+
+int main(int argc, char **argv)
+{
+	char filename[PATH_MAX];
+	int opt, i, idx, err;
+	int prog_id = 0;
+	int attach = 1;
+	int ret = 0;
+
+	while ((opt = getopt(argc, argv, ":dD")) != -1) {
+		switch (opt) {
+		case 'd':
+			attach = 0;
+			break;
+		case 'D':
+			prog_id = 1;
+			break;
+		default:
+			usage(basename(argv[0]));
+			return 1;
+		}
+	}
+
+	if (optind == argc) {
+		usage(basename(argv[0]));
+		return 1;
+	}
+
+	if (attach) {
+		snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+		if (access(filename, O_RDONLY) < 0) {
+			printf("error accessing file %s: %s\n",
+				filename, strerror(errno));
+			return 1;
+		}
+
+		if (load_bpf_file(filename)) {
+			printf("%s", bpf_log_buf);
+			return 1;
+		}
+
+		if (!prog_fd[prog_id]) {
+			printf("load_bpf_file: %s\n", strerror(errno));
+			return 1;
+		}
+	}
+	if (attach) {
+		for (i = 1; i < 64; ++i)
+			bpf_map_update_elem(map_fd[0], &i, &i, 0);
+	}
+
+	for (i = optind; i < argc; ++i) {
+		idx = if_nametoindex(argv[i]);
+		if (!idx)
+			idx = strtoul(argv[i], NULL, 0);
+
+		if (!idx) {
+			fprintf(stderr, "Invalid arg\n");
+			return 1;
+		}
+		if (!attach) {
+			err = do_detach(idx, argv[i]);
+			if (err)
+				ret = err;
+		} else {
+			err = do_attach(idx, prog_fd[prog_id], argv[i]);
+			if (err)
+				ret = err;
+		}
+	}
+
+	return ret;
+}
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 265f8e0e8ada..2375d06c706b 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -103,6 +103,9 @@ static int (*bpf_skb_get_xfrm_state)(void *ctx, int index, void *state,
 	(void *) BPF_FUNC_skb_get_xfrm_state;
 static int (*bpf_get_stack)(void *ctx, void *buf, int size, int flags) =
 	(void *) BPF_FUNC_get_stack;
+static int (*bpf_fib_lookup)(void *ctx, struct bpf_fib_lookup *params,
+			     int plen, __u32 flags) =
+	(void *) BPF_FUNC_fib_lookup;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
-- 
2.11.0

^ permalink raw reply related

* [PATCH V3] net/netlink: make sure the headers line up actual value output
From: YU Bo @ 2018-05-04  2:59 UTC (permalink / raw)
  To: davem, xiyou.wangcong, yuzibode, tsu.yubo; +Cc: netdev, janitors

Making sure the headers line up properly with the actual value output of the command
`cat /proc/net/netlink`

Before the patch:
<sk       Eth Pid    Groups   Rmem     Wmem     Dump     Locks     Drops     Inode
<ffff8cd2c2f7b000 0   909    00000550 0        0        0 2        0        18946

After the patch:
>sk               Eth Pid        Groups   Rmem     Wmem     Dump  Locks    Drops    Inode
>0000000033203952 0   897        00000113 0        0        0     2        0        14906

Signed-off-by: Bo YU <tsu.yubo@gmail.com>
---
Changes in V3:
  - Drop external website for commit message
  - Modify the commit accroding to DavidM
Changes in V2:
  - Do not break the code line

 net/netlink/af_netlink.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 55342c4d5cec..2e2dd88fc79f 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2606,13 +2606,13 @@ static int netlink_seq_show(struct seq_file *seq, void *v)
 {
 	if (v == SEQ_START_TOKEN) {
 		seq_puts(seq,
-			 "sk       Eth Pid    Groups   "
-			 "Rmem     Wmem     Dump     Locks     Drops     Inode\n");
+			 "sk               Eth Pid        Groups   "
+			 "Rmem     Wmem     Dump  Locks    Drops    Inode\n");
 	} else {
 		struct sock *s = v;
 		struct netlink_sock *nlk = nlk_sk(s);

-		seq_printf(seq, "%pK %-3d %-6u %08x %-8d %-8d %d %-8d %-8d %-8lu\n",
+		seq_printf(seq, "%pK %-3d %-10u %08x %-8d %-8d %-5d %-8d %-8d %-8lu\n",
 			   s,
 			   s->sk_protocol,
 			   nlk->portid,

^ permalink raw reply related

* [PATCH V3] net/netlink: make sure the headers line up actual value output
From: YU Bo @ 2018-05-04  3:09 UTC (permalink / raw)
  To: davem, xiyou.wangcong, tsu.yubo, yuzibode; +Cc: netdev

Making sure the headers line up properly with the actual value output of the command
`cat /proc/net/netlink`

Before the patch:
<sk       Eth Pid    Groups   Rmem     Wmem     Dump     Locks     Drops     Inode
<ffff8cd2c2f7b000 0   909    00000550 0        0        0 2        0        18946

After the patch:
>sk               Eth Pid        Groups   Rmem     Wmem     Dump  Locks    Drops    Inode
>0000000033203952 0   897        00000113 0        0        0     2        0        14906

Signed-off-by: Bo YU <tsu.yubo@gmail.com>
---
Changes in V3:
  - Drop external website for commit message
  - Modify the commit accroding to DavidM
Changes in V2:
  - Do not break the code line

 net/netlink/af_netlink.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 55342c4d5cec..2e2dd88fc79f 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2606,13 +2606,13 @@ static int netlink_seq_show(struct seq_file *seq, void *v)
 {
 	if (v == SEQ_START_TOKEN) {
 		seq_puts(seq,
-			 "sk       Eth Pid    Groups   "
-			 "Rmem     Wmem     Dump     Locks     Drops     Inode\n");
+			 "sk               Eth Pid        Groups   "
+			 "Rmem     Wmem     Dump  Locks    Drops    Inode\n");
 	} else {
 		struct sock *s = v;
 		struct netlink_sock *nlk = nlk_sk(s);

-		seq_printf(seq, "%pK %-3d %-6u %08x %-8d %-8d %d %-8d %-8d %-8lu\n",
+		seq_printf(seq, "%pK %-3d %-10u %08x %-8d %-8d %-5d %-8d %-8d %-8lu\n",
 			   s,
 			   s->sk_protocol,
 			   nlk->portid,

^ permalink raw reply related

* [PATCH bpf v3] x86/cpufeature: bpf hack for clang not supporting asm goto
From: Yonghong Song @ 2018-05-04  3:31 UTC (permalink / raw)
  To: peterz, mingo, torvalds, ast, daniel, linux-kernel, x86, netdev
  Cc: kernel-team

Commit d0266046ad54 ("x86: Remove FAST_FEATURE_TESTS")
removed X86_FAST_FEATURE_TESTS and make macro static_cpu_has() always
use __always_inline function _static_cpu_has() funciton.
The static_cpu_has() uses gcc feature asm goto construct,
which is not supported by clang.

Issues
======

Currently, for BPF programs written in C, the only widely-supported
compiler is clang. Because of clang lacking support
of asm goto construct, if you try to compile
bpf programs under samples/bpf/,
   $ make -j20 && make headers_install && make samples/bpf/
you will see a lot of failures like below:

  clang  -nostdinc -isystem /usr/lib/gcc/x86_64-redhat-linux/7/include \
         -I/home/yhs/work/bpf-next/arch/x86/include \
         -I./arch/x86/include/generated  -I/home/yhs/work/bpf-next/include \
         -I./include -I/home/yhs/work/bpf-next/arch/x86/include/uapi \
         -I./arch/x86/include/generated/uapi \
         -I/home/yhs/work/bpf-next/include/uapi -I./include/generated/uapi \
         -include /home/yhs/work/bpf-next/include/linux/kconfig.h
         -Isamples/bpf \
	 -I/home/yhs/work/bpf-next/tools/testing/selftests/bpf/ \
	 -D__KERNEL__ -Wno-unused-value -Wno-pointer-sign \
	 -D__TARGET_ARCH_x86 -Wno-compare-distinct-pointer-types \
	 -Wno-gnu-variable-sized-type-not-at-end \
	 -Wno-address-of-packed-member -Wno-tautological-compare \
	 -Wno-unknown-warning-option  \
	 -O2 -emit-llvm -c /home/yhs/work/bpf-next/samples/bpf/xdp_redirect_cpu_kern.c \
         -o -| llc -march=bpf -filetype=obj -o samples/bpf/xdp_redirect_cpu_kern.o
  In file included from /home/yhs/work/bpf-next/samples/bpf/xdp_redirect_cpu_kern.c:10:
  In file included from /home/yhs/work/bpf-next/include/uapi/linux/in.h:24:
  In file included from /home/yhs/work/bpf-next/include/linux/socket.h:8:
  In file included from /home/yhs/work/bpf-next/include/linux/uio.h:13:
  In file included from /home/yhs/work/bpf-next/include/linux/thread_info.h:38:
  In file included from /home/yhs/work/bpf-next/arch/x86/include/asm/thread_info.h:53:
  /home/yhs/work/bpf-next/arch/x86/include/asm/cpufeature.h:150:2: error: 'asm goto' constructs are not supported yet
        asm_volatile_goto("1: jmp 6f\n"
        ^
  /home/yhs/work/bpf-next/include/linux/compiler-gcc.h:296:42: note: expanded from macro 'asm_volatile_goto'
  #define asm_volatile_goto(x...) do { asm goto(x); asm (""); } while (0)
                                           ^
  1 error generated.
  ...
  In file included from /home/yhs/work/bpf-next/samples/bpf/tracex4_kern.c:7:
  In file included from /home/yhs/work/bpf-next/include/linux/ptrace.h:6:
  In file included from /home/yhs/work/bpf-next/include/linux/sched.h:14:
  In file included from /home/yhs/work/bpf-next/include/linux/pid.h:5:
  In file included from /home/yhs/work/bpf-next/include/linux/rculist.h:11:
  In file included from /home/yhs/work/bpf-next/include/linux/rcupdate.h:40:
  In file included from /home/yhs/work/bpf-next/include/linux/preempt.h:81:
  In file included from /home/yhs/work/bpf-next/arch/x86/include/asm/preempt.h:7:
  In file included from /home/yhs/work/bpf-next/include/linux/thread_info.h:38:
  In file included from /home/yhs/work/bpf-next/arch/x86/include/asm/thread_info.h:53:
  /home/yhs/work/bpf-next/arch/x86/include/asm/cpufeature.h:150:2: error: 'asm goto' constructs are not supported yet
  ...

In the above, bpf compilation looks like below
  clang -target <native_arch> ... -emit-llvm -c -o - | llc -march=bpf ...
The clang compilation targets the native architecture and generates the LLVM IR bytecode, and
the llc compilation takes the IR bytecode and generates bpf instructions.

If kernel header files accessed by bpf program have asm goto construct, e.g.,
arch/x86/include/asm/cpufeature.h, the "clang -target <native_arch> ..."
compilation will fail as clang does not support asm goto yet.

Solution
=========

The solution proposed in this patch does not need user space change.
When the kernel detects the compiler has asm goto support,
it sets CC_HAVE_ASM_GOTO, the patch added the additional marco definition
__NO_CLANG_BPF_HACK. In arch/x86/include/asm/cpufeature.h, the
asm goto construct is accessed only if __NO_CLANG_BPF_HACK is defined.
This should not impact kernel compilation functionality since
for x86, we have
  ifndef CC_HAVE_ASM_GOTO
    $(error Compiler lacks asm-goto support.)
  endif
So __NO_CLANG_BPF_HACK should be always defined during kernel compilation
targeting x86.

User space code which compiles bpf program does not have
__NO_CLANG_BPF_HACK defined so it will go to alternate path
which avoids asm goto.

This approach is preferred since the already deployed bcc scripts, or
any other bpf applicaitons utilizing LLVM JIT compilation functionality,
will continue work with the new kernel without re-compilation and
re-deployment.

Note that this is a hack in the kernel to workaround bpf compilation issue.
The hack will be removed once clang starts to support asm goto.

Fixes: d0266046ad54 ("x86: Remove FAST_FEATURE_TESTS")
Suggested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Yonghong Song <yhs@fb.com>
---
 Makefile                          | 1 +
 arch/x86/include/asm/cpufeature.h | 5 +++++
 2 files changed, 6 insertions(+)

Changelog:
 v2 -> v3:
   . Changed macro name from NO_BPF_WORKAROUND to __NO_CLANG_BPF_HACK.
     Explained the solution in more details.
 v1 -> v2:
   . Use NO_BPF_WORKAROUND macro instead of CC_HAVE_ASM_GOTO
     to make it explicit that this is a workaround.

Peter,

Could you take a look at this patch revision and give your opinion?
More and more people hit this issue and have to manually work around it.
It would be good if this can be resolved soon.

Thanks!

diff --git a/Makefile b/Makefile
index 83b6c54..cfd8759 100644
--- a/Makefile
+++ b/Makefile
@@ -504,6 +504,7 @@ export RETPOLINE_CFLAGS
 ifeq ($(call shell-cached,$(CONFIG_SHELL) $(srctree)/scripts/gcc-goto.sh $(CC) $(KBUILD_CFLAGS)), y)
   CC_HAVE_ASM_GOTO := 1
   KBUILD_CFLAGS += -DCC_HAVE_ASM_GOTO
+  KBUILD_CFLAGS += -D__NO_CLANG_BPF_HACK
   KBUILD_AFLAGS += -DCC_HAVE_ASM_GOTO
 endif
 
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index b27da96..42edd5d 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -140,6 +140,8 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
 
 #define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit)
 
+/* this macro is a temporary hack for bpf until clang gains asm-goto support */
+#ifdef __NO_CLANG_BPF_HACK
 /*
  * Static testing of CPU features.  Used the same as boot_cpu_has().
  * These will statically patch the target code for additional
@@ -195,6 +197,9 @@ static __always_inline __pure bool _static_cpu_has(u16 bit)
 		boot_cpu_has(bit) :				\
 		_static_cpu_has(bit)				\
 )
+#else
+#define static_cpu_has(bit)		boot_cpu_has(bit)
+#endif
 
 #define cpu_has_bug(c, bit)		cpu_has(c, (bit))
 #define set_cpu_bug(c, bit)		set_cpu_cap(c, (bit))
-- 
2.9.5

^ permalink raw reply related

* Re: VRF: ICMPV6 Echo Reply failed to egress if ingress pkt Src is IPV6 Global and Dest is IPV6 Link Local.
From: David Ahern @ 2018-05-04  3:32 UTC (permalink / raw)
  To: Sukumar Gopalakrishnan, netdev
In-Reply-To: <CADiZnkTJqH70iwpKmfCKo68-5T+aaq27gNNu76toWL6LAyeEcg@mail.gmail.com>

On 4/30/18 6:58 AM, Sukumar Gopalakrishnan wrote:
> VRF: ICMPV6 Echo Reply failed to egress if ingress pkt Src is IPV6
> Global and Dest is IPV6 Link Local.

...

> if (fl6->flowi6_oif == dev->ifindex) {

try adding ' && !rt6_need_strict(saddr)' to the above.

If it works, add a comment above the line noting this case (link local
dest and global source) submit a patch.


>                 dst = &net->ipv6.ip6_null_entry->dst;
>                 dst_hold(dst);
>                 return dst;
>         }
> ..
> 
> 
> TEMP FIX:
> =========
> 
> Return Ingress vlan device's ifIndex instead of skb->dev's.
> 
> static int icmp6_iif(const struct sk_buff *skb)
> {
>         int iif = IP6CB(skb)->iif; // skb->dev->ifindex;
> 
> ..
> ..
> }
> 

^ permalink raw reply

* Re: possible deadlock in smc_close_non_accepted
From: Hangbin Liu @ 2018-05-04  3:45 UTC (permalink / raw)
  To: syzbot; +Cc: davem, linux-kernel, linux-s390, netdev, syzkaller-bugs, ubraun
In-Reply-To: <001a114218b2f8c0fe05666a2fb6@google.com>

#syz fix: net/smc: simplify wait when closing listen socket

^ permalink raw reply

* Re: [PATCH RFC net-next] net: ipvs: Adjust gso_size for IPPROTO_TCP
From: David Ahern @ 2018-05-04  3:50 UTC (permalink / raw)
  To: Martin KaFai Lau, Julian Anastasov
  Cc: netdev, Tom Herbert, Eric Dumazet, Nikita Shirokov, kernel-team,
	lvs-devel
In-Reply-To: <20180503070114.bcuusvzga45klccs@kafai-mbp>

On 5/3/18 1:01 AM, Martin KaFai Lau wrote:
> On Wed, May 02, 2018 at 10:30:32PM +0300, Julian Anastasov wrote:
>>
>> 	Hello,
>>
>> On Wed, 2 May 2018, Martin KaFai Lau wrote:
>>
>>> On Wed, May 02, 2018 at 09:38:43AM +0300, Julian Anastasov wrote:
>>>>
>>>> - initial traffic for port 21 does not use GSO. But after
>>>> every packet IPVS calls maybe_update_pmtu (rt->dst.ops->update_pmtu)
>>>> to report the reduced MTU. These updates are stored in fnhe_pmtu
>>>> but they do not go to any route, even if we try to get fresh
>>>> output route. Why? Because the local routes are not cached, so
>>>> they can not use the fnhe. This is what my patch for route.c
>>>> will fix. With this fix FTP-DATA gets route with reduced PMTU.
>>> For IPv6, the 'if (rt6->rt6i_flags & RTF_LOCAL)' gate in
>>> __ip6_rt_update_pmtu() may need to be lifted also.
>>
>> 	Probably. I completely forgot the IPv6 part
>> but as I don't know the IPv6 code enough, it may take
>> some time to understand what can be the problem there...
>> I'm not sure whether everything started with commit 0a6b2a1dc2a2,
>> so that in some configurations before that commit things
>> worked and problem was not noticed.
>>
>> 	I think, we should focus on such direction for IPv6:
>>
>> - do we remember per-VIP PMTU for the local routes
> IPv6 used not to create cache route for DST_HOST route which
> is a /128 route (that includes local /128 route).
> 
> Because of this, it had a bug such that a PMTU for the DST_HOST
> route will trigger dst.ops->update_pmtu() which then set
> an expire on the permanent /128 route instead of a cache
> route.  The permanent route got unexpectedly expired/removed
> later.
> 
> The fix was to allow creating /128 cache route as long as
> it is not RTF_LOCAL in 653437d02f1f and 7035870d1219.  The
> first post spelled out the problem better:
> https://patchwork.ozlabs.org/patch/456050/
> 
> Later, when we only create cache route after seeing PMTU
> in 45e4fd26683c, this RTF_LOCAL checking was carried over
> to __ip6_rt_update_pmtu().
> 
> Out of my head, I don't see issue removing the
> RTF_LOCAL check from __ip6_rt_update_pmtu().
> DavidA, what do you think?

I agree; I think it is fine. A route is route. The IPVS use cases with
local redirects are blurring the line with needs for local routes.

^ permalink raw reply

* Re: [net-next PATCH 3/5] udp: Add support for software checksum and GSO_PARTIAL with GSO offload
From: Eric Dumazet @ 2018-05-04  3:50 UTC (permalink / raw)
  To: Alexander Duyck, netdev, willemb, davem, Steffen Klassert
In-Reply-To: <20180504003333.4496.7705.stgit@localhost.localdomain>



On 05/03/2018 05:33 PM, Alexander Duyck wrote:
> From: Alexander Duyck <alexander.h.duyck@intel.com>
> 
> This patch adds support for a software provided checksum and GSO_PARTIAL
> segmentation support. With this we can offload UDP segmentation on devices
> that only have partial support for tunnels.
> 
> Since we are no longer needing the hardware checksum we can drop the checks
> in the segmentation code that were verifying if it was present.
> 
> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
> ---
>  net/ipv4/udp_offload.c |   28 ++++++++++++++++++----------
>  net/ipv6/udp_offload.c |   11 +----------
>  2 files changed, 19 insertions(+), 20 deletions(-)
> 
> diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
> index 946d06d2aa0c..fd94bbb369b2 100644
> --- a/net/ipv4/udp_offload.c
> +++ b/net/ipv4/udp_offload.c
> @@ -217,6 +217,13 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
>  		return segs;
>  	}
>  
> +	/* GSO partial and frag_list segmentation only requires splitting
> +	 * the frame into an MSS multiple and possibly a remainder, both
> +	 * cases return a GSO skb. So update the mss now.
> +	 */
> +	if (skb_is_gso(segs))
> +		mss *= skb_shinfo(segs)->gso_segs;
> +
> 

I do not understand this code.

I am also seeing it in tcp, after commit 07b26c9454a2a ("gso: Support partial splitting at the frag_list pointer")

Presumably this broke tcp_gso_tstamp() , right ?

^ permalink raw reply

* [PATCH 0/8] Assorted rhashtable fixes and cleanups
From: NeilBrown @ 2018-05-04  3:54 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel

This series contains some bugfixes, mostly minor though one
is worthy of a stable backport I think - tagged with Fixes and Cc: stable.

Then there are improvements to walking, which have been discussed
to some degree already.
Finally a code simplification which I think is correct...

Thanks,
NeilBrown

---

NeilBrown (8):
      rhashtable: silence RCU warning in rhashtable_test.
      rhashtable: remove nulls_base and related code.
      rhashtable: use cmpxchg() to protect ->future_tbl.
      rhashtable: fix race in nested_table_alloc()
      rhashtable: remove rhashtable_walk_peek()
      rhashtable: further improve stability of rhashtable_walk
      rhashtable: add rhashtable_walk_prev()
      rhashtable: don't hold lock on first table throughout insertion.


 include/linux/rhashtable.h |   61 +++----------
 lib/rhashtable.c           |  202 +++++++++++++++++++++-----------------------
 lib/test_rhashtable.c      |    8 +-
 3 files changed, 113 insertions(+), 158 deletions(-)

--
Signature

^ permalink raw reply

* [PATCH 1/8] rhashtable: silence RCU warning in rhashtable_test.
From: NeilBrown @ 2018-05-04  3:54 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel
In-Reply-To: <152540595840.18473.11298241115621799037.stgit@noble>

print_ht in rhashtable_test calls rht_dereference() with neither
RCU protection or the mutex.  This triggers an RCU warning.
So take the mutex to silence the warning.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 lib/test_rhashtable.c |    3 +++
 1 file changed, 3 insertions(+)

diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c
index f4000c137dbe..bf92b7aa2a49 100644
--- a/lib/test_rhashtable.c
+++ b/lib/test_rhashtable.c
@@ -499,6 +499,8 @@ static unsigned int __init print_ht(struct rhltable *rhlt)
 	unsigned int i, cnt = 0;
 
 	ht = &rhlt->ht;
+	/* Take the mutex to avoid RCU warning */
+	mutex_lock(&ht->mutex);
 	tbl = rht_dereference(ht->tbl, ht);
 	for (i = 0; i < tbl->size; i++) {
 		struct rhash_head *pos, *next;
@@ -532,6 +534,7 @@ static unsigned int __init print_ht(struct rhltable *rhlt)
 		}
 	}
 	printk(KERN_ERR "\n---- ht: ----%s\n-------------\n", buff);
+	mutex_unlock(&ht->mutex);
 
 	return cnt;
 }

^ permalink raw reply related

* [PATCH 2/8] rhashtable: remove nulls_base and related code.
From: NeilBrown @ 2018-05-04  3:54 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel
In-Reply-To: <152540595840.18473.11298241115621799037.stgit@noble>

This "feature" is unused, undocumented, and untested and so
doesn't really belong.  If a use for the nulls marker
is found, all this code would need to be reviewed to
ensure it works as required.  It would be just as easy to
just add the code if/when it is needed instead.

This patch actually fixes a bug too.  The table resizing allows a
table to grow to 2^31 buckets, but the hash is truncated to 27 bits -
any growth beyond 2^27 is wasteful an ineffective.

This patch result in NULLS_MARKER(0) being used for all chains,
and leave the use of rht_is_a_null() to test for it.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 include/linux/rhashtable.h |   35 +++--------------------------------
 lib/rhashtable.c           |    8 --------
 lib/test_rhashtable.c      |    5 +----
 3 files changed, 4 insertions(+), 44 deletions(-)

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 4e1f535c2034..8822924dd05a 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -29,25 +29,8 @@
 
 /*
  * The end of the chain is marked with a special nulls marks which has
- * the following format:
- *
- * +-------+-----------------------------------------------------+-+
- * | Base  |                      Hash                           |1|
- * +-------+-----------------------------------------------------+-+
- *
- * Base (4 bits) : Reserved to distinguish between multiple tables.
- *                 Specified via &struct rhashtable_params.nulls_base.
- * Hash (27 bits): Full hash (unmasked) of first element added to bucket
- * 1 (1 bit)     : Nulls marker (always set)
- *
- * The remaining bits of the next pointer remain unused for now.
+ * the least significant bit set.
  */
-#define RHT_BASE_BITS		4
-#define RHT_HASH_BITS		27
-#define RHT_BASE_SHIFT		RHT_HASH_BITS
-
-/* Base bits plus 1 bit for nulls marker */
-#define RHT_HASH_RESERVED_SPACE	(RHT_BASE_BITS + 1)
 
 /* Maximum chain length before rehash
  *
@@ -129,7 +112,6 @@ struct rhashtable;
  * @min_size: Minimum size while shrinking
  * @locks_mul: Number of bucket locks to allocate per cpu (default: 32)
  * @automatic_shrinking: Enable automatic shrinking of tables
- * @nulls_base: Base value to generate nulls marker
  * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash)
  * @obj_hashfn: Function to hash object
  * @obj_cmpfn: Function to compare key with object
@@ -143,7 +125,6 @@ struct rhashtable_params {
 	u16			min_size;
 	bool			automatic_shrinking;
 	u8			locks_mul;
-	u32			nulls_base;
 	rht_hashfn_t		hashfn;
 	rht_obj_hashfn_t	obj_hashfn;
 	rht_obj_cmpfn_t		obj_cmpfn;
@@ -210,24 +191,14 @@ struct rhashtable_iter {
 	bool end_of_table;
 };
 
-static inline unsigned long rht_marker(const struct rhashtable *ht, u32 hash)
-{
-	return NULLS_MARKER(ht->p.nulls_base + hash);
-}
-
 #define INIT_RHT_NULLS_HEAD(ptr, ht, hash) \
-	((ptr) = (typeof(ptr)) rht_marker(ht, hash))
+	((ptr) = (typeof(ptr)) NULLS_MARKER(0))
 
 static inline bool rht_is_a_nulls(const struct rhash_head *ptr)
 {
 	return ((unsigned long) ptr & 1);
 }
 
-static inline unsigned long rht_get_nulls_value(const struct rhash_head *ptr)
-{
-	return ((unsigned long) ptr) >> 1;
-}
-
 static inline void *rht_obj(const struct rhashtable *ht,
 			    const struct rhash_head *he)
 {
@@ -237,7 +208,7 @@ static inline void *rht_obj(const struct rhashtable *ht,
 static inline unsigned int rht_bucket_index(const struct bucket_table *tbl,
 					    unsigned int hash)
 {
-	return (hash >> RHT_HASH_RESERVED_SPACE) & (tbl->size - 1);
+	return hash & (tbl->size - 1);
 }
 
 static inline unsigned int rht_key_get_hash(struct rhashtable *ht,
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 9427b5766134..4a3f94e8e8a6 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -994,7 +994,6 @@ static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed)
  *	.key_offset = offsetof(struct test_obj, key),
  *	.key_len = sizeof(int),
  *	.hashfn = jhash,
- *	.nulls_base = (1U << RHT_BASE_SHIFT),
  * };
  *
  * Configuration Example 2: Variable length keys
@@ -1028,9 +1027,6 @@ int rhashtable_init(struct rhashtable *ht,
 	    (params->obj_hashfn && !params->obj_cmpfn))
 		return -EINVAL;
 
-	if (params->nulls_base && params->nulls_base < (1U << RHT_BASE_SHIFT))
-		return -EINVAL;
-
 	memset(ht, 0, sizeof(*ht));
 	mutex_init(&ht->mutex);
 	spin_lock_init(&ht->lock);
@@ -1095,10 +1091,6 @@ int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params)
 {
 	int err;
 
-	/* No rhlist NULLs marking for now. */
-	if (params->nulls_base)
-		return -EINVAL;
-
 	err = rhashtable_init(&hlt->ht, params);
 	hlt->ht.rhlist = true;
 	return err;
diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c
index bf92b7aa2a49..b428a9c7522a 100644
--- a/lib/test_rhashtable.c
+++ b/lib/test_rhashtable.c
@@ -83,7 +83,7 @@ static u32 my_hashfn(const void *data, u32 len, u32 seed)
 {
 	const struct test_obj_rhl *obj = data;
 
-	return (obj->value.id % 10) << RHT_HASH_RESERVED_SPACE;
+	return (obj->value.id % 10);
 }
 
 static int my_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
@@ -99,7 +99,6 @@ static struct rhashtable_params test_rht_params = {
 	.key_offset = offsetof(struct test_obj, value),
 	.key_len = sizeof(struct test_obj_val),
 	.hashfn = jhash,
-	.nulls_base = (3U << RHT_BASE_SHIFT),
 };
 
 static struct rhashtable_params test_rht_params_dup = {
@@ -294,8 +293,6 @@ static int __init test_rhltable(unsigned int entries)
 	if (!obj_in_table)
 		goto out_free;
 
-	/* nulls_base not supported in rhlist interface */
-	test_rht_params.nulls_base = 0;
 	err = rhltable_init(&rhlt, &test_rht_params);
 	if (WARN_ON(err))
 		goto out_free;

^ permalink raw reply related

* [PATCH 3/8] rhashtable: use cmpxchg() to protect ->future_tbl.
From: NeilBrown @ 2018-05-04  3:54 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel
In-Reply-To: <152540595840.18473.11298241115621799037.stgit@noble>

Rather than borrowing one of the bucket locks to
protect ->future_tbl updates, use cmpxchg().
This gives more freedom to change how bucket locking
is implemented.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 lib/rhashtable.c |   17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 4a3f94e8e8a6..b73afe1dec7e 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -298,21 +298,16 @@ static int rhashtable_rehash_attach(struct rhashtable *ht,
 				    struct bucket_table *old_tbl,
 				    struct bucket_table *new_tbl)
 {
-	/* Protect future_tbl using the first bucket lock. */
-	spin_lock_bh(old_tbl->locks);
-
-	/* Did somebody beat us to it? */
-	if (rcu_access_pointer(old_tbl->future_tbl)) {
-		spin_unlock_bh(old_tbl->locks);
-		return -EEXIST;
-	}
-
 	/* Make insertions go into the new, empty table right away. Deletions
 	 * and lookups will be attempted in both tables until we synchronize.
+	 * The use of 'tmp' is simply to ensure we get the required memory
+	 * barriers before the cmpxchg().
 	 */
-	rcu_assign_pointer(old_tbl->future_tbl, new_tbl);
+	struct bucket_table *tmp;
 
-	spin_unlock_bh(old_tbl->locks);
+	rcu_assign_pointer(tmp, new_tbl);
+	if (cmpxchg(&old_tbl->future_tbl, NULL, tmp) != NULL)
+		return -EEXIST;
 
 	return 0;
 }

^ permalink raw reply related

* [PATCH 4/8] rhashtable: fix race in nested_table_alloc()
From: NeilBrown @ 2018-05-04  3:54 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel
In-Reply-To: <152540595840.18473.11298241115621799037.stgit@noble>

If two threads run nested_table_alloc() at the same time
they could both allocate a new table.
Best case is that one of them will never be freed, leaking memory.
Worst case is hat entry get stored there before it leaks,
and the are lost from the table.

So use cmpxchg to detect the race and free the unused table.

Fixes: da20420f83ea ("rhashtable: Add nested tables")
Cc: stable@vger.kernel.org # 4.11+
Signed-off-by: NeilBrown <neilb@suse.com>
---
 lib/rhashtable.c |   10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index b73afe1dec7e..114e6090228a 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -119,6 +119,7 @@ static union nested_table *nested_table_alloc(struct rhashtable *ht,
 					      unsigned int nhash)
 {
 	union nested_table *ntbl;
+	union nested_table *tmp;
 	int i;
 
 	ntbl = rcu_dereference(*prev);
@@ -133,9 +134,12 @@ static union nested_table *nested_table_alloc(struct rhashtable *ht,
 					    (i << shifted) | nhash);
 	}
 
-	rcu_assign_pointer(*prev, ntbl);
-
-	return ntbl;
+	rcu_assign_pointer(tmp, ntbl);
+	if (cmpxchg(prev, NULL, tmp) == NULL)
+		return tmp;
+	/* Raced with another thread. */
+	kfree(ntbl);
+	return rcu_dereference(*prev);
 }
 
 static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht,

^ permalink raw reply related

* [PATCH 5/8] rhashtable: remove rhashtable_walk_peek()
From: NeilBrown @ 2018-05-04  3:54 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel
In-Reply-To: <152540595840.18473.11298241115621799037.stgit@noble>

This function has a somewhat confused behavior that is not properly
described by the documentation.
Sometimes is returns the previous object, sometimes it returns the
next one.
Sometimes it changes the iterator, sometimes it doesn't.

This function is not currently used and is not worth keeping, so
remove it.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 include/linux/rhashtable.h |    1 -
 lib/rhashtable.c           |   34 ----------------------------------
 2 files changed, 35 deletions(-)

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 8822924dd05a..5091abf975a1 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -366,7 +366,6 @@ static inline void rhashtable_walk_start(struct rhashtable_iter *iter)
 }
 
 void *rhashtable_walk_next(struct rhashtable_iter *iter);
-void *rhashtable_walk_peek(struct rhashtable_iter *iter);
 void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU);
 
 void rhashtable_free_and_destroy(struct rhashtable *ht,
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 114e6090228a..83f5d1ebf452 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -897,40 +897,6 @@ void *rhashtable_walk_next(struct rhashtable_iter *iter)
 }
 EXPORT_SYMBOL_GPL(rhashtable_walk_next);
 
-/**
- * rhashtable_walk_peek - Return the next object but don't advance the iterator
- * @iter:	Hash table iterator
- *
- * Returns the next object or NULL when the end of the table is reached.
- *
- * Returns -EAGAIN if resize event occurred.  Note that the iterator
- * will rewind back to the beginning and you may continue to use it.
- */
-void *rhashtable_walk_peek(struct rhashtable_iter *iter)
-{
-	struct rhlist_head *list = iter->list;
-	struct rhashtable *ht = iter->ht;
-	struct rhash_head *p = iter->p;
-
-	if (p)
-		return rht_obj(ht, ht->rhlist ? &list->rhead : p);
-
-	/* No object found in current iter, find next one in the table. */
-
-	if (iter->skip) {
-		/* A nonzero skip value points to the next entry in the table
-		 * beyond that last one that was found. Decrement skip so
-		 * we find the current value. __rhashtable_walk_find_next
-		 * will restore the original value of skip assuming that
-		 * the table hasn't changed.
-		 */
-		iter->skip--;
-	}
-
-	return __rhashtable_walk_find_next(iter);
-}
-EXPORT_SYMBOL_GPL(rhashtable_walk_peek);
-
 /**
  * rhashtable_walk_stop - Finish a hash table walk
  * @iter:	Hash table iterator

^ permalink raw reply related

* [PATCH 6/8] rhashtable: further improve stability of rhashtable_walk
From: NeilBrown @ 2018-05-04  3:54 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel
In-Reply-To: <152540595840.18473.11298241115621799037.stgit@noble>

If the sequence:
   obj = rhashtable_walk_next(iter);
   rhashtable_walk_stop(iter);
   rhashtable_remove_fast(ht, &obj->head, params);
   rhashtable_walk_start(iter);

 races with another thread inserting or removing
 an object on the same hash chain, a subsequent
 rhashtable_walk_next() is not guaranteed to get the "next"
 object. It is possible that an object could be
 repeated, or missed.

 This can be made more reliable by keeping the objects in a hash chain
 sorted by memory address.  A subsequent rhashtable_walk_next()
 call can reliably find the correct position in the list, and thus
 find the 'next' object.

 It is not possible (certainly not so easy) to achieve this with an
 rhltable as keeping the hash chain in order is not so easy.  When the
 first object with a given key is removed, it is replaced in the chain
 with the next object with the same key, and the address of that
 object may not be correctly ordered.
 No current user of rhltable_walk_enter() calls
 rhashtable_walk_start() more than once, so no current code
 could benefit from a more reliable walk of rhltables.

 This patch only attempts to improve walks for rhashtables.
 - a new object is always inserted after the last object with a
   smaller address, or at the start
 - when rhashtable_walk_start() is called, it records that 'p' is not
   'safe', meaning that it cannot be dereferenced.  The revalidation
   that was previously done here is moved to rhashtable_walk_next()
 - when rhashtable_walk_next() is called while p is not NULL and not
   safe, it walks the chain looking for the first object with an
   address greater than p and returns that.  If there is none, it moves
   to the next hash chain.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 include/linux/rhashtable.h |   11 +++++-
 lib/rhashtable.c           |   82 ++++++++++++++++++++++++++++----------------
 2 files changed, 62 insertions(+), 31 deletions(-)

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 5091abf975a1..20684a451cb0 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -188,6 +188,7 @@ struct rhashtable_iter {
 	struct rhashtable_walker walker;
 	unsigned int slot;
 	unsigned int skip;
+	bool p_is_unsafe;
 	bool end_of_table;
 };
 
@@ -737,7 +738,12 @@ static inline void *__rhashtable_insert_fast(
 		    (params.obj_cmpfn ?
 		     params.obj_cmpfn(&arg, rht_obj(ht, head)) :
 		     rhashtable_compare(&arg, rht_obj(ht, head)))) {
-			pprev = &head->next;
+			if (rhlist) {
+				pprev = &head->next;
+			} else {
+				if (head < obj)
+					pprev = &head->next;
+			}
 			continue;
 		}
 
@@ -1233,7 +1239,8 @@ static inline int rhashtable_walk_init(struct rhashtable *ht,
  * Note that if you restart a walk after rhashtable_walk_stop you
  * may see the same object twice.  Also, you may miss objects if
  * there are removals in between rhashtable_walk_stop and the next
- * call to rhashtable_walk_start.
+ * call to rhashtable_walk_start.  Note that this is different to
+ * rhashtable_walk_enter() which misses objects.
  *
  * For a completely stable walk you should construct your own data
  * structure outside the hash table.
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 83f5d1ebf452..038c4156b66a 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -234,6 +234,7 @@ static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
 	struct bucket_table *new_tbl = rhashtable_last_table(ht,
 		rht_dereference_rcu(old_tbl->future_tbl, ht));
 	struct rhash_head __rcu **pprev = rht_bucket_var(old_tbl, old_hash);
+	struct rhash_head __rcu **inspos;
 	int err = -EAGAIN;
 	struct rhash_head *head, *next, *entry;
 	spinlock_t *new_bucket_lock;
@@ -262,12 +263,15 @@ static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
 	new_bucket_lock = rht_bucket_lock(new_tbl, new_hash);
 
 	spin_lock_nested(new_bucket_lock, SINGLE_DEPTH_NESTING);
-	head = rht_dereference_bucket(new_tbl->buckets[new_hash],
-				      new_tbl, new_hash);
-
+	inspos = &new_tbl->buckets[new_hash];
+	head = rht_dereference_bucket(*inspos, new_tbl, new_hash);
+	while (!rht_is_a_nulls(head) && head < entry) {
+		inspos = &head->next;
+		head = rht_dereference_bucket(*inspos, new_tbl, new_hash);
+	}
 	RCU_INIT_POINTER(entry->next, head);
 
-	rcu_assign_pointer(new_tbl->buckets[new_hash], entry);
+	rcu_assign_pointer(*inspos, entry);
 	spin_unlock(new_bucket_lock);
 
 	rcu_assign_pointer(*pprev, next);
@@ -565,6 +569,10 @@ static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht,
 		return ERR_PTR(-ENOMEM);
 
 	head = rht_dereference_bucket(*pprev, tbl, hash);
+	while (!rht_is_a_nulls(head) && head < obj) {
+		pprev = &head->next;
+		head = rht_dereference_bucket(*pprev, tbl, hash);
+	}
 
 	RCU_INIT_POINTER(obj->next, head);
 	if (ht->rhlist) {
@@ -659,10 +667,10 @@ EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
  *
  * This function prepares a hash table walk.
  *
- * Note that if you restart a walk after rhashtable_walk_stop you
- * may see the same object twice.  Also, you may miss objects if
- * there are removals in between rhashtable_walk_stop and the next
- * call to rhashtable_walk_start.
+ * A walk is guaranteed to return every object that was in
+ * the table before this call, and is still in the table when
+ * rhashtable_walk_next() returns NULL.  Duplicates can be
+ * seen, but only if there is a rehash event during the walk.
  *
  * For a completely stable walk you should construct your own data
  * structure outside the hash table.
@@ -746,19 +754,10 @@ int rhashtable_walk_start_check(struct rhashtable_iter *iter)
 
 	if (iter->p && !rhlist) {
 		/*
-		 * We need to validate that 'p' is still in the table, and
-		 * if so, update 'skip'
+		 * 'p' will be revalidated when rhashtable_walk_next()
+		 * is called.
 		 */
-		struct rhash_head *p;
-		int skip = 0;
-		rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
-			skip++;
-			if (p == iter->p) {
-				iter->skip = skip;
-				goto found;
-			}
-		}
-		iter->p = NULL;
+		iter->p_is_unsafe = true;
 	} else if (iter->p && rhlist) {
 		/* Need to validate that 'list' is still in the table, and
 		 * if so, update 'skip' and 'p'.
@@ -875,15 +874,39 @@ void *rhashtable_walk_next(struct rhashtable_iter *iter)
 	bool rhlist = ht->rhlist;
 
 	if (p) {
-		if (!rhlist || !(list = rcu_dereference(list->next))) {
-			p = rcu_dereference(p->next);
-			list = container_of(p, struct rhlist_head, rhead);
-		}
-		if (!rht_is_a_nulls(p)) {
-			iter->skip++;
-			iter->p = p;
-			iter->list = list;
-			return rht_obj(ht, rhlist ? &list->rhead : p);
+		if (!rhlist && iter->p_is_unsafe) {
+			/*
+			 * First time next() was called after start().
+			 * Need to find location of 'p' in the list.
+			 */
+			struct rhash_head *p;
+
+			iter->skip = 0;
+			rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
+				iter->skip++;
+				if (p <= iter->p)
+					continue;
+
+				/* p is the next object after iter->p */
+				iter->p = p;
+				iter->p_is_unsafe = false;
+				return rht_obj(ht, p);
+			}
+			/* There is no "next" object in the list, move
+			 * to next hash chain.
+			 */
+		} else {
+			if (!rhlist || !(list = rcu_dereference(list->next))) {
+				p = rcu_dereference(p->next);
+				list = container_of(p, struct rhlist_head,
+						    rhead);
+			}
+			if (!rht_is_a_nulls(p)) {
+				iter->skip++;
+				iter->p = p;
+				iter->list = list;
+				return rht_obj(ht, rhlist ? &list->rhead : p);
+			}
 		}
 
 		/* At the end of this slot, switch to next one and then find
@@ -893,6 +916,7 @@ void *rhashtable_walk_next(struct rhashtable_iter *iter)
 		iter->slot++;
 	}
 
+	iter->p_is_unsafe = false;
 	return __rhashtable_walk_find_next(iter);
 }
 EXPORT_SYMBOL_GPL(rhashtable_walk_next);

^ permalink raw reply related

* [PATCH 7/8] rhashtable: add rhashtable_walk_prev()
From: NeilBrown @ 2018-05-04  3:54 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel
In-Reply-To: <152540595840.18473.11298241115621799037.stgit@noble>

rhashtable_walk_prev() returns the object returned by
the previous rhashtable_walk_next(), providing it is still in the
table (or was during this grace period).
This works even if rhashtable_walk_stop() and rhashtable_talk_start()
have been called since the last rhashtable_walk_next().

If there have been no calls to rhashtable_walk_next(), or if the
object is gone from the table, then NULL is returned.

This can usefully be used in a seq_file ->start() function.
If the pos is the same as was returned by the last ->next() call,
then rhashtable_walk_prev() can be used to re-establish the
current location in the table.  If it returns NULL, then
rhashtable_walk_next() should be used.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 include/linux/rhashtable.h |    1 +
 lib/rhashtable.c           |   31 +++++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+)

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 20684a451cb0..82d061ff96d6 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -367,6 +367,7 @@ static inline void rhashtable_walk_start(struct rhashtable_iter *iter)
 }
 
 void *rhashtable_walk_next(struct rhashtable_iter *iter);
+void *rhashtable_walk_prev(struct rhashtable_iter *iter);
 void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU);
 
 void rhashtable_free_and_destroy(struct rhashtable *ht,
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 038c4156b66a..d0267e37e7e1 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -921,6 +921,37 @@ void *rhashtable_walk_next(struct rhashtable_iter *iter)
 }
 EXPORT_SYMBOL_GPL(rhashtable_walk_next);
 
+/**
+ * rhashtable_walk_prev - Return the previously returned object, if available
+ * @iter:	Hash table iterator
+ *
+ * If rhashtable_walk_next() has previously been called and the object
+ * it returned is still in the hash table, that object is returned again,
+ * otherwise %NULL is returned.
+ *
+ * If the recent rhashtable_walk_next() call was since the most recent
+ * rhashtable_walk_start() call then the returned object may not, strictly
+ * speaking, still be in the table.  It will be safe to dereference.
+ *
+ * Note that the iterator is not changed and in particular it does not
+ * step backwards.
+ */
+void *rhashtable_walk_prev(struct rhashtable_iter *iter)
+{
+	struct rhashtable *ht = iter->ht;
+	struct rhash_head *p = iter->p;
+
+	if (!p)
+		return NULL;
+	if (!iter->p_is_unsafe || ht->rhlist)
+		return p;
+	rht_for_each_rcu(p, iter->walker.tbl, iter->slot)
+		if (p == iter->p)
+			return p;
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_prev);
+
 /**
  * rhashtable_walk_stop - Finish a hash table walk
  * @iter:	Hash table iterator

^ permalink raw reply related

* [PATCH 8/8] rhashtable: don't hold lock on first table throughout insertion.
From: NeilBrown @ 2018-05-04  3:54 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel
In-Reply-To: <152540595840.18473.11298241115621799037.stgit@noble>

rhashtable_try_insert() currently hold a lock on the bucket in
the first table, while also locking buckets in subsequent tables.
This is unnecessary and looks like a hold-over from some earlier
version of the implementation.

As insert and remove always lock a bucket in each table in turn, and
as insert only inserts in the final table, there cannot be any races
that are not covered by simply locking a bucket in each table in turn.

When an insert call reaches that last table it can be sure that there
is no match entry in any other table as it has searched them all, and
insertion never happens anywhere but in the last table.  The fact that
code tests for the existence of future_tbl while holding a lock on
the relevant bucket ensures that two threads inserting the same key
will make compatible decisions about which is the "last" table.

This simplifies the code and allows the ->rehash field to be
discarded.
We still need a way to ensure that a dead bucket_table is never
re-linked by rhashtable_walk_stop().  This can be achieved by
setting the ->size to 1.  This still allows lookup code to work (and
correctly not find anything) but can never happen on an active bucket
table (as the minimum size is 4).

Signed-off-by: NeilBrown <neilb@suse.com>
---
 include/linux/rhashtable.h |   13 -------------
 lib/rhashtable.c           |   42 ++++++++++--------------------------------
 2 files changed, 10 insertions(+), 45 deletions(-)

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 82d061ff96d6..0529925af41d 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -73,7 +73,6 @@ struct rhlist_head {
 struct bucket_table {
 	unsigned int		size;
 	unsigned int		nest;
-	unsigned int		rehash;
 	u32			hash_rnd;
 	unsigned int		locks_mask;
 	spinlock_t		*locks;
@@ -885,12 +884,6 @@ static inline int rhltable_insert(
  * @obj:	pointer to hash head inside object
  * @params:	hash table parameters
  *
- * Locks down the bucket chain in both the old and new table if a resize
- * is in progress to ensure that writers can't remove from the old table
- * and can't insert to the new table during the atomic operation of search
- * and insertion. Searches for duplicates in both the old and new table if
- * a resize is in progress.
- *
  * This lookup function may only be used for fixed key hash table (key_len
  * parameter set). It will BUG() if used inappropriately.
  *
@@ -946,12 +939,6 @@ static inline void *rhashtable_lookup_get_insert_fast(
  * @obj:	pointer to hash head inside object
  * @params:	hash table parameters
  *
- * Locks down the bucket chain in both the old and new table if a resize
- * is in progress to ensure that writers can't remove from the old table
- * and can't insert to the new table during the atomic operation of search
- * and insertion. Searches for duplicates in both the old and new table if
- * a resize is in progress.
- *
  * Lookups may occur in parallel with hashtable mutations and resizing.
  *
  * Will trigger an automatic deferred table resizing if residency in the
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index d0267e37e7e1..4f7a7423a675 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -293,10 +293,9 @@ static int rhashtable_rehash_chain(struct rhashtable *ht,
 	while (!(err = rhashtable_rehash_one(ht, old_hash)))
 		;
 
-	if (err == -ENOENT) {
-		old_tbl->rehash++;
+	if (err == -ENOENT)
 		err = 0;
-	}
+
 	spin_unlock_bh(old_bucket_lock);
 
 	return err;
@@ -345,6 +344,9 @@ static int rhashtable_rehash_table(struct rhashtable *ht)
 	spin_lock(&ht->lock);
 	list_for_each_entry(walker, &old_tbl->walkers, list)
 		walker->tbl = NULL;
+
+	/* Ensure rhashtable_walk_stop() doesn't relink this table */
+	old_tbl->size = 1;
 	spin_unlock(&ht->lock);
 
 	/* Wait for readers. All new readers will see the new
@@ -597,36 +599,14 @@ static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
 	struct bucket_table *new_tbl;
 	struct bucket_table *tbl;
 	unsigned int hash;
-	spinlock_t *lock;
 	void *data;
 
-	tbl = rcu_dereference(ht->tbl);
-
-	/* All insertions must grab the oldest table containing
-	 * the hashed bucket that is yet to be rehashed.
-	 */
-	for (;;) {
-		hash = rht_head_hashfn(ht, tbl, obj, ht->p);
-		lock = rht_bucket_lock(tbl, hash);
-		spin_lock_bh(lock);
-
-		if (tbl->rehash <= hash)
-			break;
-
-		spin_unlock_bh(lock);
-		tbl = rcu_dereference(tbl->future_tbl);
-	}
-
-	data = rhashtable_lookup_one(ht, tbl, hash, key, obj);
-	new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data);
-	if (PTR_ERR(new_tbl) != -EEXIST)
-		data = ERR_CAST(new_tbl);
+	new_tbl = rcu_dereference(ht->tbl);
 
-	while (!IS_ERR_OR_NULL(new_tbl)) {
+	do {
 		tbl = new_tbl;
 		hash = rht_head_hashfn(ht, tbl, obj, ht->p);
-		spin_lock_nested(rht_bucket_lock(tbl, hash),
-				 SINGLE_DEPTH_NESTING);
+		spin_lock(rht_bucket_lock(tbl, hash));
 
 		data = rhashtable_lookup_one(ht, tbl, hash, key, obj);
 		new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data);
@@ -634,9 +614,7 @@ static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
 			data = ERR_CAST(new_tbl);
 
 		spin_unlock(rht_bucket_lock(tbl, hash));
-	}
-
-	spin_unlock_bh(lock);
+	} while (!IS_ERR_OR_NULL(new_tbl));
 
 	if (PTR_ERR(data) == -EAGAIN)
 		data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?:
@@ -971,7 +949,7 @@ void rhashtable_walk_stop(struct rhashtable_iter *iter)
 	ht = iter->ht;
 
 	spin_lock(&ht->lock);
-	if (tbl->rehash < tbl->size)
+	if (tbl->size > 1)
 		list_add(&iter->walker.list, &tbl->walkers);
 	else
 		iter->walker.tbl = NULL;

^ permalink raw reply related

* Re: [PATCH v2 bpf-next 2/2] bpf: add selftest for stackmap with build_id in NMI context
From: Song Liu @ 2018-05-04  6:41 UTC (permalink / raw)
  To: Tobin C. Harding; +Cc: netdev@vger.kernel.org, Kernel Team, Teng Qin
In-Reply-To: <20180503071902.GP3791@eros>

Thanks Tobin. I will fold these changes in. 

> On May 3, 2018, at 12:19 AM, Tobin C. Harding <tobin@apporbit.com> wrote:
> 
> On Wed, May 02, 2018 at 04:20:30PM -0700, Song Liu wrote:
>> This new test captures stackmap with build_id with hardware event
>> PERF_COUNT_HW_CPU_CYCLES.
>> 
>> Because we only support one ips-to-build_id lookup per cpu in NMI
>> context, stack_amap will not be able to do the lookup in this test.
> 
>         stack_map ?

This one is stack_amap. There are two maps in the test. 

Song

> 
>> Therefore, we didn't do compare_stack_ips(), as it will alwasy fail.
>> 
>> urandom_read.c is extended to run configurable cycles so that it can be
>> caught by the perf event.
>> 
>> Signed-off-by: Song Liu <songliubraving@fb.com>
>> ---
>> tools/testing/selftests/bpf/test_progs.c   | 137 +++++++++++++++++++++++++++++
>> tools/testing/selftests/bpf/urandom_read.c |  10 ++-
>> 2 files changed, 145 insertions(+), 2 deletions(-)
>> 
>> diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
>> index aa336f0..00bb08c 100644
>> --- a/tools/testing/selftests/bpf/test_progs.c
>> +++ b/tools/testing/selftests/bpf/test_progs.c
>> @@ -1272,6 +1272,142 @@ static void test_stacktrace_build_id(void)
>> 	return;
>> }
>> 
>> +static void test_stacktrace_build_id_nmi(void)
>> +{
>> +	int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd;
>> +	const char *file = "./test_stacktrace_build_id.o";
>> +	int err, pmu_fd, prog_fd;
>> +	struct perf_event_attr attr = {
>> +		.sample_freq = 5000,
>> +		.freq = 1,
>> +		.type = PERF_TYPE_HARDWARE,
>> +		.config = PERF_COUNT_HW_CPU_CYCLES,
>> +	};
>> +	__u32 key, previous_key, val, duration = 0;
>> +	struct bpf_object *obj;
>> +	char buf[256];
>> +	int i, j;
>> +	struct bpf_stack_build_id id_offs[PERF_MAX_STACK_DEPTH];
>> +	int build_id_matches = 0;
>> +
>> +	err = bpf_prog_load(file, BPF_PROG_TYPE_PERF_EVENT, &obj, &prog_fd);
>> +	if (CHECK(err, "prog_load", "err %d errno %d\n", err, errno))
>> +		goto out;
> 		    
> perhaps:
> 		return;
> 
>> +	pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */,
>> +			 0 /* cpu 0 */, -1 /* group id */,
>> +			 0 /* flags */);
>> +	if (CHECK(pmu_fd < 0, "perf_event_open",
>> +		  "err %d errno %d. Does the test host support PERF_COUNT_HW_CPU_CYCLES?\n",
>> +		  pmu_fd, errno))
>> +		goto close_prog;
>> +
>> +	err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
>> +	if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n",
>> +		  err, errno))
>> +		goto close_pmu;
>> +
>> +	err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
>> +	if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n",
>> +		  err, errno))
>> +		goto disable_pmu;
>> +
>> +	/* find map fds */
>> +	control_map_fd = bpf_find_map(__func__, obj, "control_map");
>> +	if (CHECK(control_map_fd < 0, "bpf_find_map control_map",
>> +		  "err %d errno %d\n", err, errno))
>> +		goto disable_pmu;
>> +
>> +	stackid_hmap_fd = bpf_find_map(__func__, obj, "stackid_hmap");
>> +	if (CHECK(stackid_hmap_fd < 0, "bpf_find_map stackid_hmap",
>> +		  "err %d errno %d\n", err, errno))
>> +		goto disable_pmu;
>> +
>> +	stackmap_fd = bpf_find_map(__func__, obj, "stackmap");
>> +	if (CHECK(stackmap_fd < 0, "bpf_find_map stackmap", "err %d errno %d\n",
>> +		  err, errno))
>> +		goto disable_pmu;
>> +
>> +	stack_amap_fd = bpf_find_map(__func__, obj, "stack_amap");
>> +	if (CHECK(stack_amap_fd < 0, "bpf_find_map stack_amap",
>> +		  "err %d errno %d\n", err, errno))
>> +		goto disable_pmu;
>> +
>> +	assert(system("dd if=/dev/urandom of=/dev/zero count=4 2> /dev/null")
>> +	       == 0);
>> +	assert(system("taskset 0x1 ./urandom_read 100000") == 0);
>> +	/* disable stack trace collection */
>> +	key = 0;
>> +	val = 1;
>> +	bpf_map_update_elem(control_map_fd, &key, &val, 0);
>> +
>> +	/* for every element in stackid_hmap, we can find a corresponding one
>> +	 * in stackmap, and vise versa.
>> +	 */
>> +	err = compare_map_keys(stackid_hmap_fd, stackmap_fd);
>> +	if (CHECK(err, "compare_map_keys stackid_hmap vs. stackmap",
>> +		  "err %d errno %d\n", err, errno))
>> +		goto disable_pmu;
>> +
>> +	err = compare_map_keys(stackmap_fd, stackid_hmap_fd);
>> +	if (CHECK(err, "compare_map_keys stackmap vs. stackid_hmap",
>> +		  "err %d errno %d\n", err, errno))
>> +		goto disable_pmu;
>> +
>> +	err = extract_build_id(buf, 256);
>> +
>> +	if (CHECK(err, "get build_id with readelf",
>> +		  "err %d errno %d\n", err, errno))
>> +		goto disable_pmu;
>> +
>> +	err = bpf_map_get_next_key(stackmap_fd, NULL, &key);
>> +	if (CHECK(err, "get_next_key from stackmap",
>> +		  "err %d, errno %d\n", err, errno))
>> +		goto disable_pmu;
>> +
>> +	do {
>> +		char build_id[64];
>> +
>> +		err = bpf_map_lookup_elem(stackmap_fd, &key, id_offs);
>> +		if (CHECK(err, "lookup_elem from stackmap",
>> +			  "err %d, errno %d\n", err, errno))
>> +			goto disable_pmu;
>> +		for (i = 0; i < PERF_MAX_STACK_DEPTH; ++i)
>> +			if (id_offs[i].status == BPF_STACK_BUILD_ID_VALID &&
>> +			    id_offs[i].offset != 0) {
>> +				for (j = 0; j < 20; ++j)
>> +					sprintf(build_id + 2 * j, "%02x",
>> +						id_offs[i].build_id[j] & 0xff);
>> +				if (strstr(buf, build_id) != NULL)
>> +					build_id_matches = 1;
>> +			}
>> +		previous_key = key;
>> +	} while (bpf_map_get_next_key(stackmap_fd, &previous_key, &key) == 0);
>> +
>> +	if (CHECK(build_id_matches < 1, "build id match",
>> +		  "Didn't find expected build ID from the map\n"))
>> +		goto disable_pmu;
>> +
>> +	/*
>> +	 * We intentionally skip compare_stack_ips(). This is because we
>> +	 * only support one in_nmi() ips-to-build_id translation per cpu
>> +	 * at any time, thus stack_amap here will always fallback to
>> +	 * BPF_STACK_BUILD_ID_IP;
>> +	 */
>> +
>> +disable_pmu:
>> +	ioctl(pmu_fd, PERF_EVENT_IOC_DISABLE);
>> +
>> +close_pmu:
>> +	close(pmu_fd);
>> +
>> +close_prog:
>> +	bpf_object__close(obj);
>> +
>> +out:
>> +	return;
>> +}
> 
> No real need for label 'out' right?  We can just return directly and
> remove the last three lines of this function.
> 
> Hope this helps,
> Tobin.

^ permalink raw reply

* Re: DSA switch
From: Ran Shalit @ 2018-05-04  6:59 UTC (permalink / raw)
  To: Andrew Lunn; +Cc: netdev
In-Reply-To: <20180503210545.GJ17027@lunn.ch>

On Fri, May 4, 2018 at 12:05 AM, Andrew Lunn <andrew@lunn.ch> wrote:
>> I am using kernel 2.6.37, but I think it is not kernel issue, but more
>> bad patches done on kernel.
>> It is based on TI's kernel, but with some custom modifications on
>> driver's switch, to make it work with TI's cpsw switch.
>> Seems like someone made some bad patch, I'll continue investigating it.
>> You can ignore the question...
>>
>> Many thanks a lot for the help,
>> Ran
>
> There is no DSA driver for the cpsw. Are you just using the cpsw to
> pass frames to a switch which is supported by DSA?
>
> In theory, mainline CPSW should just work for passing frames to an
> external switch. So why not just use mainline?
>

It seems that the bridge functions OK,
so I rather keep on working with it, instead of doing too many
dramatically changes in the custom kernel of TI's which works with our
chip (dm8148).

Yet, I would like to ask about the bridge:
Can a bridge also be used with dsa switch when ports are connected to
different subnets ?

Regards,
Ran

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox