Netdev List

Netdev List
 help / color / mirror / Atom feed

* [bpf-next v3 7/9] net/ipv6: Add fib lookup stubs for use in bpf helper
From: David Ahern @ 2018-05-10  3:34 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180510033427.20756-1-dsahern@gmail.com>

Add a stub to retrieve a handle to an IPv6 FIB table, fib6_get_table,
a stub to do a lookup in a specific table, fib6_table_lookup, a stub
for a full route lookup, fib6_lookup, and a stub for multipath path
selection, fib6_multipath_select.

The stubs are needed for core bpf code to handle the case when the
IPv6 module is not builtin.

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
---
 include/net/addrconf.h   | 14 ++++++++++++++
 net/ipv6/addrconf_core.c | 33 ++++++++++++++++++++++++++++++++-
 net/ipv6/af_inet6.c      |  6 +++++-
 3 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 8312cc25a3af..ff766ab207e0 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -223,6 +223,20 @@ struct ipv6_stub {
 				 const struct in6_addr *addr);
 	int (*ipv6_dst_lookup)(struct net *net, struct sock *sk,
 			       struct dst_entry **dst, struct flowi6 *fl6);
+
+	struct fib6_table *(*fib6_get_table)(struct net *net, u32 id);
+	struct fib6_info *(*fib6_lookup)(struct net *net, int oif,
+					 struct flowi6 *fl6, int flags);
+	struct fib6_info *(*fib6_table_lookup)(struct net *net,
+					      struct fib6_table *table,
+					      int oif, struct flowi6 *fl6,
+					      int flags);
+	struct fib6_info *(*fib6_multipath_select)(const struct net *net,
+						   struct fib6_info *f6i,
+						   struct flowi6 *fl6, int oif,
+						   const struct sk_buff *skb,
+						   int strict);
+
 	void (*udpv6_encap_enable)(void);
 	void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr,
 			      const struct in6_addr *solicited_addr,
diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c
index 32b564dfd02a..2fe754fd4f5e 100644
--- a/net/ipv6/addrconf_core.c
+++ b/net/ipv6/addrconf_core.c
@@ -134,8 +134,39 @@ static int eafnosupport_ipv6_dst_lookup(struct net *net, struct sock *u1,
 	return -EAFNOSUPPORT;
 }
 
+static struct fib6_table *eafnosupport_fib6_get_table(struct net *net, u32 id)
+{
+	return NULL;
+}
+
+static struct fib6_info *
+eafnosupport_fib6_table_lookup(struct net *net, struct fib6_table *table,
+			       int oif, struct flowi6 *fl6, int flags)
+{
+	return NULL;
+}
+
+static struct fib6_info *
+eafnosupport_fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+			 int flags)
+{
+	return NULL;
+}
+
+static struct fib6_info *
+eafnosupport_fib6_multipath_select(const struct net *net, struct fib6_info *f6i,
+				   struct flowi6 *fl6, int oif,
+				   const struct sk_buff *skb, int strict)
+{
+	return f6i;
+}
+
 const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
-	.ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup,
+	.ipv6_dst_lookup   = eafnosupport_ipv6_dst_lookup,
+	.fib6_get_table    = eafnosupport_fib6_get_table,
+	.fib6_table_lookup = eafnosupport_fib6_table_lookup,
+	.fib6_lookup       = eafnosupport_fib6_lookup,
+	.fib6_multipath_select = eafnosupport_fib6_multipath_select,
 };
 EXPORT_SYMBOL_GPL(ipv6_stub);
 
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index d0af96e0d109..50de8b0d4f70 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -889,7 +889,11 @@ static struct pernet_operations inet6_net_ops = {
 static const struct ipv6_stub ipv6_stub_impl = {
 	.ipv6_sock_mc_join = ipv6_sock_mc_join,
 	.ipv6_sock_mc_drop = ipv6_sock_mc_drop,
-	.ipv6_dst_lookup = ip6_dst_lookup,
+	.ipv6_dst_lookup   = ip6_dst_lookup,
+	.fib6_get_table	   = fib6_get_table,
+	.fib6_table_lookup = fib6_table_lookup,
+	.fib6_lookup       = fib6_lookup,
+	.fib6_multipath_select = fib6_multipath_select,
 	.udpv6_encap_enable = udpv6_encap_enable,
 	.ndisc_send_na = ndisc_send_na,
 	.nd_tbl	= &nd_tbl,
-- 
2.11.0

^ permalink raw reply related

* [bpf-next v3 6/9] net/ipv6: Update fib6 tracepoint to take fib6_info
From: David Ahern @ 2018-05-10  3:34 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180510033427.20756-1-dsahern@gmail.com>

Similar to IPv4, IPv6 should use the FIB lookup result in the
tracepoint.

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/fib6.h | 14 +++++++-------
 net/ipv6/route.c            | 14 ++++++--------
 2 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/include/trace/events/fib6.h b/include/trace/events/fib6.h
index 7e8d48a81b91..1b8d951e3c12 100644
--- a/include/trace/events/fib6.h
+++ b/include/trace/events/fib6.h
@@ -12,10 +12,10 @@
 
 TRACE_EVENT(fib6_table_lookup,
 
-	TP_PROTO(const struct net *net, const struct rt6_info *rt,
+	TP_PROTO(const struct net *net, const struct fib6_info *f6i,
 		 struct fib6_table *table, const struct flowi6 *flp),
 
-	TP_ARGS(net, rt, table, flp),
+	TP_ARGS(net, f6i, table, flp),
 
 	TP_STRUCT__entry(
 		__field(	u32,	tb_id		)
@@ -48,20 +48,20 @@ TRACE_EVENT(fib6_table_lookup,
 		in6 = (struct in6_addr *)__entry->dst;
 		*in6 = flp->daddr;
 
-		if (rt->rt6i_idev) {
-			__assign_str(name, rt->rt6i_idev->dev->name);
+		if (f6i->fib6_nh.nh_dev) {
+			__assign_str(name, f6i->fib6_nh.nh_dev);
 		} else {
 			__assign_str(name, "");
 		}
-		if (rt == net->ipv6.ip6_null_entry) {
+		if (f6i == net->ipv6.fib6_null_entry) {
 			struct in6_addr in6_zero = {};
 
 			in6 = (struct in6_addr *)__entry->gw;
 			*in6 = in6_zero;
 
-		} else if (rt) {
+		} else if (f6i) {
 			in6 = (struct in6_addr *)__entry->gw;
-			*in6 = rt->rt6i_gateway;
+			*in6 = f6i->fib6_nh.nh_gw;
 		}
 	),
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 019d8ba9021e..73f9c29a5878 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1078,6 +1078,8 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 			goto restart;
 	}
 
+	trace_fib6_table_lookup(net, f6i, table, fl6);
+
 	/* Search through exception table */
 	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
 	if (rt) {
@@ -1096,8 +1098,6 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 
 	rcu_read_unlock();
 
-	trace_fib6_table_lookup(net, rt, table, fl6);
-
 	return rt;
 }
 
@@ -1827,6 +1827,8 @@ struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
 		}
 	}
 
+	trace_fib6_table_lookup(net, f6i, table, fl6);
+
 	return f6i;
 }
 
@@ -1853,7 +1855,6 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 		rt = net->ipv6.ip6_null_entry;
 		rcu_read_unlock();
 		dst_hold(&rt->dst);
-		trace_fib6_table_lookup(net, rt, table, fl6);
 		return rt;
 	}
 
@@ -1864,7 +1865,6 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 			dst_use_noref(&rt->dst, jiffies);
 
 		rcu_read_unlock();
-		trace_fib6_table_lookup(net, rt, table, fl6);
 		return rt;
 	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
 			    !(f6i->fib6_flags & RTF_GATEWAY))) {
@@ -1890,9 +1890,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 			dst_hold(&uncached_rt->dst);
 		}
 
-		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
 		return uncached_rt;
-
 	} else {
 		/* Get a percpu copy */
 
@@ -1906,7 +1904,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 
 		local_bh_enable();
 		rcu_read_unlock();
-		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
+
 		return pcpu_rt;
 	}
 }
@@ -2491,7 +2489,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
 
 	rcu_read_unlock();
 
-	trace_fib6_table_lookup(net, ret, table, fl6);
+	trace_fib6_table_lookup(net, rt, table, fl6);
 	return ret;
 };
 
-- 
2.11.0

^ permalink raw reply related

* [bpf-next v3 5/9] net/ipv6: Add fib6_lookup
From: David Ahern @ 2018-05-10  3:34 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180510033427.20756-1-dsahern@gmail.com>

Add IPv6 equivalent to fib_lookup. Does a fib lookup, including rules,
but returns a FIB entry, fib6_info, rather than a dst based rt6_info.
fib6_lookup is anywhere from 140% (MULTIPLE_TABLES config disabled)
to 60% faster than any of the dst based lookup methods (without custom
rules) and 25% faster with custom rules (e.g., l3mdev rule).

Since the lookup function has a completely different signature,
fib6_rule_action is split into 2 paths: the existing one is
renamed __fib6_rule_action and a new one for the fib6_info path
is added. fib6_rule_action decides which to call based on the
lookup_ptr. If it is fib6_table_lookup then the new path is taken.

Caller must hold rcu lock as no reference is taken on the returned
fib entry.

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h |  6 ++++
 net/ipv6/fib6_rules.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++++--
 net/ipv6/ip6_fib.c    |  7 +++++
 3 files changed, 97 insertions(+), 2 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index c70705f2647a..cc70f6da8462 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -376,6 +376,12 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 				   const struct sk_buff *skb,
 				   int flags, pol_lookup_t lookup);
 
+/* called with rcu lock held; can return error pointer
+ * caller needs to select path
+ */
+struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+			      int flags);
+
 /* called with rcu lock held; caller needs to select path */
 struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
 				    int oif, struct flowi6 *fl6, int strict);
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index d040c4bff3a0..f590446595d8 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -60,6 +60,39 @@ unsigned int fib6_rules_seq_read(struct net *net)
 	return fib_rules_seq_read(net, AF_INET6);
 }
 
+/* called with rcu lock held; no reference taken on fib6_info */
+struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+			      int flags)
+{
+	struct fib6_info *f6i;
+	int err;
+
+	if (net->ipv6.fib6_has_custom_rules) {
+		struct fib_lookup_arg arg = {
+			.lookup_ptr = fib6_table_lookup,
+			.lookup_data = &oif,
+			.flags = FIB_LOOKUP_NOREF,
+		};
+
+		l3mdev_update_flow(net, flowi6_to_flowi(fl6));
+
+		err = fib_rules_lookup(net->ipv6.fib6_rules_ops,
+				       flowi6_to_flowi(fl6), flags, &arg);
+		if (err)
+			return ERR_PTR(err);
+
+		f6i = arg.result ? : net->ipv6.fib6_null_entry;
+	} else {
+		f6i = fib6_table_lookup(net, net->ipv6.fib6_local_tbl,
+					oif, fl6, flags);
+		if (!f6i || f6i == net->ipv6.fib6_null_entry)
+			f6i = fib6_table_lookup(net, net->ipv6.fib6_main_tbl,
+						oif, fl6, flags);
+	}
+
+	return f6i;
+}
+
 struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 				   const struct sk_buff *skb,
 				   int flags, pol_lookup_t lookup)
@@ -121,8 +154,48 @@ static int fib6_rule_saddr(struct net *net, struct fib_rule *rule, int flags,
 	return 0;
 }
 
-static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
-			    int flags, struct fib_lookup_arg *arg)
+static int fib6_rule_action_alt(struct fib_rule *rule, struct flowi *flp,
+				int flags, struct fib_lookup_arg *arg)
+{
+	struct flowi6 *flp6 = &flp->u.ip6;
+	struct net *net = rule->fr_net;
+	struct fib6_table *table;
+	struct fib6_info *f6i;
+	int err = -EAGAIN, *oif;
+	u32 tb_id;
+
+	switch (rule->action) {
+	case FR_ACT_TO_TBL:
+		break;
+	case FR_ACT_UNREACHABLE:
+		return -ENETUNREACH;
+	case FR_ACT_PROHIBIT:
+		return -EACCES;
+	case FR_ACT_BLACKHOLE:
+	default:
+		return -EINVAL;
+	}
+
+	tb_id = fib_rule_get_table(rule, arg);
+	table = fib6_get_table(net, tb_id);
+	if (!table)
+		return -EAGAIN;
+
+	oif = (int *)arg->lookup_data;
+	f6i = fib6_table_lookup(net, table, *oif, flp6, flags);
+	if (f6i != net->ipv6.fib6_null_entry) {
+		err = fib6_rule_saddr(net, rule, flags, flp6,
+				      fib6_info_nh_dev(f6i));
+
+		if (likely(!err))
+			arg->result = f6i;
+	}
+
+	return err;
+}
+
+static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
+			      int flags, struct fib_lookup_arg *arg)
 {
 	struct flowi6 *flp6 = &flp->u.ip6;
 	struct rt6_info *rt = NULL;
@@ -182,6 +255,15 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
 	return err;
 }
 
+static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
+			    int flags, struct fib_lookup_arg *arg)
+{
+	if (arg->lookup_ptr == fib6_table_lookup)
+		return fib6_rule_action_alt(rule, flp, flags, arg);
+
+	return __fib6_rule_action(rule, flp, flags, arg);
+}
+
 static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
 {
 	struct rt6_info *rt = (struct rt6_info *) arg->result;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 487faffeae28..d1dc6017f5a6 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -354,6 +354,13 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 	return &rt->dst;
 }
 
+/* called with rcu lock held; no reference taken on fib6_info */
+struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+			      int flags)
+{
+	return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, flags);
+}
+
 static void __net_init fib6_tables_init(struct net *net)
 {
 	fib6_link_table(net, net->ipv6.fib6_main_tbl);
-- 
2.11.0

^ permalink raw reply related

* [bpf-next v3 4/9] net/ipv6: Refactor fib6_rule_action
From: David Ahern @ 2018-05-10  3:34 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180510033427.20756-1-dsahern@gmail.com>

Move source address lookup from fib6_rule_action to a helper. It will be
used in a later patch by a second variant for fib6_rule_action.

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/fib6_rules.c | 52 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 31 insertions(+), 21 deletions(-)

diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index 6547fc6491a6..d040c4bff3a0 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -96,6 +96,31 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 	return &net->ipv6.ip6_null_entry->dst;
 }
 
+static int fib6_rule_saddr(struct net *net, struct fib_rule *rule, int flags,
+			   struct flowi6 *flp6, const struct net_device *dev)
+{
+	struct fib6_rule *r = (struct fib6_rule *)rule;
+
+	/* If we need to find a source address for this traffic,
+	 * we check the result if it meets requirement of the rule.
+	 */
+	if ((rule->flags & FIB_RULE_FIND_SADDR) &&
+	    r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) {
+		struct in6_addr saddr;
+
+		if (ipv6_dev_get_saddr(net, dev, &flp6->daddr,
+				       rt6_flags2srcprefs(flags), &saddr))
+			return -EAGAIN;
+
+		if (!ipv6_prefix_equal(&saddr, &r->src.addr, r->src.plen))
+			return -EAGAIN;
+
+		flp6->saddr = saddr;
+	}
+
+	return 0;
+}
+
 static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
 			    int flags, struct fib_lookup_arg *arg)
 {
@@ -134,27 +159,12 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
 
 	rt = lookup(net, table, flp6, arg->lookup_data, flags);
 	if (rt != net->ipv6.ip6_null_entry) {
-		struct fib6_rule *r = (struct fib6_rule *)rule;
-
-		/*
-		 * If we need to find a source address for this traffic,
-		 * we check the result if it meets requirement of the rule.
-		 */
-		if ((rule->flags & FIB_RULE_FIND_SADDR) &&
-		    r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) {
-			struct in6_addr saddr;
-
-			if (ipv6_dev_get_saddr(net,
-					       ip6_dst_idev(&rt->dst)->dev,
-					       &flp6->daddr,
-					       rt6_flags2srcprefs(flags),
-					       &saddr))
-				goto again;
-			if (!ipv6_prefix_equal(&saddr, &r->src.addr,
-					       r->src.plen))
-				goto again;
-			flp6->saddr = saddr;
-		}
+		err = fib6_rule_saddr(net, rule, flags, flp6,
+				      ip6_dst_idev(&rt->dst)->dev);
+
+		if (err == -EAGAIN)
+			goto again;
+
 		err = rt->dst.error;
 		if (err != -EAGAIN)
 			goto out;
-- 
2.11.0

^ permalink raw reply related

* [bpf-next v3 3/9] net/ipv6: Extract table lookup from ip6_pol_route
From: David Ahern @ 2018-05-10  3:34 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180510033427.20756-1-dsahern@gmail.com>

ip6_pol_route is used for ingress and egress FIB lookups. Refactor it
moving the table lookup into a separate fib6_table_lookup that can be
invoked separately and export the new function.

ip6_pol_route now calls fib6_table_lookup and uses the result to generate
a dst based rt6_info.

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h |  4 ++++
 net/ipv6/route.c      | 39 +++++++++++++++++++++++++--------------
 2 files changed, 29 insertions(+), 14 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 2597d8fdd92f..c70705f2647a 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -376,6 +376,10 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 				   const struct sk_buff *skb,
 				   int flags, pol_lookup_t lookup);
 
+/* called with rcu lock held; caller needs to select path */
+struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
+				    int oif, struct flowi6 *fl6, int strict);
+
 struct fib6_info *fib6_multipath_select(const struct net *net,
 					struct fib6_info *match,
 					struct flowi6 *fl6, int oif,
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 6a10608d9025..019d8ba9021e 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1800,21 +1800,12 @@ void rt6_age_exceptions(struct fib6_info *rt,
 	rcu_read_unlock_bh();
 }
 
-struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
-			       int oif, struct flowi6 *fl6,
-			       const struct sk_buff *skb, int flags)
+/* must be called with rcu lock held */
+struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
+				    int oif, struct flowi6 *fl6, int strict)
 {
 	struct fib6_node *fn, *saved_fn;
 	struct fib6_info *f6i;
-	struct rt6_info *rt;
-	int strict = 0;
-
-	strict |= flags & RT6_LOOKUP_F_IFACE;
-	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
-	if (net->ipv6.devconf_all->forwarding == 0)
-		strict |= RT6_LOOKUP_F_REACHABLE;
-
-	rcu_read_lock();
 
 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 	saved_fn = fn;
@@ -1824,8 +1815,6 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 
 redo_rt6_select:
 	f6i = rt6_select(net, fn, oif, strict);
-	if (f6i->fib6_nsiblings)
-		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
 	if (f6i == net->ipv6.fib6_null_entry) {
 		fn = fib6_backtrack(fn, &fl6->saddr);
 		if (fn)
@@ -1838,6 +1827,28 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 		}
 	}
 
+	return f6i;
+}
+
+struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
+			       int oif, struct flowi6 *fl6,
+			       const struct sk_buff *skb, int flags)
+{
+	struct fib6_info *f6i;
+	struct rt6_info *rt;
+	int strict = 0;
+
+	strict |= flags & RT6_LOOKUP_F_IFACE;
+	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
+	if (net->ipv6.devconf_all->forwarding == 0)
+		strict |= RT6_LOOKUP_F_REACHABLE;
+
+	rcu_read_lock();
+
+	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
+	if (f6i->fib6_nsiblings)
+		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
+
 	if (f6i == net->ipv6.fib6_null_entry) {
 		rt = net->ipv6.ip6_null_entry;
 		rcu_read_unlock();
-- 
2.11.0

^ permalink raw reply related

* [bpf-next v3 2/9] net/ipv6: Rename rt6_multipath_select
From: David Ahern @ 2018-05-10  3:34 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180510033427.20756-1-dsahern@gmail.com>

Rename rt6_multipath_select to fib6_multipath_select and export it.
A later patch wants access to it similar to IPv4's fib_select_path.

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h |  5 +++++
 net/ipv6/route.c      | 17 +++++++++--------
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 43ab545e64ea..2597d8fdd92f 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -376,6 +376,11 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 				   const struct sk_buff *skb,
 				   int flags, pol_lookup_t lookup);
 
+struct fib6_info *fib6_multipath_select(const struct net *net,
+					struct fib6_info *match,
+					struct flowi6 *fl6, int oif,
+					const struct sk_buff *skb, int strict);
+
 struct fib6_node *fib6_node_lookup(struct fib6_node *root,
 				   const struct in6_addr *daddr,
 				   const struct in6_addr *saddr);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 443d2a0bc150..6a10608d9025 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -419,11 +419,11 @@ static bool rt6_check_expired(const struct rt6_info *rt)
 	return false;
 }
 
-static struct fib6_info *rt6_multipath_select(const struct net *net,
-					      struct fib6_info *match,
-					     struct flowi6 *fl6, int oif,
-					     const struct sk_buff *skb,
-					     int strict)
+struct fib6_info *fib6_multipath_select(const struct net *net,
+					struct fib6_info *match,
+					struct flowi6 *fl6, int oif,
+					const struct sk_buff *skb,
+					int strict)
 {
 	struct fib6_info *sibling, *next_sibling;
 
@@ -1068,8 +1068,9 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 		f6i = rt6_device_match(net, f6i, &fl6->saddr,
 				      fl6->flowi6_oif, flags);
 		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
-			f6i = rt6_multipath_select(net, f6i, fl6,
-						   fl6->flowi6_oif, skb, flags);
+			f6i = fib6_multipath_select(net, f6i, fl6,
+						    fl6->flowi6_oif, skb,
+						    flags);
 	}
 	if (f6i == net->ipv6.fib6_null_entry) {
 		fn = fib6_backtrack(fn, &fl6->saddr);
@@ -1824,7 +1825,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 redo_rt6_select:
 	f6i = rt6_select(net, fn, oif, strict);
 	if (f6i->fib6_nsiblings)
-		f6i = rt6_multipath_select(net, f6i, fl6, oif, skb, strict);
+		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
 	if (f6i == net->ipv6.fib6_null_entry) {
 		fn = fib6_backtrack(fn, &fl6->saddr);
 		if (fn)
-- 
2.11.0

^ permalink raw reply related

* [bpf-next v3 1/9] net/ipv6: Rename fib6_lookup to fib6_node_lookup
From: David Ahern @ 2018-05-10  3:34 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180510033427.20756-1-dsahern@gmail.com>

Rename fib6_lookup to fib6_node_lookup to better reflect what it
returns. The fib6_lookup name will be used in a later patch for
an IPv6 equivalent to IPv4's fib_lookup.

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h |  6 +++---
 net/ipv6/ip6_fib.c    | 14 ++++++++------
 net/ipv6/route.c      |  8 ++++----
 3 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index a3ec08d05756..43ab545e64ea 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -376,9 +376,9 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 				   const struct sk_buff *skb,
 				   int flags, pol_lookup_t lookup);
 
-struct fib6_node *fib6_lookup(struct fib6_node *root,
-			      const struct in6_addr *daddr,
-			      const struct in6_addr *saddr);
+struct fib6_node *fib6_node_lookup(struct fib6_node *root,
+				   const struct in6_addr *daddr,
+				   const struct in6_addr *saddr);
 
 struct fib6_node *fib6_locate(struct fib6_node *root,
 			      const struct in6_addr *daddr, int dst_len,
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index f0a4262a4789..487faffeae28 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1354,8 +1354,8 @@ struct lookup_args {
 	const struct in6_addr	*addr;		/* search key			*/
 };
 
-static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
-				       struct lookup_args *args)
+static struct fib6_node *fib6_node_lookup_1(struct fib6_node *root,
+					    struct lookup_args *args)
 {
 	struct fib6_node *fn;
 	__be32 dir;
@@ -1400,7 +1400,8 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
 #ifdef CONFIG_IPV6_SUBTREES
 				if (subtree) {
 					struct fib6_node *sfn;
-					sfn = fib6_lookup_1(subtree, args + 1);
+					sfn = fib6_node_lookup_1(subtree,
+								 args + 1);
 					if (!sfn)
 						goto backtrack;
 					fn = sfn;
@@ -1422,8 +1423,9 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
 
 /* called with rcu_read_lock() held
  */
-struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *daddr,
-			      const struct in6_addr *saddr)
+struct fib6_node *fib6_node_lookup(struct fib6_node *root,
+				   const struct in6_addr *daddr,
+				   const struct in6_addr *saddr)
 {
 	struct fib6_node *fn;
 	struct lookup_args args[] = {
@@ -1442,7 +1444,7 @@ struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *dad
 		}
 	};
 
-	fn = fib6_lookup_1(root, daddr ? args : args + 1);
+	fn = fib6_node_lookup_1(root, daddr ? args : args + 1);
 	if (!fn || fn->fn_flags & RTN_TL_ROOT)
 		fn = root;
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index daa3662da0ee..443d2a0bc150 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1006,7 +1006,7 @@ static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
 		pn = rcu_dereference(fn->parent);
 		sn = FIB6_SUBTREE(pn);
 		if (sn && sn != fn)
-			fn = fib6_lookup(sn, NULL, saddr);
+			fn = fib6_node_lookup(sn, NULL, saddr);
 		else
 			fn = pn;
 		if (fn->fn_flags & RTN_RTINFO)
@@ -1059,7 +1059,7 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 		flags &= ~RT6_LOOKUP_F_IFACE;
 
 	rcu_read_lock();
-	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 restart:
 	f6i = rcu_dereference(fn->leaf);
 	if (!f6i) {
@@ -1815,7 +1815,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 
 	rcu_read_lock();
 
-	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 	saved_fn = fn;
 
 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
@@ -2425,7 +2425,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
 	 */
 
 	rcu_read_lock();
-	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 restart:
 	for_each_fib6_node_rt_rcu(fn) {
 		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
-- 
2.11.0

^ permalink raw reply related

* [bpf-next v3 0/9] bpf: Add helper to do FIB lookups
From: David Ahern @ 2018-05-10  3:34 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern

Provide a helper for doing a FIB and neighbor lookup in the kernel
tables from an XDP program. The helper provides a fastpath for forwarding
packets. If the packet is a local delivery or for any reason is not a
simple lookup and forward, the packet is expected to continue up the stack
for full processing.

The response from a FIB and neighbor lookup is either the egress index
with the bpf_fib_lookup struct filled in with dmac and gateway or
0 meaning the packet should continue up the stack. In time we can
revisit this to return the FIB lookup result errno if it is one of the
special RTN_'s such as RTN_BLACKHOLE (-EINVAL) so that the XDP
programs can do an early drop if desired.

Patches 1-6 do some more refactoring to IPv6 with the end goal of
extracting a FIB lookup function that aligns with fib_lookup for IPv4,
basically returning a fib6_info without creating a dst based entry.

Patch 7 adds lookup functions to the ipv6 stub. These are needed since
bpf is built into the kernel and ipv6 may not be built or loaded.

Patch 8 adds the bpf helper and 9 adds a sample program.

v3
- remove ETH_ALEN and in6_addr from uapi header

v2
- removed pkt_access from bpf_func_proto as noticed by Daniel
- added check that IPv6 forwarding is enabled
- added DaveM's ack on patches 1-7 and 9 based on v1 response and
  fact that no changes were made to them in v2

v1
- updated commit messages and cover letter
- added comment to sample program noting lack of verification on
  egress device supporting XDP

RFC v2
- fixed use of forward helper from cls_act as noted by Daniel
- in patch 1 rename fib6_lookup_1 as well for consistency

David Ahern (9):
  net/ipv6: Rename fib6_lookup to fib6_node_lookup
  net/ipv6: Rename rt6_multipath_select
  net/ipv6: Extract table lookup from ip6_pol_route
  net/ipv6: Refactor fib6_rule_action
  net/ipv6: Add fib6_lookup
  net/ipv6: Update fib6 tracepoint to take fib6_info
  net/ipv6: Add fib lookup stubs for use in bpf helper
  bpf: Provide helper to do forwarding lookups in kernel FIB table
  samples/bpf: Add example of ipv4 and ipv6 forwarding in XDP

 include/net/addrconf.h                    |  14 ++
 include/net/ip6_fib.h                     |  21 ++-
 include/trace/events/fib6.h               |  14 +-
 include/uapi/linux/bpf.h                  |  81 ++++++++-
 net/core/filter.c                         | 267 ++++++++++++++++++++++++++++++
 net/ipv6/addrconf_core.c                  |  33 +++-
 net/ipv6/af_inet6.c                       |   6 +-
 net/ipv6/fib6_rules.c                     | 138 ++++++++++++---
 net/ipv6/ip6_fib.c                        |  21 ++-
 net/ipv6/route.c                          |  76 +++++----
 samples/bpf/Makefile                      |   4 +
 samples/bpf/xdp_fwd_kern.c                | 115 +++++++++++++
 samples/bpf/xdp_fwd_user.c                | 136 +++++++++++++++
 tools/testing/selftests/bpf/bpf_helpers.h |   3 +
 14 files changed, 854 insertions(+), 75 deletions(-)
 create mode 100644 samples/bpf/xdp_fwd_kern.c
 create mode 100644 samples/bpf/xdp_fwd_user.c

-- 
2.11.0

^ permalink raw reply

* Re: [PATCH net-next] tcp: Add mark for TIMEWAIT sockets
From: Eric Dumazet @ 2018-05-10  3:32 UTC (permalink / raw)
  To: Jon Maxwell, davem; +Cc: kuznet, yoshfuji, netdev, linux-kernel, jmaxwell
In-Reply-To: <20180510020739.8599-1-jmaxwell37@gmail.com>



On 05/09/2018 07:07 PM, Jon Maxwell wrote:
> Aidan McGurn from Openwave Mobility systems reported the following bug:
> 
> "Marked routing is broken on customer deployment. Its effects are large 
> increase in Uplink retransmissions caused by the client never receiving 
> the final ACK to their FINACK - this ACK misses the mark and routes out 
> of the incorrect route."
> 
> Currently marks are added to sk_buffs for replies when the "fwmark_reflect" 
> sysctl is enabled. But not for TIME_WAIT sockets where the original socket had 
> sk->sk_mark set via setsockopt(SO_MARK..).  
> 
> Fix this in IPv4/v6 by adding tw->tw_mark for TIME_WAIT sockets. Copy the 
> original sk->sk_mark in __inet_twsk_hashdance() to the new tw->tw_mark location. 
> Then copy this into ctl_sk->sk_mark so that the skb gets sent with the correct 
> mark. Do the same for resets. Give the "fwmark_reflect" sysctl precedence over 
> sk->sk_mark so that netfilter rules are still honored.
> 
> Signed-off-by: Jon Maxwell <jmaxwell37@gmail.com>
> ---
>  include/net/inet_timewait_sock.h |  1 +
>  net/ipv4/ip_output.c             |  3 ++-
>  net/ipv4/tcp_ipv4.c              | 18 ++++++++++++++++--
>  net/ipv4/tcp_minisocks.c         |  1 +
>  net/ipv6/tcp_ipv6.c              |  8 +++++++-
>  5 files changed, 27 insertions(+), 4 deletions(-)
> 
> diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
> index c7be1ca8e562..659d8ed5a3bc 100644
> --- a/include/net/inet_timewait_sock.h
> +++ b/include/net/inet_timewait_sock.h
> @@ -62,6 +62,7 @@ struct inet_timewait_sock {
>  #define tw_dr			__tw_common.skc_tw_dr
>  
>  	int			tw_timeout;
> +	__u32			tw_mark;
>  	volatile unsigned char	tw_substate;
>  	unsigned char		tw_rcv_wscale;
>  
> diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
> index 95adb171f852..cca4412dc4cb 100644
> --- a/net/ipv4/ip_output.c
> +++ b/net/ipv4/ip_output.c
> @@ -1539,6 +1539,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
>  	struct sk_buff *nskb;
>  	int err;
>  	int oif;
> +	__u32 mark = IP4_REPLY_MARK(net, skb->mark);
>  
>  	if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt))
>  		return;
> @@ -1561,7 +1562,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
>  		oif = skb->skb_iif;
>  
>  	flowi4_init_output(&fl4, oif,
> -			   IP4_REPLY_MARK(net, skb->mark),
> +			   mark ? (mark) : sk->sk_mark,

You can avoid the declaration of mark variable and simply use here :

			IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark,

>  			   RT_TOS(arg->tos),
>  			   RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
>  			   ip_reply_arg_flowi_flags(arg),
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index f70586b50838..fbee36579c83 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -621,6 +621,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
>  	struct sock *sk1 = NULL;
>  #endif
>  	struct net *net;
> +	struct sock *ctl_sk;
>  
>  	/* Never send a reset in response to a reset. */
>  	if (th->rst)
> @@ -723,11 +724,17 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
>  	arg.tos = ip_hdr(skb)->tos;
>  	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
>  	local_bh_disable();
> -	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
> +	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
> +	if (sk && sk->sk_state == TCP_TIME_WAIT)
> +		ctl_sk->sk_mark = inet_twsk(sk)->tw_mark;
> +	else if (sk && sk_fullsock(sk))
> +		ctl_sk->sk_mark = sk->sk_mark;
> +	ip_send_unicast_reply(ctl_sk,
>  			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
>  			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
>  			      &arg, arg.iov[0].iov_len);
>  
> +	ctl_sk->sk_mark = 0;
>  	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
>  	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
>  	local_bh_enable();
> @@ -759,6 +766,7 @@ static void tcp_v4_send_ack(const struct sock *sk,
>  	} rep;
>  	struct net *net = sock_net(sk);
>  	struct ip_reply_arg arg;
> +	struct sock *ctl_sk;
>  
>  	memset(&rep.th, 0, sizeof(struct tcphdr));
>  	memset(&arg, 0, sizeof(arg));
> @@ -809,11 +817,17 @@ static void tcp_v4_send_ack(const struct sock *sk,
>  	arg.tos = tos;
>  	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
>  	local_bh_disable();
> -	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
> +	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
> +	if (sk && sk->sk_state == TCP_TIME_WAIT)
> +		ctl_sk->sk_mark = inet_twsk(sk)->tw_mark;
> +	else if (sk && sk_fullsock(sk))
> +		ctl_sk->sk_mark = sk->sk_mark;
> +	ip_send_unicast_reply(ctl_sk,
>  			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
>  			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
>  			      &arg, arg.iov[0].iov_len);
>  
> +	ctl_sk->sk_mark = 0;
>  	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
>  	local_bh_enable();
>  }
> diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
> index 57b5468b5139..f867658b4b30 100644
> --- a/net/ipv4/tcp_minisocks.c
> +++ b/net/ipv4/tcp_minisocks.c
> @@ -263,6 +263,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
>  		struct inet_sock *inet = inet_sk(sk);
>  
>  		tw->tw_transparent	= inet->transparent;
> +		tw->tw_mark		= sk->sk_mark;
>  		tw->tw_rcv_wscale	= tp->rx_opt.rcv_wscale;
>  		tcptw->tw_rcv_nxt	= tp->rcv_nxt;
>  		tcptw->tw_snd_nxt	= tp->snd_nxt;
> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> index 6d664d83cd16..a6f876125091 100644
> --- a/net/ipv6/tcp_ipv6.c
> +++ b/net/ipv6/tcp_ipv6.c
> @@ -803,6 +803,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
>  	unsigned int tot_len = sizeof(struct tcphdr);
>  	struct dst_entry *dst;
>  	__be32 *topt;
> +	__u32 mark = IP6_REPLY_MARK(net, skb->mark);
>  
>  	if (tsecr)
>  		tot_len += TCPOLEN_TSTAMP_ALIGNED;
> @@ -871,11 +872,16 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
>  		fl6.flowi6_oif = oif;
>  	}
>  
> -	fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark);
> +	if (sk && sk->sk_state == TCP_TIME_WAIT)
> +		ctl_sk->sk_mark = inet_twsk(sk)->tw_mark;
> +	else if (sk && sk_fullsock(sk))
> +		ctl_sk->sk_mark = sk->sk_mark;

Unfortunately IPv6 has a single net->ipv6.tcp_sk, shared by all cpus.

So writing ctl_sk->sk_mark is racy on SMP hosts.

I would suggest using a local variable, and not touch ctl_sk->sk_mark

For consistency, you could do the same for IPv4, even if IPv4 currently uses per-cpu sockets


> +	fl6.flowi6_mark = mark ? (mark) : ctl_sk->sk_mark;
>  	fl6.fl6_dport = t1->dest;
>  	fl6.fl6_sport = t1->source;
>  	fl6.flowi6_uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
>  	security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
> +	ctl_sk->sk_mark = 0;
>  
>  	/* Pass a socket to ip6_dst_lookup either it is for RST
>  	 * Underlying function will use this to retrieve the network
> 

^ permalink raw reply

* [PATCH net-next] net/core: delete the no need variable description in devlink_resource_register() comment
From: Sun Lianwen @ 2018-05-10  3:28 UTC (permalink / raw)
  To: davem; +Cc: netdev

The variables "top_hierarchy" and "reload_required" do not exist in
devlink_resource_register()

Signed-off-by: Sun Lianwen <sunlw.fnst@cn.fujitsu.com>
---
 net/core/devlink.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/net/core/devlink.c b/net/core/devlink.c
index ad1317376798..c9596ea1d016 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -3162,9 +3162,6 @@ EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister);
  *
  *	@devlink: devlink
  *	@resource_name: resource's name
- *	@top_hierarchy: top hierarchy
- *	@reload_required: reload is required for new configuration to
- *			  apply
  *	@resource_size: resource's size
  *	@resource_id: resource's id
  *	@parent_reosurce_id: resource's parent id
-- 
2.17.0

^ permalink raw reply related

* [PATCH] bpf, doc: clarification for the meaning of 'id'
From: Wang YanQing @ 2018-05-10  3:09 UTC (permalink / raw)
  To: daniel; +Cc: ast, corbet, netdev, linux-kernel, linux-doc

For me, as a reader whose native language isn't English, the
old wording makes the meaning a little difficult to catch; this
patch rewords the subsection to make it clearer.

This patch also adds blank lines as separators in two places
to improve readability.

Signed-off-by: Wang YanQing <udknight@gmail.com>
---
 Documentation/networking/filter.txt | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt
index 5032e12..e6b4ebb 100644
--- a/Documentation/networking/filter.txt
+++ b/Documentation/networking/filter.txt
@@ -1142,6 +1142,7 @@ into a register from memory, the register's top 56 bits are known zero, while
 the low 8 are unknown - which is represented as the tnum (0x0; 0xff).  If we
 then OR this with 0x40, we get (0x40; 0xbf), then if we add 1 we get (0x0;
 0x1ff), because of potential carries.
+
 Besides arithmetic, the register state can also be updated by conditional
 branches.  For instance, if a SCALAR_VALUE is compared > 8, in the 'true' branch
 it will have a umin_value (unsigned minimum value) of 9, whereas in the 'false'
@@ -1150,14 +1151,16 @@ BPF_JSGE) would instead update the signed minimum/maximum values.  Information
 from the signed and unsigned bounds can be combined; for instance if a value is
 first tested < 8 and then tested s> 4, the verifier will conclude that the value
 is also > 4 and s< 8, since the bounds prevent crossing the sign boundary.
+
 PTR_TO_PACKETs with a variable offset part have an 'id', which is common to all
 pointers sharing that same variable offset.  This is important for packet range
-checks: after adding some variable to a packet pointer, if you then copy it to
-another register and (say) add a constant 4, both registers will share the same
-'id' but one will have a fixed offset of +4.  Then if it is bounds-checked and
-found to be less than a PTR_TO_PACKET_END, the other register is now known to
-have a safe range of at least 4 bytes.  See 'Direct packet access', below, for
-more on PTR_TO_PACKET ranges.
+checks: after adding a variable to a packet pointer register A, if you then copy
+it to another register B and then add a constant 4 to A, both registers will
+share the same 'id' but the A will have a fixed offset of +4.  Then if A is
+bounds-checked and found to be less than a PTR_TO_PACKET_END, the register B is
+now known to have a safe range of at least 4 bytes.  See 'Direct packet access',
+below, for more on PTR_TO_PACKET ranges.
+
 The 'id' field is also used on PTR_TO_MAP_VALUE_OR_NULL, common to all copies of
 the pointer returned from a map lookup.  This means that when one copy is
 checked and found to be non-NULL, all copies can become PTR_TO_MAP_VALUEs.
-- 
1.8.5.6.2.g3d8a54e.dirty

^ permalink raw reply related

* [PATCH net-next] net/core: correct the variable name in dev_ioctl() comment
From: Sun Lianwen @ 2018-05-10  3:01 UTC (permalink / raw)
  To: davem; +Cc: netdev

The variable name is not "arg" but "ifr" in dev_ioctl()

Signed-off-by: Sun Lianwen <sunlw.fnst@cn.fujitsu.com>
---
 net/core/dev_ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index a04e1e88bf3a..114e29053977 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -378,7 +378,7 @@ EXPORT_SYMBOL(dev_load);
  *	dev_ioctl	-	network device ioctl
  *	@net: the applicable net namespace
  *	@cmd: command to issue
- *	@arg: pointer to a struct ifreq in user space
+ *	@ifr: pointer to a struct ifreq in user space
  *
  *	Issue ioctl functions to devices. This is normally called by the
  *	user space syscall interfaces but can sometimes be useful for
-- 
2.17.0

^ permalink raw reply related

* Re: [PATCH net-next RFC 1/3] net: Add support to configure SR-IOV VF minimum and maximum queues.
From: Michael Chan @ 2018-05-10  2:32 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: Or Gerlitz, David Miller, Netdev
In-Reply-To: <20180509181018.4583e577@cakuba.netronome.com>

On Wed, May 9, 2018 at 6:10 PM, Jakub Kicinski
<jakub.kicinski@netronome.com> wrote:
> On Wed, 9 May 2018 17:22:50 -0700, Michael Chan wrote:
>> On Wed, May 9, 2018 at 4:15 PM, Jakub Kicinski wrote:
>> > On Wed,  9 May 2018 07:21:41 -0400, Michael Chan wrote:
>> >> VF Queue resources are always limited and there is currently no
>> >> infrastructure to allow the admin. on the host to add or reduce queue
>> >> resources for any particular VF.  With ever increasing number of VFs
>> >> being supported, it is desirable to allow the admin. to configure queue
>> >> resources differently for the VFs.  Some VFs may require more or fewer
>> >> queues due to different bandwidth requirements or different number of
>> >> vCPUs in the VM.  This patch adds the infrastructure to do that by
>> >> adding IFLA_VF_QUEUES netlink attribute and a new .ndo_set_vf_queues()
>> >> to the net_device_ops.
>> >>
>> >> Four parameters are exposed for each VF:
>> >>
>> >> o min_tx_queues - Guaranteed or current tx queues assigned to the VF.
>> >
>> > This muxing of semantics may be a little awkward and unnecessary, would
>> > it make sense for struct ifla_vf_info to have a separate fields for
>> > current number of queues and the admin-set guaranteed min?
>>
>> The loose semantics is mainly to allow some flexibility in
>> implementation.  Sure, we can tighten the definitions or add
>> additional fields.
>
> I would appreciate that, if others don't disagree.  I personally don't
> see the need for flexibility (AKA per-vendor behaviour) here, quite the
> opposite, min/max/current number of queues seems quite self-explanatory.
>
> Or at least don't allow min to mean current?  Otherwise the API gets a
> bit asymmetrical :(

Sure, will do.

>
>> > Is there a real world use case for the min value or are you trying to
>> > make the API feature complete?
>>
>> In this proposal, these parameters are mainly viewed as the bounds for
>> the queues that each VF can potentially allocate.  The actual number
>> of queues chosen by the VF driver or modified by the VF user can be
>> any number within the bounds.
>
> Perhaps you have misspoken here - these are not allowed bounds, right?
> min is the guarantee that queues will be available, not requirement.
> Similar to bandwidth allocation.
>
> IOW if the bounds are set [4, 16] the VF may still choose to use 1
> queue, even though that's not within bounds.

Yes, you are absolutely right.  The VF can allocate 1 queue.  Up to
min is guaranteed.  Up to max is not guaranteed.

^ permalink raw reply

* linux-next: manual merge of the net-next tree with the net tree
From: Stephen Rothwell @ 2018-05-10  2:13 UTC (permalink / raw)
  To: David Miller, Networking
  Cc: Linux-Next Mailing List, Linux Kernel Mailing List,
	Heiner Kallweit

[-- Attachment #1: Type: text/plain, Size: 3147 bytes --]

Hi all,

Today's linux-next merge of the net-next tree got a conflict in:

  drivers/net/ethernet/realtek/r8169.c

between commit:

  3148dedfe79e ("r8169: fix powering up RTL8168h")

from the net tree and commit:

  4f447d296982 ("r8169: drop member pll_power_ops from struct rtl8169_private")

from the net-next tree.

I fixed it up (I think - see below) and can carry the fix as
necessary. This is now fixed as far as linux-next is concerned, but any
non trivial conflicts should be mentioned to your upstream maintainer
when your tree is submitted for merging.  You may also want to consider
cooperating with the maintainer of the conflicting tree to minimise any
particularly complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc drivers/net/ethernet/realtek/r8169.c
index c7aac1fc99e8,6d99b141a7aa..000000000000
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@@ -4980,72 -4767,13 +4767,16 @@@ static void rtl_pll_power_down(struct r
  
  static void rtl_pll_power_up(struct rtl8169_private *tp)
  {
- 	rtl_generic_op(tp, tp->pll_power_ops.up);
- 
- 	/* give MAC/PHY some time to resume */
- 	msleep(20);
- }
- 
- static void rtl_init_pll_power_ops(struct rtl8169_private *tp)
- {
- 	struct pll_power_ops *ops = &tp->pll_power_ops;
- 
  	switch (tp->mac_version) {
- 	case RTL_GIGA_MAC_VER_07:
- 	case RTL_GIGA_MAC_VER_08:
- 	case RTL_GIGA_MAC_VER_09:
- 	case RTL_GIGA_MAC_VER_10:
- 	case RTL_GIGA_MAC_VER_16:
- 	case RTL_GIGA_MAC_VER_29:
- 	case RTL_GIGA_MAC_VER_30:
- 	case RTL_GIGA_MAC_VER_37:
- 	case RTL_GIGA_MAC_VER_39:
- 	case RTL_GIGA_MAC_VER_43:
- 	case RTL_GIGA_MAC_VER_47:
- 	case RTL_GIGA_MAC_VER_48:
- 		ops->down	= r810x_pll_power_down;
- 		ops->up		= r810x_pll_power_up;
- 		break;
- 
- 	case RTL_GIGA_MAC_VER_11:
- 	case RTL_GIGA_MAC_VER_12:
- 	case RTL_GIGA_MAC_VER_17:
- 	case RTL_GIGA_MAC_VER_18:
- 	case RTL_GIGA_MAC_VER_19:
- 	case RTL_GIGA_MAC_VER_20:
- 	case RTL_GIGA_MAC_VER_21:
- 	case RTL_GIGA_MAC_VER_22:
- 	case RTL_GIGA_MAC_VER_23:
- 	case RTL_GIGA_MAC_VER_24:
- 	case RTL_GIGA_MAC_VER_25:
- 	case RTL_GIGA_MAC_VER_26:
- 	case RTL_GIGA_MAC_VER_27:
- 	case RTL_GIGA_MAC_VER_28:
- 	case RTL_GIGA_MAC_VER_31:
- 	case RTL_GIGA_MAC_VER_32:
- 	case RTL_GIGA_MAC_VER_33:
- 	case RTL_GIGA_MAC_VER_34:
- 	case RTL_GIGA_MAC_VER_35:
- 	case RTL_GIGA_MAC_VER_36:
- 	case RTL_GIGA_MAC_VER_38:
- 	case RTL_GIGA_MAC_VER_40:
- 	case RTL_GIGA_MAC_VER_41:
- 	case RTL_GIGA_MAC_VER_42:
- 	case RTL_GIGA_MAC_VER_44:
- 	case RTL_GIGA_MAC_VER_45:
- 	case RTL_GIGA_MAC_VER_46:
- 	case RTL_GIGA_MAC_VER_49:
- 	case RTL_GIGA_MAC_VER_50:
- 	case RTL_GIGA_MAC_VER_51:
- 		ops->down	= r8168_pll_power_down;
- 		ops->up		= r8168_pll_power_up;
+ 	case RTL_GIGA_MAC_VER_01 ... RTL_GIGA_MAC_VER_06:
+ 	case RTL_GIGA_MAC_VER_13 ... RTL_GIGA_MAC_VER_15:
  		break;
- 
  	default:
- 		ops->down	= NULL;
- 		ops->up		= NULL;
- 		break;
+ 		r8168_pll_power_up(tp);
  	}
++
++	/* give MAC/PHY some time to resume */
++	msleep(20);
  }
  
  static void rtl_init_rxcfg(struct rtl8169_private *tp)

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

* [PATCH net-next] tcp: Add mark for TIMEWAIT sockets
From: Jon Maxwell @ 2018-05-10  2:07 UTC (permalink / raw)
  To: davem; +Cc: kuznet, yoshfuji, netdev, linux-kernel, jmaxwell

Aidan McGurn from Openwave Mobility systems reported the following bug:

"Marked routing is broken on customer deployment. Its effects are large 
increase in Uplink retransmissions caused by the client never receiving 
the final ACK to their FINACK - this ACK misses the mark and routes out 
of the incorrect route."

Currently marks are added to sk_buffs for replies when the "fwmark_reflect" 
sysctl is enabled. But not for TIME_WAIT sockets where the original socket had 
sk->sk_mark set via setsockopt(SO_MARK..).  

Fix this in IPv4/v6 by adding tw->tw_mark for TIME_WAIT sockets. Copy the 
original sk->sk_mark in __inet_twsk_hashdance() to the new tw->tw_mark location. 
Then copy this into ctl_sk->sk_mark so that the skb gets sent with the correct 
mark. Do the same for resets. Give the "fwmark_reflect" sysctl precedence over 
sk->sk_mark so that netfilter rules are still honored.

Signed-off-by: Jon Maxwell <jmaxwell37@gmail.com>
---
 include/net/inet_timewait_sock.h |  1 +
 net/ipv4/ip_output.c             |  3 ++-
 net/ipv4/tcp_ipv4.c              | 18 ++++++++++++++++--
 net/ipv4/tcp_minisocks.c         |  1 +
 net/ipv6/tcp_ipv6.c              |  8 +++++++-
 5 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index c7be1ca8e562..659d8ed5a3bc 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -62,6 +62,7 @@ struct inet_timewait_sock {
 #define tw_dr			__tw_common.skc_tw_dr
 
 	int			tw_timeout;
+	__u32			tw_mark;
 	volatile unsigned char	tw_substate;
 	unsigned char		tw_rcv_wscale;
 
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 95adb171f852..cca4412dc4cb 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1539,6 +1539,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
 	struct sk_buff *nskb;
 	int err;
 	int oif;
+	__u32 mark = IP4_REPLY_MARK(net, skb->mark);
 
 	if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt))
 		return;
@@ -1561,7 +1562,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
 		oif = skb->skb_iif;
 
 	flowi4_init_output(&fl4, oif,
-			   IP4_REPLY_MARK(net, skb->mark),
+			   mark ? (mark) : sk->sk_mark,
 			   RT_TOS(arg->tos),
 			   RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
 			   ip_reply_arg_flowi_flags(arg),
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f70586b50838..fbee36579c83 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -621,6 +621,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 	struct sock *sk1 = NULL;
 #endif
 	struct net *net;
+	struct sock *ctl_sk;
 
 	/* Never send a reset in response to a reset. */
 	if (th->rst)
@@ -723,11 +724,17 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 	arg.tos = ip_hdr(skb)->tos;
 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
 	local_bh_disable();
-	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
+	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
+	if (sk && sk->sk_state == TCP_TIME_WAIT)
+		ctl_sk->sk_mark = inet_twsk(sk)->tw_mark;
+	else if (sk && sk_fullsock(sk))
+		ctl_sk->sk_mark = sk->sk_mark;
+	ip_send_unicast_reply(ctl_sk,
 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 			      &arg, arg.iov[0].iov_len);
 
+	ctl_sk->sk_mark = 0;
 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
 	local_bh_enable();
@@ -759,6 +766,7 @@ static void tcp_v4_send_ack(const struct sock *sk,
 	} rep;
 	struct net *net = sock_net(sk);
 	struct ip_reply_arg arg;
+	struct sock *ctl_sk;
 
 	memset(&rep.th, 0, sizeof(struct tcphdr));
 	memset(&arg, 0, sizeof(arg));
@@ -809,11 +817,17 @@ static void tcp_v4_send_ack(const struct sock *sk,
 	arg.tos = tos;
 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
 	local_bh_disable();
-	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
+	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
+	if (sk && sk->sk_state == TCP_TIME_WAIT)
+		ctl_sk->sk_mark = inet_twsk(sk)->tw_mark;
+	else if (sk && sk_fullsock(sk))
+		ctl_sk->sk_mark = sk->sk_mark;
+	ip_send_unicast_reply(ctl_sk,
 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 			      &arg, arg.iov[0].iov_len);
 
+	ctl_sk->sk_mark = 0;
 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 	local_bh_enable();
 }
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 57b5468b5139..f867658b4b30 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -263,6 +263,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 		struct inet_sock *inet = inet_sk(sk);
 
 		tw->tw_transparent	= inet->transparent;
+		tw->tw_mark		= sk->sk_mark;
 		tw->tw_rcv_wscale	= tp->rx_opt.rcv_wscale;
 		tcptw->tw_rcv_nxt	= tp->rcv_nxt;
 		tcptw->tw_snd_nxt	= tp->snd_nxt;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 6d664d83cd16..a6f876125091 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -803,6 +803,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
 	unsigned int tot_len = sizeof(struct tcphdr);
 	struct dst_entry *dst;
 	__be32 *topt;
+	__u32 mark = IP6_REPLY_MARK(net, skb->mark);
 
 	if (tsecr)
 		tot_len += TCPOLEN_TSTAMP_ALIGNED;
@@ -871,11 +872,16 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
 		fl6.flowi6_oif = oif;
 	}
 
-	fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark);
+	if (sk && sk->sk_state == TCP_TIME_WAIT)
+		ctl_sk->sk_mark = inet_twsk(sk)->tw_mark;
+	else if (sk && sk_fullsock(sk))
+		ctl_sk->sk_mark = sk->sk_mark;
+	fl6.flowi6_mark = mark ? (mark) : ctl_sk->sk_mark;
 	fl6.fl6_dport = t1->dest;
 	fl6.fl6_sport = t1->source;
 	fl6.flowi6_uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
 	security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
+	ctl_sk->sk_mark = 0;
 
 	/* Pass a socket to ip6_dst_lookup either it is for RST
 	 * Underlying function will use this to retrieve the network
-- 
2.13.6

^ permalink raw reply related

* Re: [PATCH net] macmace: Set platform device coherent_dma_mask
From: Finn Thain @ 2018-05-10  1:25 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: David S. Miller, linux-m68k, netdev, Linux Kernel Mailing List,
	Christoph Hellwig
In-Reply-To: <CAMuHMdU1XBqt7hwEW6JTas64ZNGCGCMr5HMZwuLo0O-ZBCOWyA@mail.gmail.com>

On Thu, 3 May 2018, Geert Uytterhoeven wrote:

> 
> Perhaps you can add a new helper 
> (platform_device_register_simple_dma()?) that takes the DMA mask, too?

Would there be enough potential callers in future to justify that API?
It seems that there haven't been many in the past. I found four users of 
platform_device_register_simple() which might benefit. Mostly these call 
dma_set_coherent_mask() in the platform driver probe routine.

drivers/gpu/drm/etnaviv/etnaviv_drv.c
drivers/gpu/drm/exynos/exynos_drm_drv.c
drivers/gpu/drm/omapdrm/omap_drv.c
drivers/parport/parport_pc.c

(Am I missing any others?)

To actually hoist the dma mask setup out of existing platform drivers 
would have implications for every device that matches with those drivers. 

That's a bit risky since I can't test those devices -- that's assuming I 
could identify them all; sometimes platform device matching is not well 
defined at build time (see loongson_sysconf.ecname).

So far, it looks like macmace and macsonic would be the only callers of 
this new API call.

What's worse, if you do pass a dma_mask in struct platform_device_info, 
you end up with this problem in platform_device_register_full():

        if (pdevinfo->dma_mask) {
                /*
                 * This memory isn't freed when the device is put,
                 * I don't have a nice idea for that though.  Conceptually
                 * dma_mask in struct device should not be a pointer.
                 * See http://thread.gmane.org/gmane.linux.kernel.pci/9081
                 */
                pdev->dev.dma_mask =
                        kmalloc(sizeof(*pdev->dev.dma_mask), GFP_KERNEL);

Most of the platform drivers that call dma_coerce_mask_and_coherent() are 
using pdev->of_match_table, not platform_device_register_simple(). Many of 
them have a comment like this:

        /*
         * Right now device-tree probed devices don't get dma_mask set.
         * Since shared usb code relies on it, set it here for now.
         * Once we have dma capability bindings this can go away.
         */

> With people setting the mask to kill the WARNING splat, this may become 
> more common.

Since the commit which introduced the WARNING, only commits f61e64310b75 
("m68k: set dma and coherent masks for platform FEC ethernets") and 
7bcfab202ca7 ("powerpc/macio: set a proper dma_coherent_mask") seem to be 
aimed at squelching that WARNING.

(Am I missing any others?)

So far, this is not looking like a common problem, and I'm having trouble 
finding some way to improve on my original patches.

-- 

^ permalink raw reply

* Re: [PATCH net] macmace: Set platform device coherent_dma_mask
From: Finn Thain @ 2018-05-10  1:25 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Geert Uytterhoeven, David S. Miller, linux-m68k, netdev,
	Linux Kernel Mailing List
In-Reply-To: <20180503085120.GA14574@lst.de>

On Thu, 3 May 2018, Christoph Hellwig wrote:

> On Thu, May 03, 2018 at 10:46:56AM +0200, Geert Uytterhoeven wrote:
> > Perhaps you can add a new helper 
> > (platform_device_register_simple_dma()?) that takes the DMA mask, too? 
> > With people setting the mask to kill the WARNING splat, this may 
> > become more common.
> > 
> > struct platform_device_info already has a dma_mask field, but 
> > platform_device_register_resndata() explicitly sets it to zero.
> 
> Yes, that would be useful.  The other assumption could be that platform 
> devices always allow an all-0xff dma mask.

Could that have unintended side effects? The mask is presently unset by 
default, and my worry would be that changing it may cause some drivers to 
behave differently.

A quick grep turns up this in drivers/spi/spi-au1550.c for example,

	if (pdev->dev.dma_mask == NULL)
		dev_warn(&pdev->dev, "no dma mask\n");
	else
		hw->usedma = 1;

Also, if pdev.dev.dma_mask is to get a default value, shouldn't it use the 
same default as dma_get_mask, to avoid unintended side effects?

static inline u64 dma_get_mask(struct device *dev)
{
        if (dev && dev->dma_mask && *dev->dma_mask)
                return *dev->dma_mask;
        return DMA_BIT_MASK(32);
}

-- 

^ permalink raw reply

* Re: [PATCH net-next RFC 1/3] net: Add support to configure SR-IOV VF minimum and maximum queues.
From: Jakub Kicinski @ 2018-05-10  1:10 UTC (permalink / raw)
  To: Michael Chan; +Cc: Or Gerlitz, David Miller, Netdev
In-Reply-To: <CACKFLik6Mcd3YL8hUGG=uFV6QkBbnj3rskg08GezQrWKK5TBMA@mail.gmail.com>

On Wed, 9 May 2018 17:22:50 -0700, Michael Chan wrote:
> On Wed, May 9, 2018 at 4:15 PM, Jakub Kicinski wrote:
> > On Wed,  9 May 2018 07:21:41 -0400, Michael Chan wrote:  
> >> VF Queue resources are always limited and there is currently no
> >> infrastructure to allow the admin. on the host to add or reduce queue
> >> resources for any particular VF.  With ever increasing number of VFs
> >> being supported, it is desirable to allow the admin. to configure queue
> >> resources differently for the VFs.  Some VFs may require more or fewer
> >> queues due to different bandwidth requirements or different number of
> >> vCPUs in the VM.  This patch adds the infrastructure to do that by
> >> adding IFLA_VF_QUEUES netlink attribute and a new .ndo_set_vf_queues()
> >> to the net_device_ops.
> >>
> >> Four parameters are exposed for each VF:
> >>
> >> o min_tx_queues - Guaranteed or current tx queues assigned to the VF.  
> >
> > This muxing of semantics may be a little awkward and unnecessary, would
> > it make sense for struct ifla_vf_info to have separate fields for
> > current number of queues and the admin-set guaranteed min?  
> 
> The loose semantics is mainly to allow some flexibility in
> implementation.  Sure, we can tighten the definitions or add
> additional fields.

I would appreciate that, if others don't disagree.  I personally don't
see the need for flexibility (AKA per-vendor behaviour) here, quite the
opposite, min/max/current number of queues seems quite self-explanatory.

Or at least don't allow min to mean current?  Otherwise the API gets a
bit asymmetrical :(

> > Is there a real world use case for the min value or are you trying to
> > make the API feature complete?  
> 
> In this proposal, these parameters are mainly viewed as the bounds for
> the queues that each VF can potentially allocate.  The actual number
> of queues chosen by the VF driver or modified by the VF user can be
> any number within the bounds.

Perhaps you have misspoken here - these are not allowed bounds, right?
min is the guarantee that queues will be available, not requirement.
Similar to bandwidth allocation.

IOW if the bounds are set [4, 16] the VF may still choose to use 1
queue, even though that's not within bounds.

> We currently need to have min and max parameters to support the
> different modes we use to distribute the queue resources to the VFs.
> In one mode, for example, resources are statically divided and each VF
> has a small number of guaranteed queues (min = max).  In a different
> mode, we allow more flexible resource allocation with each VF having a
> small number of guaranteed queues but a higher number of
> non-guaranteed queues (min < max).  Some VFs may be able to allocate
> queues much higher than min when resources are still available, while
> others may only be able to allocate min queues when resources are used
> up.
> 
> With min and max exposed, the PF user can properly tweak the resources
> for each VF described above.

Right, I was just looking for a real world scenario where this
flexibility is going to be used - mainly because the switchdev model I
described below won't allow it.  I'm not sure users will leave a
portion of queues to be allocated by chance.

> >> o max_tx_queues - Maximum but not necessarily guaranteed tx queues
> >>   available to the VF.
> >>
> >> o min_rx_queues - Guaranteed or current rx queues assigned to the VF.
> >>
> >> o max_rx_queues - Maximum but not necessarily guaranteed rx queues
> >>   available to the VF.
> >>
> >> The "ip link set" command will subsequently be patched to support the new
> >> operation to set the above parameters.
> >>
> >> After the admin. makes a change to the above parameters, the corresponding
> >> VF will have a new range of channels to set using ethtool -L.
> >>
> >> Signed-off-by: Michael Chan <michael.chan@broadcom.com>  
> >
> > In switchdev mode we can use number of queues on the representor as a
> > proxy for max number of queues allowed for the ASIC port.  This works
> > better when representors are muxed in the first place than when they
> > have actual queues backing them.  WDYT about such scheme, Or?  A very
> > pleasant side-effect is that one can configure qdiscs and get stats
> > per-HW queue.  
> 
> This is an interesting approach.  But it doesn't have the min and max
> for each VF, and also only works in switchdev mode.

It allows controlling all ports of the switch with the same, existing
and well known API (incl. PFs).  But sadly I don't think we are at the
point where switchdev-mode solutions are considered an alternative, so
I'm only mentioning it to broaden the discussion :)  I'm not opposed to
your patches :)

^ permalink raw reply

* Proposal
From: Zeliha Omer Faruk @ 2018-05-10  0:39 UTC (permalink / raw)




-- 
Hello

Greetings to you please i have a business proposal for you contact me
for more detailes asap thanks.

Best Regards,
Miss.Zeliha ömer faruk
Esentepe Mahallesi Büyükdere
Caddesi Kristal Kule Binasi
No:215
Sisli - Istanbul, Turkey

^ permalink raw reply

* Re: [PATCH net-next RFC 1/3] net: Add support to configure SR-IOV VF minimum and maximum queues.
From: Michael Chan @ 2018-05-10  0:22 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: Or Gerlitz, David Miller, Netdev
In-Reply-To: <20180509161509.373f8c1b@cakuba.netronome.com>

On Wed, May 9, 2018 at 4:15 PM, Jakub Kicinski
<jakub.kicinski@netronome.com> wrote:
> On Wed,  9 May 2018 07:21:41 -0400, Michael Chan wrote:
>> VF Queue resources are always limited and there is currently no
>> infrastructure to allow the admin. on the host to add or reduce queue
>> resources for any particular VF.  With ever increasing number of VFs
>> being supported, it is desirable to allow the admin. to configure queue
>> resources differently for the VFs.  Some VFs may require more or fewer
>> queues due to different bandwidth requirements or different number of
>> vCPUs in the VM.  This patch adds the infrastructure to do that by
>> adding IFLA_VF_QUEUES netlink attribute and a new .ndo_set_vf_queues()
>> to the net_device_ops.
>>
>> Four parameters are exposed for each VF:
>>
>> o min_tx_queues - Guaranteed or current tx queues assigned to the VF.
>
> This muxing of semantics may be a little awkward and unnecessary, would
> it make sense for struct ifla_vf_info to have separate fields for
> current number of queues and the admin-set guaranteed min?

The loose semantics is mainly to allow some flexibility in
implementation.  Sure, we can tighten the definitions or add
additional fields.

>
> Is there a real world use case for the min value or are you trying to
> make the API feature complete?

In this proposal, these parameters are mainly viewed as the bounds for
the queues that each VF can potentially allocate.  The actual number
of queues chosen by the VF driver or modified by the VF user can be
any number within the bounds.

We currently need to have min and max parameters to support the
different modes we use to distribute the queue resources to the VFs.
In one mode, for example, resources are statically divided and each VF
has a small number of guaranteed queues (min = max).  In a different
mode, we allow more flexible resource allocation with each VF having a
small number of guaranteed queues but a higher number of
non-guaranteed queues (min < max).  Some VFs may be able to allocate
queues much higher than min when resources are still available, while
others may only be able to allocate min queues when resources are used
up.

With min and max exposed, the PF user can properly tweak the resources
for each VF described above.

>
>> o max_tx_queues - Maximum but not necessarily guaranteed tx queues
>>   available to the VF.
>>
>> o min_rx_queues - Guaranteed or current rx queues assigned to the VF.
>>
>> o max_rx_queues - Maximum but not necessarily guaranteed rx queues
>>   available to the VF.
>>
>> The "ip link set" command will subsequently be patched to support the new
>> operation to set the above parameters.
>>
>> After the admin. makes a change to the above parameters, the corresponding
>> VF will have a new range of channels to set using ethtool -L.
>>
>> Signed-off-by: Michael Chan <michael.chan@broadcom.com>
>
> In switchdev mode we can use number of queues on the representor as a
> proxy for max number of queues allowed for the ASIC port.  This works
> better when representors are muxed in the first place than when they
> have actual queues backing them.  WDYT about such scheme, Or?  A very
> pleasant side-effect is that one can configure qdiscs and get stats
> per-HW queue.

This is an interesting approach.  But it doesn't have the min and max
for each VF, and also only works in switchdev mode.

^ permalink raw reply

* [PATCH net 2/2] bonding: send learning packets for vlans on slave
From: Debabrata Banerjee @ 2018-05-09 23:32 UTC (permalink / raw)
  To: David S . Miller, netdev, Vlad Yasevich
  Cc: Jay Vosburgh, Veaceslav Falico, Andy Gospodarek, dbanerje
In-Reply-To: <20180509233211.28207-1-dbanerje@akamai.com>

There was a regression at some point from the intended functionality of
commit f60c3704e87d ("bonding: Fix alb mode to only use first level
vlans.")

Given the return value of vlan_get_encap_level(), we need to store the nest
level of the bond device, and then compare the vlan's encap level to
this. Without this, this check always fails and learning packets are
never sent.

In addition, this same commit caused a regression in the behavior of
balance_alb, which requires learning packets be sent for all interfaces
using the slave's mac in order to load balance properly. For vlans
that have not set a user mac, we can send after checking one bit.
Otherwise we need to send the set mac, albeit defeating rx load balancing
for that vlan.

Signed-off-by: Debabrata Banerjee <dbanerje@akamai.com>
---
 drivers/net/bonding/bond_alb.c  | 13 ++++++++-----
 drivers/net/bonding/bond_main.c |  2 ++
 include/net/bonding.h           |  1 +
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 3f6faa657360..5eb0df2e5464 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -943,6 +943,10 @@ static void alb_send_lp_vid(struct slave *slave, u8 mac_addr[],
 	skb->priority = TC_PRIO_CONTROL;
 	skb->dev = slave->dev;
 
+	netdev_dbg(slave->bond->dev,
+		   "Send learning packet: dev %s mac %pM vlan %d\n",
+		   slave->dev->name, mac_addr, vid);
+
 	if (vid)
 		__vlan_hwaccel_put_tag(skb, vlan_proto, vid);
 
@@ -965,14 +969,13 @@ static int alb_upper_dev_walk(struct net_device *upper, void *_data)
 	u8 *mac_addr = data->mac_addr;
 	struct bond_vlan_tag *tags;
 
-	if (is_vlan_dev(upper) && vlan_get_encap_level(upper) == 0) {
-		if (strict_match &&
-		    ether_addr_equal_64bits(mac_addr,
-					    upper->dev_addr)) {
+	if (is_vlan_dev(upper) &&
+	    bond->nest_level == vlan_get_encap_level(upper) - 1) {
+		if (upper->addr_assign_type == NET_ADDR_STOLEN) {
 			alb_send_lp_vid(slave, mac_addr,
 					vlan_dev_vlan_proto(upper),
 					vlan_dev_vlan_id(upper));
-		} else if (!strict_match) {
+		} else {
 			alb_send_lp_vid(slave, upper->dev_addr,
 					vlan_dev_vlan_proto(upper),
 					vlan_dev_vlan_id(upper));
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 718e4914e3a0..1f1e97b26f95 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1738,6 +1738,8 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev,
 	if (bond_mode_uses_xmit_hash(bond))
 		bond_update_slave_arr(bond, NULL);
 
+	bond->nest_level = dev_get_nest_level(bond_dev);
+
 	netdev_info(bond_dev, "Enslaving %s as %s interface with %s link\n",
 		    slave_dev->name,
 		    bond_is_active_slave(new_slave) ? "an active" : "a backup",
diff --git a/include/net/bonding.h b/include/net/bonding.h
index f801fc940b29..b52235158836 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -198,6 +198,7 @@ struct bonding {
 	struct   slave __rcu *primary_slave;
 	struct   bond_up_slave __rcu *slave_arr; /* Array of usable slaves */
 	bool     force_primary;
+	u32      nest_level;
 	s32      slave_cnt; /* never change this value outside the attach/detach wrappers */
 	int     (*recv_probe)(const struct sk_buff *, struct bonding *,
 			      struct slave *);
-- 
2.17.0

^ permalink raw reply related

* Proposal
From: Zeliha Omer Faruk @ 2018-05-09 23:57 UTC (permalink / raw)




-- 
Hello

Greetings to you please i have a business proposal for you contact me
for more detailes asap thanks.

Best Regards,
Miss.Zeliha ömer faruk
Esentepe Mahallesi Büyükdere
Caddesi Kristal Kule Binasi
No:215
Sisli - Istanbul, Turkey

^ permalink raw reply

* [PATCH net 1/2] bonding: do not allow rlb updates to invalid mac
From: Debabrata Banerjee @ 2018-05-09 23:32 UTC (permalink / raw)
  To: David S . Miller, netdev, Vlad Yasevich
  Cc: Jay Vosburgh, Veaceslav Falico, Andy Gospodarek, dbanerje
In-Reply-To: <20180509233211.28207-1-dbanerje@akamai.com>

Make sure multicast, broadcast, and zero macs cannot be the output of rlb
updates, which should all be directed arps. Receive load balancing will be
collapsed if any of these happen, as the switch will broadcast.

Signed-off-by: Debabrata Banerjee <dbanerje@akamai.com>
---
 drivers/net/bonding/bond_alb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 1ed9529e7bd1..3f6faa657360 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -450,7 +450,7 @@ static void rlb_update_client(struct rlb_client_info *client_info)
 {
 	int i;

-	if (!client_info->slave)
+	if (!client_info->slave || !is_valid_ether_addr(client_info->mac_dst))
 		return;

 	for (i = 0; i < RLB_ARP_BURST_SIZE; i++) {
-- 
2.17.0

^ permalink raw reply related

* [PATCH net 0/2] bonding: bug fixes and regressions
From: Debabrata Banerjee @ 2018-05-09 23:32 UTC (permalink / raw)
  To: David S . Miller, netdev, Vlad Yasevich
  Cc: Jay Vosburgh, Veaceslav Falico, Andy Gospodarek, dbanerje

Fixes to bonding driver for balance-alb mode, suitable for stable.

Debabrata Banerjee (2):
  bonding: do not allow rlb updates to invalid mac
  bonding: send learning packets for vlans on slave

 drivers/net/bonding/bond_alb.c  | 15 +++++++++------
 drivers/net/bonding/bond_main.c |  2 ++
 include/net/bonding.h           |  1 +
 3 files changed, 12 insertions(+), 6 deletions(-)

-- 
2.17.0

^ permalink raw reply

* Re: linux-next: Tree for May 9 (mlx5)
From: Saeed Mahameed @ 2018-05-09 23:21 UTC (permalink / raw)
  To: sfr@canb.auug.org.au, rdunlap@infradead.org, Israel Rukshin,
	linux-next@vger.kernel.org, Max Gurtovoy
  Cc: linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	Leon Romanovsky, Matan Barak
In-Reply-To: <826ab15b-f2bc-0c5d-8d5e-6466badcd3e0@infradead.org>

On Wed, 2018-05-09 at 08:31 -0700, Randy Dunlap wrote:
> On 05/09/2018 04:21 AM, Stephen Rothwell wrote:
> > Hi all,
> > 
> > Changes since 20180508:
> > 
> 
> on x86_64:
> # CONFIG_SMP is not set
> 
> In file included from
> ../drivers/net/ethernet/mellanox/mlx5/core/main.c:43:0:
> ../include/linux/mlx5/driver.h: In function
> 'mlx5_get_vector_affinity_hint':
> ../include/linux/mlx5/driver.h:1299:13: error: 'struct irq_desc' has
> no member named 'affinity_hint'
>   return desc->affinity_hint;
>              ^
> 
> 

Hi Stephen,

Israel and Max are working on a solution, will provide it ASAP.

Thanks,
Saeed.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox