Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next 3/9] net/ipv6: Plumb support for filtering route dumps
From: David Ahern @ 2018-10-11 15:06 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern
In-Reply-To: <20181011150627.4010-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Implement kernel side filtering of routes by table id, egress device
index, protocol, and route type.

Move the existing route flags check for prefix only routes to the new
filter.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 net/ipv6/ip6_fib.c |  9 +++++++++
 net/ipv6/route.c   | 40 ++++++++++++++++++++++++++++++++--------
 2 files changed, 41 insertions(+), 8 deletions(-)

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 6a169794a674..dd6a43874192 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -580,6 +580,11 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 		err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb->extack);
 		if (err < 0)
 			return err;
+	} else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
+		struct rtmsg *rtm = nlmsg_data(nlh);
+
+		if (rtm->rtm_flags & RTM_F_PREFIX)
+			arg.filter.flags = RTM_F_PREFIX;
 	}
 
 	s_h = cb->args[0];
@@ -616,6 +621,10 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 		hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
 			if (e < s_e)
 				goto next;
+			if (arg.filter.table_id &&
+			    arg.filter.table_id != tb->tb6_id)
+				goto next;
+
 			res = fib6_dump_table(tb, skb, cb);
 			if (res != 0)
 				goto out;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index bf4cd647d8b8..8ed2e7462657 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -4763,28 +4763,52 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 	return -EMSGSIZE;
 }
 
+static bool fib6_info_uses_dev(const struct fib6_info *f6i,
+			       const struct net_device *dev)
+{
+	if (f6i->fib6_nh.nh_dev == dev)
+		return true;
+
+	if (f6i->fib6_nsiblings) {
+		struct fib6_info *sibling, *next_sibling;
+
+		list_for_each_entry_safe(sibling, next_sibling,
+					 &f6i->fib6_siblings, fib6_siblings) {
+			if (sibling->fib6_nh.nh_dev == dev)
+				return true;
+		}
+	}
+
+	return false;
+}
+
 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
 {
 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
+	struct fib_dump_filter *filter = &arg->filter;
+	unsigned int flags = NLM_F_MULTI;
 	struct net *net = arg->net;
 
 	if (rt == net->ipv6.fib6_null_entry)
 		return 0;
 
-	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
-		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
-
-		/* user wants prefix routes only */
-		if (rtm->rtm_flags & RTM_F_PREFIX &&
-		    !(rt->fib6_flags & RTF_PREFIX_RT)) {
-			/* success since this is not a prefix route */
+	if ((filter->flags & RTM_F_PREFIX) &&
+	    !(rt->fib6_flags & RTF_PREFIX_RT)) {
+		/* success since this is not a prefix route */
+		return 1;
+	}
+	if (filter->filter_set) {
+		if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
+		    (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
+		    (filter->protocol && rt->fib6_protocol != filter->protocol)) {
 			return 1;
 		}
+		flags |= NLM_F_DUMP_FILTERED;
 	}
 
 	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
 			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
-			     arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
+			     arg->cb->nlh->nlmsg_seq, flags);
 }
 
 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 7/9] net/mpls: Handle kernel side filtering of route dumps
From: David Ahern @ 2018-10-11 15:06 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern
In-Reply-To: <20181011150627.4010-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Update the dump request parsing in MPLS for the non-INET case to
enable kernel side filtering. If INET is disabled the other filters
that make sense for MPLS are protocol and nexthop device.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 net/mpls/af_mpls.c | 32 +++++++++++++++++++++++++++-----
 1 file changed, 27 insertions(+), 5 deletions(-)

diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 48f4cbd9fb38..b256de02251b 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -2043,7 +2043,9 @@ static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 				   struct fib_dump_filter *filter,
 				   struct netlink_ext_ack *extack)
 {
+	struct nlattr *tb[RTA_MAX + 1];
 	struct rtmsg *rtm;
+	int err, i;
 
 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
 		NL_SET_ERR_MSG_MOD(extack, "Invalid header for FIB dump request");
@@ -2052,15 +2054,35 @@ static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 
 	rtm = nlmsg_data(nlh);
 	if (rtm->rtm_dst_len || rtm->rtm_src_len  || rtm->rtm_tos   ||
-	    rtm->rtm_table   || rtm->rtm_protocol || rtm->rtm_scope ||
-	    rtm->rtm_type    || rtm->rtm_flags) {
+	    rtm->rtm_table   || rtm->rtm_scope    || rtm->rtm_type  ||
+	    rtm->rtm_flags) {
 		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for FIB dump request");
 		return -EINVAL;
 	}
 
-	if (nlmsg_attrlen(nlh, sizeof(*rtm))) {
-		NL_SET_ERR_MSG_MOD(extack, "Invalid data after header in FIB dump request");
-		return -EINVAL;
+	if (rtm->rtm_protocol) {
+		filter->protocol = rtm->rtm_protocol;
+		filter->filter_set = 1;
+	}
+
+	err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
+				 rtm_mpls_policy, extack);
+	if (err < 0)
+		return err;
+
+	for (i = 0; i <= RTA_MAX; ++i) {
+		int ifindex;
+
+		if (i == RTA_OIF) {
+			ifindex = nla_get_u32(tb[i]);
+			filter->dev = __dev_get_by_index(net, ifindex);
+			if (!filter->dev)
+				return -ENODEV;
+			filter->filter_set = 1;
+		} else if (tb[i]) {
+			NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request");
+			return -EINVAL;
+		}
 	}
 
 	return 0;
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 4/9] net/mpls: Plumb support for filtering route dumps
From: David Ahern @ 2018-10-11 15:06 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern
In-Reply-To: <20181011150627.4010-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Implement kernel side filtering of routes by egress device index and
protocol. MPLS uses only a single table and route type.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 net/mpls/af_mpls.c | 42 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 41 insertions(+), 1 deletion(-)

diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index bfcb4759c9ee..48f4cbd9fb38 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -2067,12 +2067,35 @@ static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 }
 #endif
 
+static bool mpls_rt_uses_dev(struct mpls_route *rt,
+			     const struct net_device *dev)
+{
+	struct net_device *nh_dev;
+
+	if (rt->rt_nhn == 1) {
+		struct mpls_nh *nh = rt->rt_nh;
+
+		nh_dev = rtnl_dereference(nh->nh_dev);
+		if (dev == nh_dev)
+			return true;
+	} else {
+		for_nexthops(rt) {
+			nh_dev = rtnl_dereference(nh->nh_dev);
+			if (nh_dev == dev)
+				return true;
+		} endfor_nexthops(rt);
+	}
+
+	return false;
+}
+
 static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	const struct nlmsghdr *nlh = cb->nlh;
 	struct net *net = sock_net(skb->sk);
 	struct mpls_route __rcu **platform_label;
 	struct fib_dump_filter filter = {};
+	unsigned int flags = NLM_F_MULTI;
 	size_t platform_labels;
 	unsigned int index;
 
@@ -2084,6 +2107,14 @@ static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb)
 		err = mpls_valid_fib_dump_req(net, nlh, &filter, cb->extack);
 		if (err < 0)
 			return err;
+
+		/* for MPLS, there is only 1 table with fixed type and flags.
+		 * If either are set in the filter then return nothing.
+		 */
+		if ((filter.table_id && filter.table_id != RT_TABLE_MAIN) ||
+		    (filter.rt_type && filter.rt_type != RTN_UNICAST) ||
+		     filter.flags)
+			return skb->len;
 	}
 
 	index = cb->args[0];
@@ -2092,15 +2123,24 @@ static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb)
 
 	platform_label = rtnl_dereference(net->mpls.platform_label);
 	platform_labels = net->mpls.platform_labels;
+
+	if (filter.filter_set)
+		flags |= NLM_F_DUMP_FILTERED;
+
 	for (; index < platform_labels; index++) {
 		struct mpls_route *rt;
+
 		rt = rtnl_dereference(platform_label[index]);
 		if (!rt)
 			continue;
 
+		if ((filter.dev && !mpls_rt_uses_dev(rt, filter.dev)) ||
+		    (filter.protocol && rt->rt_protocol != filter.protocol))
+			continue;
+
 		if (mpls_dump_route(skb, NETLINK_CB(cb->skb).portid,
 				    cb->nlh->nlmsg_seq, RTM_NEWROUTE,
-				    index, rt, NLM_F_MULTI) < 0)
+				    index, rt, flags) < 0)
 			break;
 	}
 	cb->args[0] = index;
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 8/9] net/ipv6: Bail early if user only wants cloned entries
From: David Ahern @ 2018-10-11 15:06 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern
In-Reply-To: <20181011150627.4010-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Similar to IPv4, IPv6 fib no longer contains cloned routes. If a user
requests a route dump for only cloned entries, no sense walking the FIB
and returning everything.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 net/ipv6/ip6_fib.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index dd6a43874192..0399fafc5136 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -583,10 +583,13 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 	} else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
 		struct rtmsg *rtm = nlmsg_data(nlh);
 
-		if (rtm->rtm_flags & RTM_F_PREFIX)
-			arg.filter.flags = RTM_F_PREFIX;
+		arg.filter.flags = rtm->rtm_flags & (RTM_F_PREFIX | RTM_F_CLONED);
 	}
 
+	/* fib entries are never clones */
+	if (arg.filter.flags & RTM_F_CLONED)
+		return skb->len;
+
 	s_h = cb->args[0];
 	s_e = cb->args[1];
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 6/9] net: Enable kernel side filtering of route dumps
From: David Ahern @ 2018-10-11 15:06 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern
In-Reply-To: <20181011150627.4010-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Update parsing of route dump request to enable kernel side filtering.
Allow filtering results by protocol (e.g., which routing daemon installed
the route), route type (e.g., unicast), table id and nexthop device. These
amount to the low hanging fruit, yet a huge improvement, for dumping
routes.

ip_valid_fib_dump_req is called with RTNL held, so __dev_get_by_index can
be used to look up the device index without taking a reference. From
there filter->dev is only used during dump loops with the lock still held.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 net/ipv4/fib_frontend.c | 45 ++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 40 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 1528b0919951..a99f2c7ba4e6 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -806,7 +806,11 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 			  struct fib_dump_filter *filter,
 			  struct netlink_ext_ack *extack)
 {
+	struct nlattr *tb[RTA_MAX + 1];
 	struct rtmsg *rtm;
+	int err, i;
+
+	ASSERT_RTNL();
 
 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
 		NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request");
@@ -815,8 +819,7 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 
 	rtm = nlmsg_data(nlh);
 	if (rtm->rtm_dst_len || rtm->rtm_src_len  || rtm->rtm_tos   ||
-	    rtm->rtm_table   || rtm->rtm_protocol || rtm->rtm_scope ||
-	    rtm->rtm_type) {
+	    rtm->rtm_scope) {
 		NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request");
 		return -EINVAL;
 	}
@@ -825,9 +828,41 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 		return -EINVAL;
 	}
 
-	if (nlmsg_attrlen(nlh, sizeof(*rtm))) {
-		NL_SET_ERR_MSG(extack, "Invalid data after header in FIB dump request");
-		return -EINVAL;
+	filter->flags    = rtm->rtm_flags;
+	filter->protocol = rtm->rtm_protocol;
+	filter->rt_type  = rtm->rtm_type;
+	filter->table_id = rtm->rtm_table;
+
+	err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
+				 rtm_ipv4_policy, extack);
+	if (err < 0)
+		return err;
+
+	for (i = 0; i <= RTA_MAX; ++i) {
+		int ifindex;
+
+		if (!tb[i])
+			continue;
+
+		switch (i) {
+		case RTA_TABLE:
+			filter->table_id = nla_get_u32(tb[i]);
+			break;
+		case RTA_OIF:
+			ifindex = nla_get_u32(tb[i]);
+			filter->dev = __dev_get_by_index(net, ifindex);
+			if (!filter->dev)
+				return -ENODEV;
+			break;
+		default:
+			NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request");
+			return -EINVAL;
+		}
+	}
+
+	if (filter->flags || filter->protocol || filter->rt_type ||
+	    filter->table_id || filter->dev) {
+		filter->filter_set = 1;
 	}
 
 	return 0;
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 5/9] net: Plumb support for filtering ipv4 and ipv6 multicast route dumps
From: David Ahern @ 2018-10-11 15:06 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern
In-Reply-To: <20181011150627.4010-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Implement kernel side filtering of routes by egress device index and
table id.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/linux/mroute_base.h |  5 +++--
 net/ipv4/ipmr.c             |  2 +-
 net/ipv4/ipmr_base.c        | 33 ++++++++++++++++++++++++++++++++-
 net/ipv6/ip6mr.c            |  2 +-
 4 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index 6675b9f81979..8fc516c47a64 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -7,6 +7,7 @@
 #include <net/net_namespace.h>
 #include <net/sock.h>
 #include <net/fib_notifier.h>
+#include <net/ip_fib.h>
 
 /**
  * struct vif_device - interface representor for multicast routing
@@ -290,7 +291,7 @@ int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
 				 struct sk_buff *skb,
 				 u32 portid, u32 seq, struct mr_mfc *c,
 				 int cmd, int flags),
-		     spinlock_t *lock);
+		     spinlock_t *lock, struct fib_dump_filter *filter);
 
 int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family,
 	    int (*rules_dump)(struct net *net,
@@ -340,7 +341,7 @@ mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
 			     struct sk_buff *skb,
 			     u32 portid, u32 seq, struct mr_mfc *c,
 			     int cmd, int flags),
-		 spinlock_t *lock)
+		 spinlock_t *lock, struct fib_dump_filter *filter)
 {
 	return -EINVAL;
 }
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 44d777058960..f6ad4ef1d3c7 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2539,7 +2539,7 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 	}
 
 	return mr_rtm_dumproute(skb, cb, ipmr_mr_table_iter,
-				_ipmr_fill_mroute, &mfc_unres_lock);
+				_ipmr_fill_mroute, &mfc_unres_lock, &filter);
 }
 
 static const struct nla_policy rtm_ipmr_policy[RTA_MAX + 1] = {
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index 1ad9aa62a97b..647300a55f42 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -268,6 +268,24 @@ int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 }
 EXPORT_SYMBOL(mr_fill_mroute);
 
+static bool mr_mfc_uses_dev(const struct mr_table *mrt,
+			    const struct mr_mfc *c,
+			    const struct net_device *dev)
+{
+	int ct;
+
+	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
+		if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
+			const struct vif_device *vif;
+
+			vif = &mrt->vif_table[ct];
+			if (vif->dev == dev)
+				return true;
+		}
+	}
+	return false;
+}
+
 int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
 		     struct mr_table *(*iter)(struct net *net,
 					      struct mr_table *mrt),
@@ -275,17 +293,26 @@ int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
 				 struct sk_buff *skb,
 				 u32 portid, u32 seq, struct mr_mfc *c,
 				 int cmd, int flags),
-		     spinlock_t *lock)
+		     spinlock_t *lock, struct fib_dump_filter *filter)
 {
 	unsigned int t = 0, e = 0, s_t = cb->args[0], s_e = cb->args[1];
 	struct net *net = sock_net(skb->sk);
 	struct mr_table *mrt;
 	struct mr_mfc *mfc;
 
+	/* multicast does not track protocol or have route type other
+	 * than RTN_MULTICAST
+	 */
+	if (filter->protocol || filter->flags ||
+	    (filter->rt_type && filter->rt_type != RTN_MULTICAST))
+		return 0;
+
 	rcu_read_lock();
 	for (mrt = iter(net, NULL); mrt; mrt = iter(net, mrt)) {
 		if (t < s_t)
 			goto next_table;
+		if (filter->table_id && filter->table_id != mrt->id)
+			goto next_table;
 		list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
 			if (e < s_e)
 				goto next_entry;
@@ -303,6 +330,10 @@ int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
 		list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
 			if (e < s_e)
 				goto next_entry2;
+			if (filter->dev &&
+			    !mr_mfc_uses_dev(mrt, mfc, filter->dev))
+				goto next_entry2;
+
 			if (fill(mrt, skb, NETLINK_CB(cb->skb).portid,
 				 cb->nlh->nlmsg_seq, mfc,
 				 RTM_NEWROUTE, NLM_F_MULTI) < 0) {
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index dbd5166c5599..a7593d1c372c 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -2470,5 +2470,5 @@ static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 	}
 
 	return mr_rtm_dumproute(skb, cb, ip6mr_mr_table_iter,
-				_ip6mr_fill_mroute, &mfc_unres_lock);
+				_ip6mr_fill_mroute, &mfc_unres_lock, &filter);
 }
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 1/9] net: Add struct for fib dump filter
From: David Ahern @ 2018-10-11 15:06 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern
In-Reply-To: <20181011150627.4010-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Add struct fib_dump_filter for options on limiting which routes are
returned in a dump request. The current list is table id, protocol,
route type, rtm_flags and nexthop device index. struct net is needed
to lookup the net_device from the index.

Plumb the new arguments from dump handlers to ip_valid_fib_dump_req.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/ip6_route.h |  1 +
 include/net/ip_fib.h    | 12 +++++++++++-
 net/ipv4/fib_frontend.c |  6 ++++--
 net/ipv4/ipmr.c         |  6 +++++-
 net/ipv6/ip6_fib.c      |  5 +++--
 net/ipv6/ip6mr.c        |  5 ++++-
 net/mpls/af_mpls.c      | 12 ++++++++----
 7 files changed, 36 insertions(+), 11 deletions(-)

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index cef186dbd2ce..7ab119936e69 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -174,6 +174,7 @@ struct rt6_rtnl_dump_arg {
 	struct sk_buff *skb;
 	struct netlink_callback *cb;
 	struct net *net;
+	struct fib_dump_filter filter;
 };
 
 int rt6_dump_route(struct fib6_info *f6i, void *p_arg);
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 9846b79c9ee1..9dde41ad02a1 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -222,6 +222,15 @@ struct fib_table {
 	unsigned long		__data[0];
 };
 
+struct fib_dump_filter {
+	bool			filter_set;
+	unsigned char		protocol;
+	unsigned char		rt_type;
+	u32			table_id;
+	unsigned int		flags;
+	struct net_device	*dev;
+};
+
 int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 		     struct fib_result *res, int fib_flags);
 int fib_table_insert(struct net *, struct fib_table *, struct fib_config *,
@@ -452,6 +461,7 @@ static inline void fib_proc_exit(struct net *net)
 
 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr);
 
-int ip_valid_fib_dump_req(const struct nlmsghdr *nlh,
+int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
+			  struct fib_dump_filter *filter,
 			  struct netlink_ext_ack *extack);
 #endif  /* _NET_FIB_H */
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 038f511c73fa..d0fb9b7efa27 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -802,7 +802,8 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
 	return err;
 }
 
-int ip_valid_fib_dump_req(const struct nlmsghdr *nlh,
+int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
+			  struct fib_dump_filter *filter,
 			  struct netlink_ext_ack *extack)
 {
 	struct rtmsg *rtm;
@@ -837,6 +838,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	const struct nlmsghdr *nlh = cb->nlh;
 	struct net *net = sock_net(skb->sk);
+	struct fib_dump_filter filter = {};
 	unsigned int h, s_h;
 	unsigned int e = 0, s_e;
 	struct fib_table *tb;
@@ -844,7 +846,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 	int dumped = 0, err;
 
 	if (cb->strict_check) {
-		err = ip_valid_fib_dump_req(nlh, cb->extack);
+		err = ip_valid_fib_dump_req(net, nlh, &filter, cb->extack);
 		if (err < 0)
 			return err;
 	}
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 91b0d5671649..44d777058960 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2527,9 +2527,13 @@ static int ipmr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 
 static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 {
+	struct fib_dump_filter filter = {};
+
 	if (cb->strict_check) {
-		int err = ip_valid_fib_dump_req(cb->nlh, cb->extack);
+		int err;
 
+		err = ip_valid_fib_dump_req(sock_net(skb->sk), cb->nlh,
+					    &filter, cb->extack);
 		if (err < 0)
 			return err;
 	}
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index e14d244c551f..6a169794a674 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -566,17 +566,18 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	const struct nlmsghdr *nlh = cb->nlh;
 	struct net *net = sock_net(skb->sk);
+	struct rt6_rtnl_dump_arg arg = {};
 	unsigned int h, s_h;
 	unsigned int e = 0, s_e;
-	struct rt6_rtnl_dump_arg arg;
 	struct fib6_walker *w;
 	struct fib6_table *tb;
 	struct hlist_head *head;
 	int res = 0;
 
 	if (cb->strict_check) {
-		int err = ip_valid_fib_dump_req(nlh, cb->extack);
+		int err;
 
+		err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb->extack);
 		if (err < 0)
 			return err;
 	}
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index d7563ef76518..dbd5166c5599 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -2458,10 +2458,13 @@ static void mrt6msg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt)
 static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	const struct nlmsghdr *nlh = cb->nlh;
+	struct fib_dump_filter filter = {};
 
 	if (cb->strict_check) {
-		int err = ip_valid_fib_dump_req(nlh, cb->extack);
+		int err;
 
+		err = ip_valid_fib_dump_req(sock_net(skb->sk), nlh,
+					    &filter, cb->extack);
 		if (err < 0)
 			return err;
 	}
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 5fe274c47c41..bfcb4759c9ee 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -2032,13 +2032,15 @@ static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event,
 }
 
 #if IS_ENABLED(CONFIG_INET)
-static int mpls_valid_fib_dump_req(const struct nlmsghdr *nlh,
+static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
+				   struct fib_dump_filter *filter,
 				   struct netlink_ext_ack *extack)
 {
-	return ip_valid_fib_dump_req(nlh, extack);
+	return ip_valid_fib_dump_req(net, nlh, filter, extack);
 }
 #else
-static int mpls_valid_fib_dump_req(const struct nlmsghdr *nlh,
+static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
+				   struct fib_dump_filter *filter,
 				   struct netlink_ext_ack *extack)
 {
 	struct rtmsg *rtm;
@@ -2070,14 +2072,16 @@ static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb)
 	const struct nlmsghdr *nlh = cb->nlh;
 	struct net *net = sock_net(skb->sk);
 	struct mpls_route __rcu **platform_label;
+	struct fib_dump_filter filter = {};
 	size_t platform_labels;
 	unsigned int index;
 
 	ASSERT_RTNL();
 
 	if (cb->strict_check) {
-		int err = mpls_valid_fib_dump_req(nlh, cb->extack);
+		int err;
 
+		err = mpls_valid_fib_dump_req(net, nlh, &filter, cb->extack);
 		if (err < 0)
 			return err;
 	}
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 0/9] net: Kernel side filtering for route dumps
From: David Ahern @ 2018-10-11 15:06 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern

From: David Ahern <dsahern@gmail.com>

Implement kernel side filtering of route dumps by protocol (e.g., which
routing daemon installed the route), route type (e.g., unicast), table
id and nexthop device.

iproute2 has been doing this filtering in userspace for years; pushing
the filters to the kernel side reduces the amount of data the kernel
sends and reduces wasted cycles on both sides processing unwanted data.
These initial options provide a huge improvement for efficiently
examining routes on large scale systems.

David Ahern (9):
  net: Add struct for fib dump filter
  net/ipv4: Plumb support for filtering route dumps
  net/ipv6: Plumb support for filtering route dumps
  net/mpls: Plumb support for filtering route dumps
  net: Plumb support for filtering ipv4 and ipv6 multicast route dumps
  net: Enable kernel side filtering of route dumps
  net/mpls: Handle kernel side filtering of route dumps
  net/ipv6: Bail early if user only wants cloned entries
  net/ipv4: Bail early if user only wants prefix entries

 include/linux/mroute_base.h |  5 +--
 include/net/ip6_route.h     |  1 +
 include/net/ip_fib.h        | 14 ++++++--
 net/ipv4/fib_frontend.c     | 64 +++++++++++++++++++++++++++------
 net/ipv4/fib_trie.c         | 37 +++++++++++++------
 net/ipv4/ipmr.c             |  8 +++--
 net/ipv4/ipmr_base.c        | 33 ++++++++++++++++-
 net/ipv6/ip6_fib.c          | 17 +++++++--
 net/ipv6/ip6mr.c            |  7 ++--
 net/ipv6/route.c            | 40 ++++++++++++++++-----
 net/mpls/af_mpls.c          | 86 +++++++++++++++++++++++++++++++++++++++------
 11 files changed, 262 insertions(+), 50 deletions(-)

-- 
2.11.0

^ permalink raw reply

* [PATCH net-next 2/9] net/ipv4: Plumb support for filtering route dumps
From: David Ahern @ 2018-10-11 15:06 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern
In-Reply-To: <20181011150627.4010-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Implement kernel side filtering of routes by table id, egress device index,
protocol and route type.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/ip_fib.h    |  2 +-
 net/ipv4/fib_frontend.c |  5 ++++-
 net/ipv4/fib_trie.c     | 37 ++++++++++++++++++++++++++-----------
 3 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 9dde41ad02a1..68967f4bd024 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -238,7 +238,7 @@ int fib_table_insert(struct net *, struct fib_table *, struct fib_config *,
 int fib_table_delete(struct net *, struct fib_table *, struct fib_config *,
 		     struct netlink_ext_ack *extack);
 int fib_table_dump(struct fib_table *table, struct sk_buff *skb,
-		   struct netlink_callback *cb);
+		   struct netlink_callback *cb, struct fib_dump_filter *filter);
 int fib_table_flush(struct net *net, struct fib_table *table);
 struct fib_table *fib_trie_unmerge(struct fib_table *main_tb);
 void fib_table_flush_external(struct fib_table *table);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index d0fb9b7efa27..1528b0919951 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -866,10 +866,13 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
 			if (e < s_e)
 				goto next;
+			if (filter.table_id && filter.table_id != tb->tb_id)
+				goto next;
+
 			if (dumped)
 				memset(&cb->args[2], 0, sizeof(cb->args) -
 						 2 * sizeof(cb->args[0]));
-			err = fib_table_dump(tb, skb, cb);
+			err = fib_table_dump(tb, skb, cb, &filter);
 			if (err < 0) {
 				if (likely(skb->len))
 					goto out;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 5bc0c89e81e4..237c9f72b265 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -2003,12 +2003,17 @@ void fib_free_table(struct fib_table *tb)
 }
 
 static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
-			     struct sk_buff *skb, struct netlink_callback *cb)
+			     struct sk_buff *skb, struct netlink_callback *cb,
+			     struct fib_dump_filter *filter)
 {
+	unsigned int flags = NLM_F_MULTI;
 	__be32 xkey = htonl(l->key);
 	struct fib_alias *fa;
 	int i, s_i;
 
+	if (filter->filter_set)
+		flags |= NLM_F_DUMP_FILTERED;
+
 	s_i = cb->args[4];
 	i = 0;
 
@@ -2016,25 +2021,35 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
 	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
 		int err;
 
-		if (i < s_i) {
-			i++;
-			continue;
-		}
+		if (i < s_i)
+			goto next;
 
-		if (tb->tb_id != fa->tb_id) {
-			i++;
-			continue;
+		if (tb->tb_id != fa->tb_id)
+			goto next;
+
+		if (filter->filter_set) {
+			if (filter->rt_type && fa->fa_type != filter->rt_type)
+				goto next;
+
+			if ((filter->protocol &&
+			     fa->fa_info->fib_protocol != filter->protocol))
+				goto next;
+
+			if (filter->dev &&
+			    !fib_info_nh_uses_dev(fa->fa_info, filter->dev))
+				goto next;
 		}
 
 		err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
 				    cb->nlh->nlmsg_seq, RTM_NEWROUTE,
 				    tb->tb_id, fa->fa_type,
 				    xkey, KEYLENGTH - fa->fa_slen,
-				    fa->fa_tos, fa->fa_info, NLM_F_MULTI);
+				    fa->fa_tos, fa->fa_info, flags);
 		if (err < 0) {
 			cb->args[4] = i;
 			return err;
 		}
+next:
 		i++;
 	}
 
@@ -2044,7 +2059,7 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
 
 /* rcu_read_lock needs to be hold by caller from readside */
 int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
-		   struct netlink_callback *cb)
+		   struct netlink_callback *cb, struct fib_dump_filter *filter)
 {
 	struct trie *t = (struct trie *)tb->tb_data;
 	struct key_vector *l, *tp = t->kv;
@@ -2057,7 +2072,7 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
 	while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
 		int err;
 
-		err = fn_trie_dump_leaf(l, tb, skb, cb);
+		err = fn_trie_dump_leaf(l, tb, skb, cb, filter);
 		if (err < 0) {
 			cb->args[3] = key;
 			cb->args[2] = count;
-- 
2.11.0

^ permalink raw reply related

* Re: [iproute2-next] tipc: support interface name when activating UDP bearer
From: Stephen Hemminger @ 2018-10-11 15:04 UTC (permalink / raw)
  To: Hoang Le; +Cc: jon.maloy, maloy, ying.xue, netdev, tipc-discussion
In-Reply-To: <20181011020708.7585-1-hoang.h.le@dektech.com.au>

On Thu, 11 Oct 2018 09:07:08 +0700
Hoang Le <hoang.h.le@dektech.com.au> wrote:

This looks fine.

> +static int cmd_bearer_validate_and_get_addr(const char *name, char *straddr)
> +{
> +	struct ifreq ifc;
> +	struct sockaddr_in *ip4addr;
> +	struct sockaddr_in6 *ip6addr;
> +	int fd = 0;
> +
> +	if (!name || !straddr)
> +		return 0;
> +
> +	fd = socket(PF_INET, SOCK_DGRAM, 0);

Will goahead and apply but minor nits.

The initialization of fd to zero is unnecessary.
This function is return 0, -1, or -EINVAL but only caller only cares about
zero or non-zero.

^ permalink raw reply

* RE: [PATCH net-next] hv_netvsc: fix vf serial matching with pci slot info
From: Haiyang Zhang @ 2018-10-11 22:30 UTC (permalink / raw)
  To: Haiyang Zhang, davem@davemloft.net, netdev@vger.kernel.org
  Cc: KY Srinivasan, Stephen Hemminger, olaf@aepfle.de, vkuznets,
	devel@linuxdriverproject.org, linux-kernel@vger.kernel.org
In-Reply-To: <20181011201434.30737-1-haiyangz@linuxonhyperv.com>



> -----Original Message-----
> From: Haiyang Zhang <haiyangz@linuxonhyperv.com>
> Sent: Thursday, October 11, 2018 4:15 PM
> To: davem@davemloft.net; netdev@vger.kernel.org
> Cc: Haiyang Zhang <haiyangz@microsoft.com>; KY Srinivasan
> <kys@microsoft.com>; Stephen Hemminger <sthemmin@microsoft.com>;
> olaf@aepfle.de; vkuznets <vkuznets@redhat.com>;
> devel@linuxdriverproject.org; linux-kernel@vger.kernel.org
> Subject: [PATCH net-next] hv_netvsc: fix vf serial matching with pci slot info
> 
> From: Haiyang Zhang <haiyangz@microsoft.com>
> 
> The VF device's serial number is saved as a string in PCI slot's kobj name, not
> the slot->number. This patch corrects the netvsc driver, so the VF device can be
> successfully paired with synthetic NIC.
> 
> Fixes: 00d7ddba1143 ("hv_netvsc: pair VF based on serial number")
> Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>

Thanks Stephen for the reminder -- I added the "reported-by" here:

Reported-by: Vitaly Kuznetsov <vkuznets@redhat.com>

^ permalink raw reply

* Re: [PATCH v2 1/2] net/ncsi: Add NCSI Broadcom OEM command
From: Vijay Khemka @ 2018-10-11 22:12 UTC (permalink / raw)
  To: Samuel Mendoza-Jonas, David S. Miller, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org
  Cc: openbmc @ lists . ozlabs . org, Justin.Lee1@Dell.com,
	joel@jms.id.au, linux-aspeed@lists.ozlabs.org
In-Reply-To: <7893ec3aa673b6009d00df1646b2820be2fc33a6.camel@mendozajonas.com>

Thanks Sam,
I will take care of this and generate new patch.

Regards
-Vijay

On 10/9/18, 9:40 PM, "Samuel Mendoza-Jonas" <sam@mendozajonas.com> wrote:

    On Tue, 2018-10-09 at 10:48 -0700, Vijay Khemka wrote:
    > This patch adds OEM Broadcom commands and response handling. It also
    > defines OEM Get MAC Address handler to get and configure the device.
    > 
    > ncsi_oem_gma_handler_bcm: This handler send NCSI broadcom command for
    > getting mac address.
    > ncsi_rsp_handler_oem_bcm: This handles response received for all
    > broadcom OEM commands.
    > ncsi_rsp_handler_oem_bcm_gma: This handles get mac address response and
    > set it to device.
    > 
    > Signed-off-by: Vijay Khemka <vijaykhemka@fb.com>
    > ---
    >  net/ncsi/Kconfig       |  6 ++++
    >  net/ncsi/internal.h    |  8 +++++
    >  net/ncsi/ncsi-manage.c | 70 +++++++++++++++++++++++++++++++++++++++++-
    >  net/ncsi/ncsi-pkt.h    |  8 +++++
    >  net/ncsi/ncsi-rsp.c    | 40 +++++++++++++++++++++++-
    >  5 files changed, 130 insertions(+), 2 deletions(-)
    > 
    
    <snip>
    
    >  static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
    >  {
    >  	struct ncsi_dev *nd = &ndp->ndev;
    > @@ -643,9 +676,10 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
    >  	struct ncsi_channel *nc = ndp->active_channel;
    >  	struct ncsi_channel *hot_nc = NULL;
    >  	struct ncsi_cmd_arg nca;
    > +	struct ncsi_oem_gma_handler *nch = NULL;
    >  	unsigned char index;
    >  	unsigned long flags;
    > -	int ret;
    > +	int ret, i;
    >  
    
    I just noticed that if CONFIG_NCSI_OEM_CMD_GET_MAC is not set then this
    generates unused variable warnings for i and nch. Otherwise all looking good!
    
    Regards,
    Sam
    
    
    


^ permalink raw reply

* RE: [RFC PATCH 2/2] net/ncsi: Configure multi-package, multi-channel modes with failover
From: Justin.Lee1 @ 2018-10-11 22:09 UTC (permalink / raw)
  To: sam, netdev; +Cc: davem, linux-kernel, openbmc
In-Reply-To: <8548f1872badb35588d5506da21f0d89bea71127.camel@mendozajonas.com>

Hi Sam,

Please see my comments below.

Thanks,
Justin
 
 
> On Wed, 2018-10-10 at 22:36 +0000, Justin.Lee1@Dell.com wrote:
> > Hi Samuel,
> > 
> > I am still testing your change and have some comments below.
> > 
> > Thanks,
> > Justin
> > 
> > 
> > > This patch extends the ncsi-netlink interface with two new commands and
> > > three new attributes to configure multiple packages and/or channels at
> > > once, and configure specific failover modes.
> > > 
> > > NCSI_CMD_SET_PACKAGE mask and NCSI_CMD_SET_CHANNEL_MASK set a whitelist
> > > of packages or channels allowed to be configured with the
> > > NCSI_ATTR_PACKAGE_MASK and NCSI_ATTR_CHANNEL_MASK attributes
> > > respectively. If one of these whitelists is set only packages or
> > > channels matching the whitelist are considered for the channel queue in
> > > ncsi_choose_active_channel().
> > > 
> > > These commands may also use the NCSI_ATTR_MULTI_FLAG to signal that
> > > multiple packages or channels may be configured simultaneously. NCSI
> > > hardware arbitration (HWA) must be available in order to enable
> > > multi-package mode. Multi-channel mode is always available.
> > > 
> > > If the NCSI_ATTR_CHANNEL_ID attribute is present in the
> > > NCSI_CMD_SET_CHANNEL_MASK command the it sets the preferred channel as
> > > with the NCSI_CMD_SET_INTERFACE command. The combination of preferred
> > > channel and channel whitelist defines a primary channel and the allowed
> > > failover channels.
> > > If the NCSI_ATTR_MULTI_FLAG attribute is also present then the preferred
> > > channel is configured for Tx/Rx and the other channels are enabled only
> > > for Rx.
> > > 
> > > Signed-off-by: Samuel Mendoza-Jonas <sam@mendozajonas.com>
> > > ---
> > >  include/uapi/linux/ncsi.h |  16 +++
> > >  net/ncsi/internal.h       |  11 +-
> > >  net/ncsi/ncsi-aen.c       |   2 +-
> > >  net/ncsi/ncsi-manage.c    | 138 ++++++++++++++++--------
> > >  net/ncsi/ncsi-netlink.c   | 217 +++++++++++++++++++++++++++++++++-----
> > >  net/ncsi/ncsi-rsp.c       |   2 +-
> > >  6 files changed, 312 insertions(+), 74 deletions(-)
> > > 
> > > diff --git a/include/uapi/linux/ncsi.h b/include/uapi/linux/ncsi.h
> > > index 4c292ecbb748..035fba1693f9 100644
> > > --- a/include/uapi/linux/ncsi.h
> > > +++ b/include/uapi/linux/ncsi.h
> > > @@ -23,6 +23,13 @@
> > >   *	optionally the preferred NCSI_ATTR_CHANNEL_ID.
> > >   * @NCSI_CMD_CLEAR_INTERFACE: clear any preferred package/channel combination.
> > >   *	Requires NCSI_ATTR_IFINDEX.
> > > + * @NCSI_CMD_SET_PACKAGE_MASK: set a whitelist of allowed packages.
> > > + * @NCSI_CMD_SET_PACKAGE_MASK: set a whitelist of allowed channels.
> > > + *	Requires NCSI_ATTR_IFINDEX and NCSI_ATTR_PACKAGE_MASK.
> > > + * @NCSI_CMD_SET_PACKAGE_MASK: set a whitelist of allowed channels.
> > > + *	Requires NCSI_ATTR_IFINDEX, NCSI_ATTR_PACKAGE_ID, and
> > > + *	NCSI_ATTR_CHANNEL_MASK. If NCSI_ATTR_CHANNEL_ID is present it sets
> > > + *	the primary channel.
> > >   * @NCSI_CMD_MAX: highest command number
> > >   */
> > 
> > There are some typo in the description.
> > * @NCSI_CMD_SET_PACKAGE_MASK: set a whitelist of allowed packages.
> >  *	Requires NCSI_ATTR_IFINDEX and NCSI_ATTR_PACKAGE_MASK.
> >  * @NCSI_CMD_SET_CHANNEL_MASK: set a whitelist of allowed channels.
> >  *	Requires NCSI_ATTR_IFINDEX, NCSI_ATTR_PACKAGE_ID, and
> >  *	NCSI_ATTR_CHANNEL_MASK. If NCSI_ATTR_CHANNEL_ID is present it sets
> >  *	the primary channel.
> 
> Haha, yes I threw that in at the end, thanks for catching it.
> 
> > 
> > >  enum ncsi_nl_commands {
> > > @@ -30,6 +37,8 @@ enum ncsi_nl_commands {
> > >  	NCSI_CMD_PKG_INFO,
> > >  	NCSI_CMD_SET_INTERFACE,
> > >  	NCSI_CMD_CLEAR_INTERFACE,
> > > +	NCSI_CMD_SET_PACKAGE_MASK,
> > > +	NCSI_CMD_SET_CHANNEL_MASK,
> > >  
> > >  	__NCSI_CMD_AFTER_LAST,
> > >  	NCSI_CMD_MAX = __NCSI_CMD_AFTER_LAST - 1
> > > @@ -43,6 +52,10 @@ enum ncsi_nl_commands {
> > >   * @NCSI_ATTR_PACKAGE_LIST: nested array of NCSI_PKG_ATTR attributes
> > >   * @NCSI_ATTR_PACKAGE_ID: package ID
> > >   * @NCSI_ATTR_CHANNEL_ID: channel ID
> > > + * @NCSI_ATTR_MULTI_FLAG: flag to signal that multi-mode should be enabled with
> > > + *	NCSI_CMD_SET_PACKAGE_MASK or NCSI_CMD_SET_CHANNEL_MASK.
> > > + * @NCSI_ATTR_PACKAGE_MASK: 32-bit mask of allowed packages.
> > > + * @NCSI_ATTR_CHANNEL_MASK: 32-bit mask of allowed channels.
> > >   * @NCSI_ATTR_MAX: highest attribute number
> > >   */
> > >  enum ncsi_nl_attrs {
> > > @@ -51,6 +64,9 @@ enum ncsi_nl_attrs {
> > >  	NCSI_ATTR_PACKAGE_LIST,
> > >  	NCSI_ATTR_PACKAGE_ID,
> > >  	NCSI_ATTR_CHANNEL_ID,
> > > +	NCSI_ATTR_MULTI_FLAG,
> > > +	NCSI_ATTR_PACKAGE_MASK,
> > > +	NCSI_ATTR_CHANNEL_MASK,
> > 
> > Is there a case that we might set these two masks at the same time?
> > If not, maybe we can just have one generic MASK attribute.
> > 
> 
> I thought of this too: not yet, but I wonder if we might in the future.
> I'll have a think about it.
> 
> > >  
> > >  	__NCSI_ATTR_AFTER_LAST,
> > >  	NCSI_ATTR_MAX = __NCSI_ATTR_AFTER_LAST - 1
> > > diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h
> > > index 3d0a33b874f5..8437474d0a78 100644
> > > --- a/net/ncsi/internal.h
> > > +++ b/net/ncsi/internal.h
> > > @@ -213,6 +213,10 @@ struct ncsi_package {
> > >  	unsigned int         channel_num; /* Number of channels     */
> > >  	struct list_head     channels;    /* List of chanels        */
> > >  	struct list_head     node;        /* Form list of packages  */
> > > +
> > > +	bool                 multi_channel; /* Enable multiple channels  */
> > > +	u32                  channel_whitelist; /* Channels to configure */
> > > +	struct ncsi_channel  *preferred_channel; /* Primary channel      */
> > >  };
> > >  
> > >  struct ncsi_request {
> > > @@ -280,8 +284,6 @@ struct ncsi_dev_priv {
> > >  	unsigned int        package_num;     /* Number of packages         */
> > >  	struct list_head    packages;        /* List of packages           */
> > >  	struct ncsi_channel *hot_channel;    /* Channel was ever active    */
> > > -	struct ncsi_package *force_package;  /* Force a specific package   */
> > > -	struct ncsi_channel *force_channel;  /* Force a specific channel   */
> > >  	struct ncsi_request requests[256];   /* Request table              */
> > >  	unsigned int        request_id;      /* Last used request ID       */
> > >  #define NCSI_REQ_START_IDX	1
> > > @@ -294,6 +296,9 @@ struct ncsi_dev_priv {
> > >  	struct list_head    node;            /* Form NCSI device list      */
> > >  #define NCSI_MAX_VLAN_VIDS	15
> > >  	struct list_head    vlan_vids;       /* List of active VLAN IDs */
> > > +
> > > +	bool                multi_package;   /* Enable multiple packages   */
> > > +	u32                 package_whitelist; /* Packages to configure    */
> > >  };
> > >  
> > >  struct ncsi_cmd_arg {
> > > @@ -345,6 +350,8 @@ struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp,
> > >  void ncsi_free_request(struct ncsi_request *nr);
> > >  struct ncsi_dev *ncsi_find_dev(struct net_device *dev);
> > >  int ncsi_process_next_channel(struct ncsi_dev_priv *ndp);
> > > +bool ncsi_channel_is_last(struct ncsi_dev_priv *ndp,
> > > +			  struct ncsi_channel *channel);
> > >  
> > >  /* Packet handlers */
> > >  u32 ncsi_calculate_checksum(unsigned char *data, int len);
> > > diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c
> > > index 65f47a648be3..eac56aee30c4 100644
> > > --- a/net/ncsi/ncsi-aen.c
> > > +++ b/net/ncsi/ncsi-aen.c
> > > @@ -86,7 +86,7 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp,
> > >  	    !(state == NCSI_CHANNEL_ACTIVE && !(data & 0x1)))
> > >  		return 0;
> > >  
> > > -	if (state == NCSI_CHANNEL_ACTIVE)
> > > +	if (state == NCSI_CHANNEL_ACTIVE && ncsi_channel_is_last(ndp, nc))
> > >  		ndp->flags |= NCSI_DEV_RESHUFFLE;
> > >  
> > >  	ncsi_stop_channel_monitor(nc);
> > > diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
> > > index 665bee25ec44..6a55df700bcb 100644
> > > --- a/net/ncsi/ncsi-manage.c
> > > +++ b/net/ncsi/ncsi-manage.c
> > > @@ -27,6 +27,24 @@
> > >  LIST_HEAD(ncsi_dev_list);
> > >  DEFINE_SPINLOCK(ncsi_dev_lock);
> > >  
> > > +/* Returns true if the given channel is the last channel available */
> > > +bool ncsi_channel_is_last(struct ncsi_dev_priv *ndp,
> > > +			  struct ncsi_channel *channel)
> > > +{
> > > +	struct ncsi_package *np;
> > > +	struct ncsi_channel *nc;
> > > +
> > > +	NCSI_FOR_EACH_PACKAGE(ndp, np)
> > > +		NCSI_FOR_EACH_CHANNEL(np, nc) {
> > > +			if (nc == channel)
> > > +				continue;
> > > +			if (nc->state == NCSI_CHANNEL_ACTIVE)
> > > +				return false;
> > > +		}
> > > +
> > > +	return true;
> > > +}
> > > +
> > >  static void ncsi_report_link(struct ncsi_dev_priv *ndp, bool force_down)
> > >  {
> > >  	struct ncsi_dev *nd = &ndp->ndev;
> > > @@ -266,6 +284,7 @@ struct ncsi_package *ncsi_add_package(struct ncsi_dev_priv *ndp,
> > >  	np->ndp = ndp;
> > >  	spin_lock_init(&np->lock);
> > >  	INIT_LIST_HEAD(&np->channels);
> > > +	np->channel_whitelist = UINT_MAX;
> > >  
> > >  	spin_lock_irqsave(&ndp->lock, flags);
> > >  	tmp = ncsi_find_package(ndp, id);
> > > @@ -633,6 +652,34 @@ static int set_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc,
> > >  	return 0;
> > >  }
> > >  
> > > +/* Determine if a given channel should be the Tx channel */
> > > +bool ncsi_channel_is_tx(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc)
> > > +{
> > > +	struct ncsi_package *np = nc->package;
> > > +	struct ncsi_channel *channel;
> > > +	struct ncsi_channel_mode *ncm;

Learn from Dave. All local variable declarations from longest to shortest
line. :)

> > > +
> > > +	NCSI_FOR_EACH_CHANNEL(np, channel) {
> > > +		ncm = &channel->modes[NCSI_MODE_TX_ENABLE];
> > > +		/* Another channel is already Tx */
> > > +		if (ncm->enable)
> > > +			return false;
> > > +	}
> > > +
> > > +	if (!np->preferred_channel)
> > > +		return true;
> > > +
> > > +	if (np->preferred_channel == nc)
> > > +		return true;
> > > +
> > > +	/* The preferred channel is not in the queue and not active */
> > > +	if (list_empty(&np->preferred_channel->link) &&
> > > +	    np->preferred_channel->state != NCSI_CHANNEL_ACTIVE)
> > > +		return true;
> > > +
> > > +	return false;
> > > +}
> > > +
> > >  static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
> > >  {
> > >  	struct ncsi_dev *nd = &ndp->ndev;
> > > @@ -745,18 +792,22 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
> > >  		} else if (nd->state == ncsi_dev_state_config_ebf) {
> > >  			nca.type = NCSI_PKT_CMD_EBF;
> > >  			nca.dwords[0] = nc->caps[NCSI_CAP_BC].cap;
> > > -			nd->state = ncsi_dev_state_config_ecnt;
> > > +			if (ncsi_channel_is_tx(ndp, nc))
> > > +				nd->state = ncsi_dev_state_config_ecnt;
> > > +			else
> > > +				nd->state = ncsi_dev_state_config_ec;
> > >  #if IS_ENABLED(CONFIG_IPV6)
> > >  			if (ndp->inet6_addr_num > 0 &&
> > >  			    (nc->caps[NCSI_CAP_GENERIC].cap &
> > >  			     NCSI_CAP_GENERIC_MC))
> > >  				nd->state = ncsi_dev_state_config_egmf;
> > > -			else
> > > -				nd->state = ncsi_dev_state_config_ecnt;
> > >  		} else if (nd->state == ncsi_dev_state_config_egmf) {
> > >  			nca.type = NCSI_PKT_CMD_EGMF;
> > >  			nca.dwords[0] = nc->caps[NCSI_CAP_MC].cap;
> > > -			nd->state = ncsi_dev_state_config_ecnt;
> > > +			if (ncsi_channel_is_tx(ndp, nc))
> > > +				nd->state = ncsi_dev_state_config_ecnt;
> > > +			else
> > > +				nd->state = ncsi_dev_state_config_ec;
> > >  #endif /* CONFIG_IPV6 */
> > >  		} else if (nd->state == ncsi_dev_state_config_ecnt) {
> > >  			nca.type = NCSI_PKT_CMD_ECNT;
> > > @@ -840,43 +891,35 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
> > >  
> > >  static int ncsi_choose_active_channel(struct ncsi_dev_priv *ndp)
> > >  {
> > > -	struct ncsi_package *np, *force_package;
> > > -	struct ncsi_channel *nc, *found, *hot_nc, *force_channel;
> > > +	struct ncsi_package *np;
> > > +	struct ncsi_channel *nc, *found, *hot_nc;
> > >  	struct ncsi_channel_mode *ncm;
> > > -	unsigned long flags;
> > > +	unsigned long flags, cflags;
> > > +	bool with_link;

All local variable declarations from longest to shortest
line.

> > >  
> > >  	spin_lock_irqsave(&ndp->lock, flags);
> > >  	hot_nc = ndp->hot_channel;
> > > -	force_channel = ndp->force_channel;
> > > -	force_package = ndp->force_package;
> > >  	spin_unlock_irqrestore(&ndp->lock, flags);
> > >  
> > > -	/* Force a specific channel whether or not it has link if we have been
> > > -	 * configured to do so
> > > -	 */
> > > -	if (force_package && force_channel) {
> > > -		found = force_channel;
> > > -		ncm = &found->modes[NCSI_MODE_LINK];
> > > -		if (!(ncm->data[2] & 0x1))
> > > -			netdev_info(ndp->ndev.dev,
> > > -				    "NCSI: Channel %u forced, but it is link down\n",
> > > -				    found->id);
> > > -		goto out;
> > > -	}
> > > -
> > > -	/* The search is done once an inactive channel with up
> > > -	 * link is found.
> > > +	/* By default the search is done once an inactive channel with up
> > > +	 * link is found, unless a preferred channel is set.
> > > +	 * If multi_package or multi_channel are configured all channels in the
> > > +	 * whitelist with link are added to the channel queue.
> > >  	 */
> > >  	found = NULL;
> > > +	with_link = false;
> > >  	NCSI_FOR_EACH_PACKAGE(ndp, np) {
> > > -		if (ndp->force_package && np != ndp->force_package)
> > > +		if (!(ndp->package_whitelist & (0x1 << np->id)))
> > >  			continue;
> > >  		NCSI_FOR_EACH_CHANNEL(np, nc) {
> > > -			spin_lock_irqsave(&nc->lock, flags);
> > > +			if (!(np->channel_whitelist & (0x1 << nc->id)))
> > > +				continue;
> > > +
> > > +			spin_lock_irqsave(&nc->lock, cflags);
> > >  
> > >  			if (!list_empty(&nc->link) ||
> > >  			    nc->state != NCSI_CHANNEL_INACTIVE) {
> > > -				spin_unlock_irqrestore(&nc->lock, flags);
> > > +				spin_unlock_irqrestore(&nc->lock, cflags);
> > >  				continue;
> > >  			}
> > >  
> > > @@ -888,32 +931,42 @@ static int ncsi_choose_active_channel(struct ncsi_dev_priv *ndp)
> > >  
> > >  			ncm = &nc->modes[NCSI_MODE_LINK];
> > >  			if (ncm->data[2] & 0x1) {
> > 
> > This data will not be updated if the channel monitor for it is not running.
> > If I move the cable from the current configured channel to the other channel,
> > NC-SI module will not detect the link status as the other channel is not configured
> > and AEN will not happen.
> > Is it per design that NC-SI module will always use the first interface with the link?
> 
> We'll check the link state of every channel at init, and per-package on
> suspend events, but you're right that we only get AENs right now from
> configured channels. There's probably no reason why we could at least
> enable AENs on all channels even if we don't use them for network, maybe
> during the package probe step.
> 

To receive the AEN packet, the channel (RX) needs to be enabled.
It seems that we need to send Get Link Status command to all channels before choosing
The active channel.

> > 
> > > -				spin_unlock_irqrestore(&nc->lock, flags);
> > >  				found = nc;
> > > -				goto out;
> > > +				with_link = true;
> > > +
> > > +				spin_lock_irqsave(&ndp->lock, flags);
> > > +				list_add_tail_rcu(&found->link,
> > > +						  &ndp->channel_queue);
> > > +				spin_unlock_irqrestore(&ndp->lock, flags);
> > > +
> > > +				netdev_dbg(ndp->ndev.dev,
> > > +					   "NCSI: Channel %u added to queue (link %s)\n",
> > > +					   found->id,
> > > +					   ncm->data[2] & 0x1 ? "up" : "down");
> > >  			}

If multi_channel is enabled, the interface without link still needs to be processed.
Something likes below?
			} else if (np->multi_channel) {
				found = nc;

				spin_lock_irqsave(&ndp->lock, flags);
				list_add_tail_rcu(&found->link,
						  &ndp->channel_queue);
				spin_unlock_irqrestore(&ndp->lock, flags);

				netdev_dbg(ndp->ndev.dev,
					   "NCSI: pkg %u ch %u added to queue (link %s)\n",
					   found->package->id,
					   found->id,
					   ncm->data[2] & 0x1 ? "up" : "down");
			}

> > > +			spin_unlock_irqrestore(&nc->lock, cflags);
> > >  
> > > -			spin_unlock_irqrestore(&nc->lock, flags);
> > > +			if (with_link && !np->multi_channel)
> > > +				break;
> > >  		}
> > > +		if (with_link && !ndp->multi_package)
> > > +			break;
> > >  	}
> > >  
> > > -	if (!found) {
> > > +	if (!with_link && found) {
> > > +		netdev_info(ndp->ndev.dev,
> > > +			    "NCSI: No channel with link found, configuring channel %u\n",
> > > +			    found->id);
> > > +		spin_lock_irqsave(&ndp->lock, flags);
> > > +		list_add_tail_rcu(&found->link, &ndp->channel_queue);
> > > +		spin_unlock_irqrestore(&ndp->lock, flags);
> > > +	} else if (!found) {
> > >  		netdev_warn(ndp->ndev.dev,
> > > -			    "NCSI: No channel found with link\n");
> > > +			    "NCSI: No channel found to configure!\n");
> > >  		ncsi_report_link(ndp, true);
> > >  		return -ENODEV;
> > >  	}
> > >  
> > > -	ncm = &found->modes[NCSI_MODE_LINK];
> > > -	netdev_dbg(ndp->ndev.dev,
> > > -		   "NCSI: Channel %u added to queue (link %s)\n",
> > > -		   found->id, ncm->data[2] & 0x1 ? "up" : "down");
> > > -
> > > -out:
> > > -	spin_lock_irqsave(&ndp->lock, flags);
> > > -	list_add_tail_rcu(&found->link, &ndp->channel_queue);
> > > -	spin_unlock_irqrestore(&ndp->lock, flags);
> > > -
> > >  	return ncsi_process_next_channel(ndp);
> > >  }
> > >  
> > > @@ -1428,6 +1481,7 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev,
> > >  	INIT_LIST_HEAD(&ndp->channel_queue);
> > >  	INIT_LIST_HEAD(&ndp->vlan_vids);
> > >  	INIT_WORK(&ndp->work, ncsi_dev_work);
> > > +	ndp->package_whitelist = UINT_MAX;
> > >  
> > >  	/* Initialize private NCSI device */
> > >  	spin_lock_init(&ndp->lock);
> > > diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c
> > > index 32cb7751d216..33a091e6f466 100644
> > > --- a/net/ncsi/ncsi-netlink.c
> > > +++ b/net/ncsi/ncsi-netlink.c
> > 
> > Is the following missed in the patch?
> > static const struct nla_policy ncsi_genl_policy[NCSI_ATTR_MAX + 1] = {
> > ...
> > 	[NCSI_ATTR_MULTI_FLAG] =	{ .type = NLA_FLAG },
> > 	[NCSI_ATTR_PACKAGE_MASK] =	{ .type = NLA_U32 },
> > 	[NCSI_ATTR_CHANNEL_MASK] =	{ .type = NLA_U32 },
> 
> Oops, added.
> 
> > 
> > > @@ -67,7 +67,7 @@ static int ncsi_write_channel_info(struct sk_buff *skb,
> > >  	nla_put_u32(skb, NCSI_CHANNEL_ATTR_LINK_STATE, m->data[2]);
> > >  	if (nc->state == NCSI_CHANNEL_ACTIVE)
> > >  		nla_put_flag(skb, NCSI_CHANNEL_ATTR_ACTIVE);
> > > -	if (ndp->force_channel == nc)
> > > +	if (nc == nc->package->preferred_channel)
> > >  		nla_put_flag(skb, NCSI_CHANNEL_ATTR_FORCED);
> > >  
> > >  	nla_put_u32(skb, NCSI_CHANNEL_ATTR_VERSION_MAJOR, nc->version.version);
> > > @@ -112,7 +112,7 @@ static int ncsi_write_package_info(struct sk_buff *skb,
> > >  		if (!pnest)
> > >  			return -ENOMEM;
> > >  		nla_put_u32(skb, NCSI_PKG_ATTR_ID, np->id);
> > > -		if (ndp->force_package == np)
> > > +		if ((0x1 << np->id) == ndp->package_whitelist)
> > >  			nla_put_flag(skb, NCSI_PKG_ATTR_FORCED);
> > >  		cnest = nla_nest_start(skb, NCSI_PKG_ATTR_CHANNEL_LIST);
> > >  		if (!cnest) {
> > > @@ -288,45 +288,54 @@ static int ncsi_set_interface_nl(struct sk_buff *msg, struct genl_info *info)
> > >  	package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]);
> > >  	package = NULL;
> > >  
> > > -	spin_lock_irqsave(&ndp->lock, flags);
> > > -
> > >  	NCSI_FOR_EACH_PACKAGE(ndp, np)
> > >  		if (np->id == package_id)
> > >  			package = np;
> > >  	if (!package) {
> > >  		/* The user has set a package that does not exist */
> > > -		spin_unlock_irqrestore(&ndp->lock, flags);
> > >  		return -ERANGE;
> > >  	}
> > >  
> > >  	channel = NULL;
> > > -	if (!info->attrs[NCSI_ATTR_CHANNEL_ID]) {
> > > -		/* Allow any channel */
> > > -		channel_id = NCSI_RESERVED_CHANNEL;
> > > -	} else {
> > > +	if (info->attrs[NCSI_ATTR_CHANNEL_ID]) {
> > >  		channel_id = nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_ID]);
> > >  		NCSI_FOR_EACH_CHANNEL(package, nc)
> > > -			if (nc->id == channel_id)
> > > +			if (nc->id == channel_id) {
> > >  				channel = nc;
> > > +				break;
> > > +			}
> > > +		if (!channel) {
> > > +			netdev_info(ndp->ndev.dev,
> > > +				    "NCSI: Channel %u does not exist!\n",
> > > +				    channel_id);
> > > +			return -ERANGE;
> > > +		}
> > >  	}
> > >  
> > > -	if (channel_id != NCSI_RESERVED_CHANNEL && !channel) {
> > > -		/* The user has set a channel that does not exist on this
> > > -		 * package
> > > -		 */
> > > -		spin_unlock_irqrestore(&ndp->lock, flags);
> > > -		netdev_info(ndp->ndev.dev, "NCSI: Channel %u does not exist!\n",
> > > -			    channel_id);
> > > -		return -ERANGE;
> > > -	}
> > > -
> > > -	ndp->force_package = package;
> > > -	ndp->force_channel = channel;
> > > +	spin_lock_irqsave(&ndp->lock, flags);
> > > +	ndp->package_whitelist = 0x1 << package->id;
> > > +	ndp->multi_package = false;
> > >  	spin_unlock_irqrestore(&ndp->lock, flags);
> > >  
> > > -	netdev_info(ndp->ndev.dev, "Set package 0x%x, channel 0x%x%s as preferred\n",
> > > -		    package_id, channel_id,
> > > -		    channel_id == NCSI_RESERVED_CHANNEL ? " (any)" : "");
> > > +	spin_lock_irqsave(&package->lock, flags);
> > > +	package->multi_channel = false;
> > > +	if (channel) {
> > > +		package->channel_whitelist = 0x1 << channel->id;
> > > +		package->preferred_channel = channel;
> > > +	} else {
> > > +		/* Allow any channel */
> > > +		package->channel_whitelist = UINT_MAX;
> > > +		package->preferred_channel = NULL;
> > > +	}
> > > +	spin_unlock_irqrestore(&package->lock, flags);
> > > +
> > > +	if (channel)
> > > +		netdev_info(ndp->ndev.dev,
> > > +			    "Set package 0x%x, channel 0x%x as preferred\n",
> > > +			    package_id, channel_id);
> > > +	else
> > > +		netdev_info(ndp->ndev.dev, "Set package 0x%x as preferred\n",
> > > +			    package_id);
> > >  
> > >  	/* Bounce the NCSI channel to set changes */
> > >  	ncsi_stop_dev(&ndp->ndev);
> > > @@ -338,6 +347,7 @@ static int ncsi_set_interface_nl(struct sk_buff *msg, struct genl_info *info)
> > >  static int ncsi_clear_interface_nl(struct sk_buff *msg, struct genl_info *info)
> > >  {
> > >  	struct ncsi_dev_priv *ndp;
> > > +	struct ncsi_package *np;
> > >  	unsigned long flags;
> > >  
> > >  	if (!info || !info->attrs)
> > > @@ -351,11 +361,19 @@ static int ncsi_clear_interface_nl(struct sk_buff *msg, struct genl_info *info)
> > >  	if (!ndp)
> > >  		return -ENODEV;
> > >  
> > > -	/* Clear any override */
> > > +	/* Reset any whitelists and disable multi mode */
> > >  	spin_lock_irqsave(&ndp->lock, flags);
> > > -	ndp->force_package = NULL;
> > > -	ndp->force_channel = NULL;
> > > +	ndp->package_whitelist = UINT_MAX;
> > > +	ndp->multi_package = false;
> > >  	spin_unlock_irqrestore(&ndp->lock, flags);
> > > +
> > > +	NCSI_FOR_EACH_PACKAGE(ndp, np) {
> > > +		spin_lock_irqsave(&np->lock, flags);
> > > +		np->multi_channel = false;
> > > +		np->channel_whitelist = UINT_MAX;
> > > +		np->preferred_channel = NULL;
> > > +		spin_unlock_irqrestore(&np->lock, flags);
> > > +	}
> > >  	netdev_info(ndp->ndev.dev, "NCSI: Cleared preferred package/channel\n");
> > >  
> > >  	/* Bounce the NCSI channel to set changes */
> > > @@ -365,6 +383,137 @@ static int ncsi_clear_interface_nl(struct sk_buff *msg, struct genl_info *info)
> > >  	return 0;
> > >  }
> > >  
> > > +static int ncsi_set_package_mask_nl(struct sk_buff *msg,
> > > +				    struct genl_info *info)
> > > +{
> > > +	struct ncsi_dev_priv *ndp;
> > > +	unsigned long flags;
> > > +	int rc;
> > > +
> > > +	if (!info || !info->attrs)
> > > +		return -EINVAL;
> > > +
> > > +	if (!info->attrs[NCSI_ATTR_IFINDEX])
> > > +		return -EINVAL;
> > > +
> > > +	if (!info->attrs[NCSI_ATTR_PACKAGE_MASK])
> > > +		return -EINVAL;
> > > +
> > > +	ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)),
> > > +			       nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX]));
> > > +	if (!ndp)
> > > +		return -ENODEV;
> > > +
> > > +	spin_lock_irqsave(&ndp->lock, flags);
> > > +	ndp->package_whitelist =
> > > +		nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_MASK]);
> > > +
> > > +	if (nla_get_flag(info->attrs[NCSI_ATTR_MULTI_FLAG])) {
> > > +		if (ndp->flags & NCSI_DEV_HWA) {
> > > +			ndp->multi_package = true;
> > > +			rc = 0;
> > > +		} else {
> > > +			netdev_err(ndp->ndev.dev,
> > > +				   "NCSI: Can't use multiple packages without HWA\n");
> > > +			rc = -EPERM;
> > > +		}
> > > +	} else {
> > > +		rc = 0;
> > > +	}

If the flag is not set, do we need to clear the multi_package flag? It is possible that it is
user's intension to disable the multi-mode.
	} else {
		ndp->multi_package = false;
		rc = 0;
	}

> > > +
> > > +	spin_unlock_irqrestore(&ndp->lock, flags);
> > > +
> > > +	if (!rc) {
> > > +		/* Bounce the NCSI channel to set changes */
> > > +		ncsi_stop_dev(&ndp->ndev);
> > > +		ncsi_start_dev(&ndp->ndev);
> > 
> > Is it possible to delay the restart? If we have two packages, we might send
> > set_package_mask command once and set_channel_mask command twice.
> > We will see the unnecessary reconfigurations in a very short period time.
> 
> We could probably do with a better way of bouncing the link here too. I
> suppose we could schedule the actual link 'bounce' to occur a second or
> two later, and delay if we receive new configuration in that time.
> Perhaps something outside of the scope of this patch though.
> 
> > 
> > > +	}
> > > +
> > > +	return rc;
> > > +}
> > > +
> > > +static int ncsi_set_channel_mask_nl(struct sk_buff *msg,
> > > +				    struct genl_info *info)
> > > +{
> > > +	struct ncsi_package *np, *package;
> > > +	struct ncsi_channel *nc, *channel;
> > > +	struct ncsi_dev_priv *ndp;
> > > +	unsigned long flags;
> > > +	u32 package_id, channel_id;

All local variable declarations from longest to shortest
line.

> > > +
> > > +	if (!info || !info->attrs)
> > > +		return -EINVAL;
> > > +
> > > +	if (!info->attrs[NCSI_ATTR_IFINDEX])
> > > +		return -EINVAL;
> > > +
> > > +	if (!info->attrs[NCSI_ATTR_PACKAGE_ID])
> > > +		return -EINVAL;
> > > +
> > > +	if (!info->attrs[NCSI_ATTR_CHANNEL_MASK])
> > > +		return -EINVAL;
> > > +
> > > +	ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)),
> > > +			       nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX]));
> > > +	if (!ndp)
> > > +		return -ENODEV;
> > > +
> > > +	package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]);
> > > +	package = NULL;
> > > +	NCSI_FOR_EACH_PACKAGE(ndp, np)
> > > +		if (np->id == package_id) {
> > > +			package = np;
> > > +			break;
> > > +		}
> > > +	if (!package)
> > > +		return -ERANGE;
> > > +
> > > +	spin_lock_irqsave(&package->lock, flags);
> > > +
> > > +	channel = NULL;
> > > +	if (info->attrs[NCSI_ATTR_CHANNEL_ID]) {
> > > +		channel_id = nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_ID]);
> > > +		NCSI_FOR_EACH_CHANNEL(np, nc)
> > > +			if (nc->id == channel_id) {
> > > +				channel = nc;
> > > +				break;
> > > +			}
> > > +		if (!channel) {
> > > +			spin_unlock_irqrestore(&package->lock, flags);
> > > +			return -ERANGE;
> > > +		}
> > > +		netdev_dbg(ndp->ndev.dev,
> > > +			   "NCSI: Channel %u set as preferred channel\n",
> > > +			   channel->id);
> > > +	}
> > > +
> > > +	package->channel_whitelist =
> > > +		nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_MASK]);
> > > +	if (package->channel_whitelist == 0)
> > > +		netdev_dbg(ndp->ndev.dev,
> > > +			   "NCSI: Package %u set to all channels disabled\n",
> > > +			   package->id);
> > > +
> > > +	package->preferred_channel = channel;
> > > +
> > > +	if (nla_get_flag(info->attrs[NCSI_ATTR_MULTI_FLAG])) {
> > > +		package->multi_channel = true;
> > > +		netdev_info(ndp->ndev.dev,
> > > +			    "NCSI: Multi-channel enabled on package %u\n",
> > > +			    package_id);
> > > +	} else {
> > > +		package->multi_channel = false;
> > > +	}
> > > +
> > > +	spin_unlock_irqrestore(&package->lock, flags);
> > > +
> > > +	/* Bounce the NCSI channel to set changes */
> > > +	ncsi_stop_dev(&ndp->ndev);
> > > +	ncsi_start_dev(&ndp->ndev);
> > 
> > Same question as set_package_mask function.
> > Is it possible to delay the restart? If we have two packages, we might send
> > set_package_mask command once and set_channel_mask command twice.
> > We will see the unnecessary reconfigurations in a very short period time.
> > 
> > > +
> > > +	return 0;
> > > +}
> > > +
> > >  static const struct genl_ops ncsi_ops[] = {
> > >  	{
> > >  		.cmd = NCSI_CMD_PKG_INFO,
> > > @@ -385,6 +534,18 @@ static const struct genl_ops ncsi_ops[] = {
> > >  		.doit = ncsi_clear_interface_nl,
> > >  		.flags = GENL_ADMIN_PERM,
> > >  	},
> > > +	{
> > > +		.cmd = NCSI_CMD_SET_PACKAGE_MASK,
> > > +		.policy = ncsi_genl_policy,
> > > +		.doit = ncsi_set_package_mask_nl,
> > > +		.flags = GENL_ADMIN_PERM,
> > > +	},
> > > +	{
> > > +		.cmd = NCSI_CMD_SET_CHANNEL_MASK,
> > > +		.policy = ncsi_genl_policy,
> > > +		.doit = ncsi_set_channel_mask_nl,
> > > +		.flags = GENL_ADMIN_PERM,
> > > +	},
> > >  };
> > >  
> > >  static struct genl_family ncsi_genl_family __ro_after_init = {
> > > diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c
> > > index d66b34749027..02ce7626b579 100644
> > > --- a/net/ncsi/ncsi-rsp.c
> > > +++ b/net/ncsi/ncsi-rsp.c
> > > @@ -241,7 +241,7 @@ static int ncsi_rsp_handler_dcnt(struct ncsi_request *nr)
> > >  	if (!ncm->enable)
> > >  		return 0;
> > >  
> > > -	ncm->enable = 1;
> > > +	ncm->enable = 0;
> > >  	return 0;
> > >  }
> > >  
> > > -- 
> > > 2.19.0
> > 
> > 


^ permalink raw reply

* [PATCH net] net: bcmgenet: Poll internal PHY for GENETv5
From: Florian Fainelli @ 2018-10-11 22:06 UTC (permalink / raw)
  To: netdev; +Cc: Florian Fainelli, Doug Berger, David S. Miller, open list

On GENETv5, there is a hardware issue which prevents the GENET hardware
from generating a link UP interrupt when the link is operating at
10Mbits/sec. Since we do not have any way to configure the link
detection logic, fallback to polling in that case.

Fixes: 421380856d9c ("net: bcmgenet: add support for the GENETv5 hardware")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/ethernet/broadcom/genet/bcmmii.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/genet/bcmmii.c b/drivers/net/ethernet/broadcom/genet/bcmmii.c
index 4241ae928d4a..34af5f1569c8 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmmii.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmmii.c
@@ -321,9 +321,12 @@ int bcmgenet_mii_probe(struct net_device *dev)
 	phydev->advertising = phydev->supported;
 
 	/* The internal PHY has its link interrupts routed to the
-	 * Ethernet MAC ISRs
+	 * Ethernet MAC ISRs. On GENETv5 there is a hardware issue
+	 * that prevents the signaling of link UP interrupts when
+	 * the link operates at 10Mbps, so fallback to polling for
+	 * those versions of GENET.
 	 */
-	if (priv->internal_phy)
+	if (priv->internal_phy && !GENET_IS_V5(priv))
 		dev->phydev->irq = PHY_IGNORE_INTERRUPT;
 
 	return 0;
-- 
2.17.1

^ permalink raw reply related

* Re: [virtio-dev] Re: [PATCH net-next v2 0/5] virtio: support packed ring
From: Tiwei Bie @ 2018-10-11 14:13 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Jason Wang, virtualization, linux-kernel, netdev, virtio-dev,
	wexu, jfreimann, maxime.coquelin, zhihong.wang
In-Reply-To: <20181011094705-mutt-send-email-mst@kernel.org>

On Thu, Oct 11, 2018 at 09:48:48AM -0400, Michael S. Tsirkin wrote:
> On Thu, Oct 11, 2018 at 08:12:21PM +0800, Tiwei Bie wrote:
> > > > But if it's not too late, I second for a OUT_OF_ORDER feature.
> > > > Starting from in order can have much simpler code in driver.
> > > > 
> > > > Thanks
> > > 
> > > It's tricky to change the flag polarity because of compatibility
> > > with legacy interfaces. Why is this such a big deal?
> > > 
> > > Let's teach drivers about IN_ORDER, then if devices
> > > are in order it will get enabled by default.
> > 
> > Yeah, make sense.
> > 
> > Besides, I have done some further profiling and debugging
> > both in kernel driver and DPDK vhost. Previously I was mislead
> > by a bug in vhost code. I will send a patch to fix that bug.
> > With that bug fixed, the performance of packed ring in the
> > test between kernel driver and DPDK vhost is better now.
> 
> OK, if we get a performance gain on the virtio side, we can finally
> upstream it. If you see that please re-post ASAP so we can
> put it in the next kernel release.

Got it, I will re-post ASAP.

Thanks!


> 
> -- 
> MST

^ permalink raw reply

* Re: Possible bug in traffic control?
From: Josh Coombs @ 2018-10-11 14:05 UTC (permalink / raw)
  To: xiyou.wangcong; +Cc: netdev
In-Reply-To: <CAM_iQpUqwM8vP+_qxXHJnEZ0AttBWF4nTckRiP0nY=Eq+=gUpg@mail.gmail.com>

I'm actually leaning towards macsec now.  I'm at 6TB transferred in a
double hop, no macsec over the bridge setup without triggering the
fault.  I'm going to let it continue to churn and setup a second
testbed that JUST uses macsec without traffic control bridging to see
if I can trip the issue there.    That should determine if it's macsec
itself, or an interaction between macsec and traffic control.

Joshua Coombs
GWI

office 207-494-2140
www.gwi.net

On Wed, Oct 10, 2018 at 12:39 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>
> On Wed, Oct 10, 2018 at 8:54 AM Josh Coombs <jcoombs@staff.gwi.net> wrote:
> >
> > 2.3 billion 1 byte packets failed to re-create the bug.  To try and
> > simplify the setup I removed macsec from the equation, using a single
> > host in the middle as the bridge.  Interestingly, rather than 1.3Gbits
> > a second in both directions, it ran around 8Mbits a second.  Switching
> > the filter from u32 to matchall didn't change the performance.  Going
> > back to the four machine test bed, again removing macsec and just
> > bridging through radically decreased the throughput to around 8Mbits.
> > Flip on macsec for the bridge and 1.3Gbits?
>
> This is a great narrow down! We can rule out macsec for guilty.
>
> Can you share a minimum reproducer for this problem? If so I can take
> a look.

^ permalink raw reply

* [PATCH net] rxrpc: use correct kvec num when sending BUSY response packet
From: David Howells @ 2018-10-11 21:32 UTC (permalink / raw)
  Cc: netdev, dhowells, linux-afs, linux-kernel

From: YueHaibing <yuehaibing@huawei.com>

Fixes gcc '-Wunused-but-set-variable' warning:

net/rxrpc/output.c: In function 'rxrpc_reject_packets':
net/rxrpc/output.c:527:11: warning:
 variable 'ioc' set but not used [-Wunused-but-set-variable]

'ioc' is the correct kvec num when sending a BUSY (or an ABORT) response
packet.

Fixes: ece64fec164f ("rxrpc: Emit BUSY packets when supposed to rather than ABORTs")
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: David Howells <dhowells@redhat.com>
---

 net/rxrpc/output.c |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
index e8fb8922bca8..a141ee3ab812 100644
--- a/net/rxrpc/output.c
+++ b/net/rxrpc/output.c
@@ -572,7 +572,8 @@ void rxrpc_reject_packets(struct rxrpc_local *local)
 			whdr.flags	^= RXRPC_CLIENT_INITIATED;
 			whdr.flags	&= RXRPC_CLIENT_INITIATED;
 
-			ret = kernel_sendmsg(local->socket, &msg, iov, 2, size);
+			ret = kernel_sendmsg(local->socket, &msg,
+					     iov, ioc, size);
 			if (ret < 0)
 				trace_rxrpc_tx_fail(local->debug_id, 0, ret,
 						    rxrpc_tx_point_reject);

^ permalink raw reply related

* [PATCH net] rxrpc: Fix an uninitialised variable
From: David Howells @ 2018-10-11 21:32 UTC (permalink / raw)
  Cc: netdev, dhowells, linux-afs, linux-kernel

Fix an uninitialised variable introduced by the last patch.  This can cause
a crash when a new call comes in to a local service, such as when an AFS
fileserver calls back to the local cache manager.

Fixes: c1e15b4944c9 ("rxrpc: Fix the packet reception routine")
Signed-off-by: David Howells <dhowells@redhat.com>
---

 net/rxrpc/call_accept.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index 652e314de38e..8079aacaecac 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -337,7 +337,7 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local,
 {
 	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
 	struct rxrpc_connection *conn;
-	struct rxrpc_peer *peer;
+	struct rxrpc_peer *peer = NULL;
 	struct rxrpc_call *call;
 
 	_enter("");

^ permalink raw reply related

* [PATCH bpf-next 2/2] bpf, libbpf: simplify perf RB walk and do incremental updates
From: Daniel Borkmann @ 2018-10-11 14:02 UTC (permalink / raw)
  To: alexei.starovoitov; +Cc: netdev, Daniel Borkmann
In-Reply-To: <20181011140207.27602-1-daniel@iogearbox.net>

Clean up and improve bpf_perf_event_read_simple() ring walk a bit
to use similar tail update scheme as in perf and bcc allowing the
kernel to make forward progress not only after full timely walk.
Also few other improvements to use realloc() instead of free() and
malloc() combination and for the callback use proper perf_event_header
instead of void pointer, so that real applications can use container_of()
macro with proper type checking.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/map_perf_ring.c           | 10 ++--
 tools/lib/bpf/libbpf.c                      | 77 +++++++++++++----------------
 tools/lib/bpf/libbpf.h                      | 14 +++---
 tools/testing/selftests/bpf/trace_helpers.c |  7 +--
 4 files changed, 53 insertions(+), 55 deletions(-)

diff --git a/tools/bpf/bpftool/map_perf_ring.c b/tools/bpf/bpftool/map_perf_ring.c
index 6d41323..bdaf406 100644
--- a/tools/bpf/bpftool/map_perf_ring.c
+++ b/tools/bpf/bpftool/map_perf_ring.c
@@ -50,15 +50,17 @@ static void int_exit(int signo)
 	stop = true;
 }
 
-static enum bpf_perf_event_ret print_bpf_output(void *event, void *priv)
+static enum bpf_perf_event_ret
+print_bpf_output(struct perf_event_header *event, void *private_data)
 {
-	struct event_ring_info *ring = priv;
-	struct perf_event_sample *e = event;
+	struct perf_event_sample *e = container_of(event, struct perf_event_sample,
+						   header);
+	struct event_ring_info *ring = private_data;
 	struct {
 		struct perf_event_header header;
 		__u64 id;
 		__u64 lost;
-	} *lost = event;
+	} *lost = (typeof(lost))event;
 
 	if (json_output) {
 		jsonw_start_object(json_wtr);
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 1ac8856..35f4a77 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -2448,58 +2448,51 @@ static void bpf_perf_write_tail(struct perf_event_mmap_page *header,
 }
 
 enum bpf_perf_event_ret
-bpf_perf_event_read_simple(void *mem, unsigned long size,
-			   unsigned long page_size, void **buf, size_t *buf_len,
-			   bpf_perf_event_print_t fn, void *priv)
-{
-	struct perf_event_mmap_page *header = mem;
-	__u64 data_head = bpf_perf_read_head(header);
-	__u64 data_tail = header->data_tail;
-	int ret = LIBBPF_PERF_EVENT_ERROR;
-	void *base, *begin, *end;
-
-	if (data_head == data_tail)
-		return LIBBPF_PERF_EVENT_CONT;
-
-	base = ((char *)header) + page_size;
-
-	begin = base + data_tail % size;
-	end = base + data_head % size;
-
-	while (begin != end) {
-		struct perf_event_header *ehdr;
-
-		ehdr = begin;
-		if (begin + ehdr->size > base + size) {
-			long len = base + size - begin;
-
-			if (*buf_len < ehdr->size) {
-				free(*buf);
-				*buf = malloc(ehdr->size);
-				if (!*buf) {
+bpf_perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size,
+			   void **copy_mem, size_t *copy_size,
+			   bpf_perf_event_print_t fn, void *private_data)
+{
+	struct perf_event_mmap_page *header = mmap_mem;
+	void *base = ((__u8 *)header) + page_size;
+	int ret = LIBBPF_PERF_EVENT_CONT;
+	struct perf_event_header *ehdr;
+	__u64 data_head, data_tail;
+	size_t ehdr_size;
+
+	for (data_tail = header->data_tail,
+	     data_head = bpf_perf_read_head(header);
+	     data_head != data_tail;
+	     data_head = bpf_perf_read_head(header)) {
+		ehdr = base + (data_tail & (mmap_size - 1));
+		ehdr_size = ehdr->size;
+		if (((void *)ehdr) + ehdr_size > base + mmap_size) {
+			void *copy_start = ehdr;
+			size_t len_first = base + mmap_size - copy_start;
+			size_t len_secnd = ehdr_size - len_first;
+
+			if (*copy_size < ehdr_size) {
+				void *ptr = realloc(*copy_mem, ehdr_size);
+
+				if (!ptr) {
 					ret = LIBBPF_PERF_EVENT_ERROR;
 					break;
 				}
-				*buf_len = ehdr->size;
+				*copy_mem = ptr;
+				*copy_size = ehdr_size;
 			}
 
-			memcpy(*buf, begin, len);
-			memcpy(*buf + len, base, ehdr->size - len);
-			ehdr = (void *)*buf;
-			begin = base + ehdr->size - len;
-		} else if (begin + ehdr->size == base + size) {
-			begin = base;
-		} else {
-			begin += ehdr->size;
+			memcpy(*copy_mem, copy_start, len_first);
+			memcpy(*copy_mem + len_first, base, len_secnd);
+			ehdr = *copy_mem;
 		}
 
-		ret = fn(ehdr, priv);
+		ret = fn(ehdr, private_data);
+
+		data_tail += ehdr_size;
+		bpf_perf_write_tail(header, data_tail);
 		if (ret != LIBBPF_PERF_EVENT_CONT)
 			break;
-
-		data_tail += ehdr->size;
 	}
 
-	bpf_perf_write_tail(header, data_tail);
 	return ret;
 }
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 8af8d36..16b5c95 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -284,12 +284,14 @@ enum bpf_perf_event_ret {
 	LIBBPF_PERF_EVENT_CONT	= -2,
 };
 
-typedef enum bpf_perf_event_ret (*bpf_perf_event_print_t)(void *event,
-							  void *priv);
-int bpf_perf_event_read_simple(void *mem, unsigned long size,
-			       unsigned long page_size,
-			       void **buf, size_t *buf_len,
-			       bpf_perf_event_print_t fn, void *priv);
+struct perf_event_header;
+typedef enum bpf_perf_event_ret
+	(*bpf_perf_event_print_t)(struct perf_event_header *hdr,
+				  void *private_data);
+enum bpf_perf_event_ret
+bpf_perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size,
+			   void **copy_mem, size_t *copy_size,
+			   bpf_perf_event_print_t fn, void *private_data);
 
 struct nlattr;
 typedef int (*libbpf_dump_nlmsg_t)(void *cookie, void *msg, struct nlattr **tb);
diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c
index cabe2a3..1764093 100644
--- a/tools/testing/selftests/bpf/trace_helpers.c
+++ b/tools/testing/selftests/bpf/trace_helpers.c
@@ -124,10 +124,11 @@ struct perf_event_sample {
 	char data[];
 };
 
-static enum bpf_perf_event_ret bpf_perf_event_print(void *event, void *priv)
+static enum bpf_perf_event_ret
+bpf_perf_event_print(struct perf_event_header *hdr, void *private_data)
 {
-	struct perf_event_sample *e = event;
-	perf_event_print_fn fn = priv;
+	struct perf_event_sample *e = (struct perf_event_sample *)hdr;
+	perf_event_print_fn fn = private_data;
 	int ret;
 
 	if (e->header.type == PERF_RECORD_SAMPLE) {
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next 1/2] bpf, libbpf: use proper barriers in perf RB walk
From: Daniel Borkmann @ 2018-10-11 14:02 UTC (permalink / raw)
  To: alexei.starovoitov; +Cc: netdev, Daniel Borkmann
In-Reply-To: <20181011140207.27602-1-daniel@iogearbox.net>

User proper CPU barrier instead of just a compile barrier when fetching
ring's data_head in bpf_perf_event_read_simple() which is not correct.
Also, add two small helpers bpf_perf_read_head() and bpf_perf_write_tail()
to make used barriers more obvious and a comment to what they pair to.

Fixes: d0cabbb021be ("tools: bpf: move the event reading loop to libbpf")
Fixes: 39111695b1b8 ("samples: bpf: add bpf_perf_event_output example")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/lib/bpf/libbpf.c | 52 ++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 46 insertions(+), 6 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 176cf55..1ac8856 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -20,6 +20,7 @@
 #include <fcntl.h>
 #include <errno.h>
 #include <asm/unistd.h>
+#include <asm/barrier.h>
 #include <linux/err.h>
 #include <linux/kernel.h>
 #include <linux/bpf.h>
@@ -27,6 +28,7 @@
 #include <linux/list.h>
 #include <linux/limits.h>
 #include <linux/perf_event.h>
+#include <linux/compiler.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <sys/vfs.h>
@@ -2404,18 +2406,58 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr,
 	return 0;
 }
 
+/*
+ * Comment from kernel/events/ring_buffer.c:
+ *
+ * Since the mmap() consumer (userspace) can run on a different CPU:
+ *
+ *   kernel				user
+ *
+ *   if (LOAD ->data_tail) {		LOAD ->data_head
+ *			(A)		smp_rmb()	(C)
+ *	STORE $data			LOAD $data
+ *	smp_wmb()	(B)		smp_mb()	(D)
+ *	STORE ->data_head		STORE ->data_tail
+ *   }
+ *
+ * Where A pairs with D, and B pairs with C.
+ *
+ * In our case (A) is a control dependency that separates the load of
+ * the ->data_tail and the stores of $data. In case ->data_tail
+ * indicates there is no room in the buffer to store $data we do not.
+ *
+ * D needs to be a full barrier since it separates the data READ
+ * from the tail WRITE.
+ *
+ * For B a WMB is sufficient since it separates two WRITEs, and for C
+ * an RMB is sufficient since it separates two READs.
+ */
+static __u64 bpf_perf_read_head(struct perf_event_mmap_page *header)
+{
+	__u64 data_head = READ_ONCE(header->data_head);
+
+	rmb();
+	return data_head;
+}
+
+static void bpf_perf_write_tail(struct perf_event_mmap_page *header,
+				__u64 data_tail)
+{
+	mb();
+	header->data_tail = data_tail;
+}
+
 enum bpf_perf_event_ret
 bpf_perf_event_read_simple(void *mem, unsigned long size,
 			   unsigned long page_size, void **buf, size_t *buf_len,
 			   bpf_perf_event_print_t fn, void *priv)
 {
-	volatile struct perf_event_mmap_page *header = mem;
+	struct perf_event_mmap_page *header = mem;
+	__u64 data_head = bpf_perf_read_head(header);
 	__u64 data_tail = header->data_tail;
-	__u64 data_head = header->data_head;
 	int ret = LIBBPF_PERF_EVENT_ERROR;
 	void *base, *begin, *end;
 
-	asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */
 	if (data_head == data_tail)
 		return LIBBPF_PERF_EVENT_CONT;
 
@@ -2458,8 +2500,6 @@ bpf_perf_event_read_simple(void *mem, unsigned long size,
 		data_tail += ehdr->size;
 	}
 
-	__sync_synchronize(); /* smp_mb() */
-	header->data_tail = data_tail;
-
+	bpf_perf_write_tail(header, data_tail);
 	return ret;
 }
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next 0/2] Two libbpf RB walk improvements
From: Daniel Borkmann @ 2018-10-11 14:02 UTC (permalink / raw)
  To: alexei.starovoitov; +Cc: netdev, Daniel Borkmann

Two small improvements to libbpf's perf RB walk: first one fixes
used barriers and second one simplifies ring walking code. For
details see individual patches.

Thanks!

Daniel Borkmann (2):
  bpf, libbpf: use proper barriers in perf RB walk
  bpf, libbpf: simplify perf RB walk and do incremental updates

 tools/bpf/bpftool/map_perf_ring.c           |  10 ++-
 tools/lib/bpf/libbpf.c                      | 117 ++++++++++++++++++----------
 tools/lib/bpf/libbpf.h                      |  14 ++--
 tools/testing/selftests/bpf/trace_helpers.c |   7 +-
 4 files changed, 93 insertions(+), 55 deletions(-)

-- 
2.9.5

^ permalink raw reply

* Re: [PATCH 1/1] net: socionext: clear rx irq correctly
From: Ard Biesheuvel @ 2018-10-11 13:49 UTC (permalink / raw)
  To: Ilias Apalodimas
  Cc: <netdev@vger.kernel.org>, Jaswinder Singh, Masami Hiramatsu
In-Reply-To: <1539260906-29596-1-git-send-email-ilias.apalodimas@linaro.org>

On 11 October 2018 at 14:28, Ilias Apalodimas
<ilias.apalodimas@linaro.org> wrote:
> commit 63ae7949e94a ("net: socionext: Use descriptor info instead of MMIO reads on Rx")
> removed constant mmio reads from the driver and started using a descriptor
> field to check if packet should be processed.
> This lead the napi rx handler being constantly called while no packets
> needed processing and ksoftirq getting 100% cpu usage. Issue one mmio read
> to clear the irq correcty after processing packets
>
> Signed-off-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
> Reported-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

> ---
>  drivers/net/ethernet/socionext/netsec.c | 5 ++++-
>  1 file changed, 4 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c
> index 7aa5ebb..4289ccb 100644
> --- a/drivers/net/ethernet/socionext/netsec.c
> +++ b/drivers/net/ethernet/socionext/netsec.c
> @@ -735,8 +735,11 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget)
>                 u16 idx = dring->tail;
>                 struct netsec_de *de = dring->vaddr + (DESC_SZ * idx);
>
> -               if (de->attr & (1U << NETSEC_RX_PKT_OWN_FIELD))
> +               if (de->attr & (1U << NETSEC_RX_PKT_OWN_FIELD)) {
> +                       /* reading the register clears the irq */
> +                       netsec_read(priv, NETSEC_REG_NRM_RX_PKTCNT);
>                         break;
> +               }
>
>                 /* This  barrier is needed to keep us from reading
>                  * any other fields out of the netsec_de until we have
> --
> 2.7.4
>

^ permalink raw reply

* Re: [PATCH net 2/2] selftests: udpgso_bench.sh explicitly requires bash
From: Willem de Bruijn @ 2018-10-11 13:48 UTC (permalink / raw)
  To: Paolo Abeni
  Cc: Network Development, David Miller, Florian Westphal,
	Willem de Bruijn
In-Reply-To: <6d93e3bc5920b3f6f8b121fa0969ee42ef33ed5c.1539247467.git.pabeni@redhat.com>

On Thu, Oct 11, 2018 at 4:58 AM Paolo Abeni <pabeni@redhat.com> wrote:
>
> The udpgso_bench.sh script requires several bash-only features. This
> may cause random failures if the default shell is not bash.
> Address the above explicitly requiring bash as the script interpreter
>
> Fixes: 3a687bef148d ("selftests: udp gso benchmark")
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>

Acked-by: Willem de Bruijn <willemb@google.com>

^ permalink raw reply

* RE: [PATCH net] tipc: eliminate possible recursive locking detected by LOCKDEP
From: Jon Maloy @ 2018-10-11 13:30 UTC (permalink / raw)
  To: Ying Xue, dvyukov@google.com
  Cc: davem@davemloft.net, parthasarathy.bhuvaragan@ericsson.com,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	tipc-discussion@lists.sourceforge.net
In-Reply-To: <1539259076-8562-1-git-send-email-ying.xue@windriver.com>

Acked-by: Jon Maloy <jon.maloy@ericsson.com>

///jon


> -----Original Message-----
> From: Ying Xue <ying.xue@windriver.com>
> Sent: October 11, 2018 7:58 AM
> To: Jon Maloy <jon.maloy@ericsson.com>; dvyukov@google.com
> Cc: davem@davemloft.net; parthasarathy.bhuvaragan@ericsson.com;
> netdev@vger.kernel.org; linux-kernel@vger.kernel.org; tipc-
> discussion@lists.sourceforge.net
> Subject: [PATCH net] tipc: eliminate possible recursive locking detected by
> LOCKDEP
> 
> When booting kernel with LOCKDEP option, below warning info was found:
> 
> WARNING: possible recursive locking detected 4.19.0-rc7+ #14 Not tainted
> --------------------------------------------
> swapper/0/1 is trying to acquire lock:
> 00000000dcfc0fc8 (&(&list->lock)->rlock#4){+...}, at: spin_lock_bh
> include/linux/spinlock.h:334 [inline]
> 00000000dcfc0fc8 (&(&list->lock)->rlock#4){+...}, at:
> tipc_link_reset+0x125/0xdf0 net/tipc/link.c:850
> 
> but task is already holding lock:
> 00000000cbb9b036 (&(&list->lock)->rlock#4){+...}, at: spin_lock_bh
> include/linux/spinlock.h:334 [inline]
> 00000000cbb9b036 (&(&list->lock)->rlock#4){+...}, at:
> tipc_link_reset+0xfa/0xdf0 net/tipc/link.c:849
> 
> other info that might help us debug this:
>  Possible unsafe locking scenario:
> 
>        CPU0
>        ----
>   lock(&(&list->lock)->rlock#4);
>   lock(&(&list->lock)->rlock#4);
> 
>  *** DEADLOCK ***
> 
>  May be due to missing lock nesting notation
> 
> 2 locks held by swapper/0/1:
>  #0: 00000000f7539d34 (pernet_ops_rwsem){+.+.}, at:
> register_pernet_subsys+0x19/0x40 net/core/net_namespace.c:1051
>  #1: 00000000cbb9b036 (&(&list->lock)->rlock#4){+...}, at:
> spin_lock_bh include/linux/spinlock.h:334 [inline]
>  #1: 00000000cbb9b036 (&(&list->lock)->rlock#4){+...}, at:
> tipc_link_reset+0xfa/0xdf0 net/tipc/link.c:849
> 
> stack backtrace:
> CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.19.0-rc7+ #14 Hardware name:
> QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 Call Trace:
>  __dump_stack lib/dump_stack.c:77 [inline]
>  dump_stack+0x1af/0x295 lib/dump_stack.c:113  print_deadlock_bug
> kernel/locking/lockdep.c:1759 [inline]  check_deadlock
> kernel/locking/lockdep.c:1803 [inline]  validate_chain
> kernel/locking/lockdep.c:2399 [inline]
>  __lock_acquire+0xf1e/0x3c60 kernel/locking/lockdep.c:3411
>  lock_acquire+0x1db/0x520 kernel/locking/lockdep.c:3900
> __raw_spin_lock_bh include/linux/spinlock_api_smp.h:135 [inline]
>  _raw_spin_lock_bh+0x31/0x40 kernel/locking/spinlock.c:168  spin_lock_bh
> include/linux/spinlock.h:334 [inline]
>  tipc_link_reset+0x125/0xdf0 net/tipc/link.c:850
>  tipc_link_bc_create+0xb5/0x1f0 net/tipc/link.c:526
>  tipc_bcast_init+0x59b/0xab0 net/tipc/bcast.c:521
>  tipc_init_net+0x472/0x610 net/tipc/core.c:82
>  ops_init+0xf7/0x520 net/core/net_namespace.c:129
> __register_pernet_operations net/core/net_namespace.c:940 [inline]
>  register_pernet_operations+0x453/0xac0 net/core/net_namespace.c:1011
>  register_pernet_subsys+0x28/0x40 net/core/net_namespace.c:1052
>  tipc_init+0x83/0x104 net/tipc/core.c:140  do_one_initcall+0x109/0x70a
> init/main.c:885  do_initcall_level init/main.c:953 [inline]  do_initcalls
> init/main.c:961 [inline]  do_basic_setup init/main.c:979 [inline]
> kernel_init_freeable+0x4bd/0x57f init/main.c:1144
>  kernel_init+0x13/0x180 init/main.c:1063
>  ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:413
> 
> The reason why the noise above was complained by LOCKDEP is because we
> nested to hold l->wakeupq.lock and l->inputq->lock in tipc_link_reset
> function. In fact it's unnecessary to move skb buffer from l->wakeupq queue
> to l->inputq queue while holding the two locks at the same time.
> Instead, we can move skb buffers in l->wakeupq queue to a temporary list
> first and then move the buffers of the temporary list to l->inputq queue,
> which is also safe for us.
> 
> Fixes: 3f32d0be6c16 ("tipc: lock wakeup & inputq at tipc_link_reset()")
> Reported-by: Dmitry Vyukov <dvyukov@google.com>
> Signed-off-by: Ying Xue <ying.xue@windriver.com>
> ---
>  net/tipc/link.c | 11 +++++++++--
>  1 file changed, 9 insertions(+), 2 deletions(-)
> 
> diff --git a/net/tipc/link.c b/net/tipc/link.c index fb886b5..1d21ae4 100644
> --- a/net/tipc/link.c
> +++ b/net/tipc/link.c
> @@ -843,14 +843,21 @@ static void link_prepare_wakeup(struct tipc_link *l)
> 
>  void tipc_link_reset(struct tipc_link *l)  {
> +	struct sk_buff_head list;
> +
> +	__skb_queue_head_init(&list);
> +
>  	l->in_session = false;
>  	l->session++;
>  	l->mtu = l->advertised_mtu;
> +
>  	spin_lock_bh(&l->wakeupq.lock);
> +	skb_queue_splice_init(&l->wakeupq, &list);
> +	spin_unlock_bh(&l->wakeupq.lock);
> +
>  	spin_lock_bh(&l->inputq->lock);
> -	skb_queue_splice_init(&l->wakeupq, l->inputq);
> +	skb_queue_splice_init(&list, l->inputq);
>  	spin_unlock_bh(&l->inputq->lock);
> -	spin_unlock_bh(&l->wakeupq.lock);
> 
>  	__skb_queue_purge(&l->transmq);
>  	__skb_queue_purge(&l->deferdq);
> --
> 2.7.4

^ permalink raw reply

* [PATCH net-next] hv_netvsc: fix vf serial matching with pci slot info
From: Haiyang Zhang @ 2018-10-11 20:14 UTC (permalink / raw)
  To: davem, netdev; +Cc: olaf, sthemmin, haiyangz, linux-kernel, devel, vkuznets

From: Haiyang Zhang <haiyangz@microsoft.com>

The VF device's serial number is saved as a string in PCI slot's
kobj name, not the slot->number. This patch corrects the netvsc
driver, so the VF device can be successfully paired with synthetic
NIC.

Fixes: 00d7ddba1143 ("hv_netvsc: pair VF based on serial number")
Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
---
 drivers/net/hyperv/netvsc_drv.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 9bcaf204a7d4..8121ce34a39f 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -2030,14 +2030,15 @@ static void netvsc_vf_setup(struct work_struct *w)
 	rtnl_unlock();
 }
 
-/* Find netvsc by VMBus serial number.
- * The PCI hyperv controller records the serial number as the slot.
+/* Find netvsc by VF serial number.
+ * The PCI hyperv controller records the serial number as the slot kobj name.
  */
 static struct net_device *get_netvsc_byslot(const struct net_device *vf_netdev)
 {
 	struct device *parent = vf_netdev->dev.parent;
 	struct net_device_context *ndev_ctx;
 	struct pci_dev *pdev;
+	u32 serial;
 
 	if (!parent || !dev_is_pci(parent))
 		return NULL; /* not a PCI device */
@@ -2048,16 +2049,22 @@ static struct net_device *get_netvsc_byslot(const struct net_device *vf_netdev)
 		return NULL;
 	}
 
+	if (kstrtou32(pdev->slot->kobj.name, 10, &serial)) {
+		netdev_notice(vf_netdev, "Invalid vf serial:%s\n",
+			      pdev->slot->kobj.name);
+		return NULL;
+	}
+
 	list_for_each_entry(ndev_ctx, &netvsc_dev_list, list) {
 		if (!ndev_ctx->vf_alloc)
 			continue;
 
-		if (ndev_ctx->vf_serial == pdev->slot->number)
+		if (ndev_ctx->vf_serial == serial)
 			return hv_get_drvdata(ndev_ctx->device_ctx);
 	}
 
 	netdev_notice(vf_netdev,
-		      "no netdev found for slot %u\n", pdev->slot->number);
+		      "no netdev found for vf serial:%u\n", serial);
 	return NULL;
 }
 
-- 
2.18.0

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox