Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [PATCH] net: phy: sfp: remove sfp_mutex's definition
From: Andrew Lunn @ 2018-10-11 15:45 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior; +Cc: netdev, tglx, Florian Fainelli, David S. Miller
In-Reply-To: <20181011150621.8466-1-bigeasy@linutronix.de>

On Thu, Oct 11, 2018 at 05:06:21PM +0200, Sebastian Andrzej Siewior wrote:
> The sfp_mutex variable is defined but never used in this file. Not even
> in the commit that introduced that variable.
> 
> Remove sfp_mutex, it has no purpose.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>

    Andrew

^ permalink raw reply

* [PATCH net-next v3 2/2] net/ncsi: Add NCSI Mellanox OEM command
From: Vijay Khemka @ 2018-10-11 23:05 UTC (permalink / raw)
  To: Samuel Mendoza-Jonas, David S. Miller, netdev, linux-kernel
  Cc: vijaykhemka, openbmc @ lists . ozlabs . org, Justin.Lee1, joel,
	linux-aspeed
In-Reply-To: <20181011230518.3204700-1-vijaykhemka@fb.com>

This patch adds OEM Mellanox commands and response handling. It also
defines OEM Get MAC Address handler to get and configure the device.

ncsi_oem_gma_handler_mlx: This handler send NCSI mellanox command for
getting mac address.
ncsi_rsp_handler_oem_mlx: This handles response received for all
mellanox OEM commands.
ncsi_rsp_handler_oem_mlx_gma: This handles get mac address response and
set it to device.

Signed-off-by: Vijay Khemka <vijaykhemka@fb.com>
---
 net/ncsi/internal.h    |  5 +++++
 net/ncsi/ncsi-manage.c | 24 +++++++++++++++++++++++-
 net/ncsi/ncsi-pkt.h    |  9 +++++++++
 net/ncsi/ncsi-rsp.c    | 41 ++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 77 insertions(+), 2 deletions(-)

diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h
index 45883b32790e..d4558628a991 100644
--- a/net/ncsi/internal.h
+++ b/net/ncsi/internal.h
@@ -73,10 +73,15 @@ enum {
 #define NCSI_OEM_MFR_BCM_ID             0x113d
 /* Broadcom specific OEM Command */
 #define NCSI_OEM_BCM_CMD_GMA            0x01   /* CMD ID for Get MAC */
+/* Mellanox specific OEM Command */
+#define NCSI_OEM_MLX_CMD_GMA            0x00   /* CMD ID for Get MAC */
+#define NCSI_OEM_MLX_CMD_GMA_PARAM      0x1b   /* Parameter for GMA  */
 /* OEM Command payload lengths*/
 #define NCSI_OEM_BCM_CMD_GMA_LEN        12
+#define NCSI_OEM_MLX_CMD_GMA_LEN        8
 /* Mac address offset in OEM response */
 #define BCM_MAC_ADDR_OFFSET             28
+#define MLX_MAC_ADDR_OFFSET             8
 
 
 struct ncsi_channel_version {
diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index 75504ccd1b95..9306fa464190 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -658,12 +658,34 @@ static void ncsi_oem_gma_handler_bcm(struct ncsi_cmd_arg *nca)
 			   nca->type);
 }
 
+static void ncsi_oem_gma_handler_mlx(struct ncsi_cmd_arg *nca)
+{
+	int ret = 0;
+	unsigned char data[NCSI_OEM_MLX_CMD_GMA_LEN];
+
+	nca->payload = NCSI_OEM_MLX_CMD_GMA_LEN;
+
+	memset(data, 0, NCSI_OEM_MLX_CMD_GMA_LEN);
+	*(unsigned int *)data = ntohl(NCSI_OEM_MFR_MLX_ID);
+	data[5] = NCSI_OEM_MLX_CMD_GMA;
+	data[6] = NCSI_OEM_MLX_CMD_GMA_PARAM;
+
+	nca->data = data;
+
+	ret = ncsi_xmit_cmd(nca);
+	if (ret)
+		netdev_err(nca->ndp->ndev.dev,
+			   "NCSI: Failed to transmit cmd 0x%x during configure\n",
+			   nca->type);
+}
+
 /* OEM Command handlers initialization */
 static struct ncsi_oem_gma_handler {
 	unsigned int	mfr_id;
 	void		(*handler)(struct ncsi_cmd_arg *nca);
 } ncsi_oem_gma_handlers[] = {
-	{ NCSI_OEM_MFR_BCM_ID, ncsi_oem_gma_handler_bcm }
+	{ NCSI_OEM_MFR_BCM_ID, ncsi_oem_gma_handler_bcm },
+	{ NCSI_OEM_MFR_MLX_ID, ncsi_oem_gma_handler_mlx }
 };
 
 #endif /* CONFIG_NCSI_OEM_CMD_GET_MAC */
diff --git a/net/ncsi/ncsi-pkt.h b/net/ncsi/ncsi-pkt.h
index 4d3f06be38bd..2a6d83a596c9 100644
--- a/net/ncsi/ncsi-pkt.h
+++ b/net/ncsi/ncsi-pkt.h
@@ -165,6 +165,15 @@ struct ncsi_rsp_oem_pkt {
 	unsigned char           data[];      /* Payload data      */
 };
 
+/* Mellanox Response Data */
+struct ncsi_rsp_oem_mlx_pkt {
+	unsigned char           cmd_rev;     /* Command Revision  */
+	unsigned char           cmd;         /* Command ID        */
+	unsigned char           param;       /* Parameter         */
+	unsigned char           optional;    /* Optional data     */
+	unsigned char           data[];      /* Data              */
+};
+
 /* Broadcom Response Data */
 struct ncsi_rsp_oem_bcm_pkt {
 	unsigned char           ver;         /* Payload Version   */
diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c
index 31672e967db2..5158836796ce 100644
--- a/net/ncsi/ncsi-rsp.c
+++ b/net/ncsi/ncsi-rsp.c
@@ -596,6 +596,45 @@ static int ncsi_rsp_handler_snfc(struct ncsi_request *nr)
 	return 0;
 }
 
+/* Response handler for Mellanox command Get Mac Address */
+static int ncsi_rsp_handler_oem_mlx_gma(struct ncsi_request *nr)
+{
+	struct ncsi_rsp_oem_pkt *rsp;
+	struct ncsi_dev_priv *ndp = nr->ndp;
+	struct net_device *ndev = ndp->ndev.dev;
+	int ret = 0;
+	const struct net_device_ops *ops = ndev->netdev_ops;
+	struct sockaddr saddr;
+
+	/* Get the response header */
+	rsp = (struct ncsi_rsp_oem_pkt *)skb_network_header(nr->rsp);
+
+	saddr.sa_family = ndev->type;
+	ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+	memcpy(saddr.sa_data, &rsp->data[MLX_MAC_ADDR_OFFSET], ETH_ALEN);
+	ret = ops->ndo_set_mac_address(ndev, &saddr);
+	if (ret < 0)
+		netdev_warn(ndev, "NCSI: Writing mac address to device failed\n");
+
+	return ret;
+}
+
+/* Response handler for Mellanox card */
+static int ncsi_rsp_handler_oem_mlx(struct ncsi_request *nr)
+{
+	struct ncsi_rsp_oem_pkt *rsp;
+	struct ncsi_rsp_oem_mlx_pkt *mlx;
+
+	/* Get the response header */
+	rsp = (struct ncsi_rsp_oem_pkt *)skb_network_header(nr->rsp);
+	mlx = (struct ncsi_rsp_oem_mlx_pkt *)(rsp->data);
+
+	if (mlx->cmd == NCSI_OEM_MLX_CMD_GMA &&
+	    mlx->param == NCSI_OEM_MLX_CMD_GMA_PARAM)
+		return ncsi_rsp_handler_oem_mlx_gma(nr);
+	return 0;
+}
+
 /* Response handler for Broadcom command Get Mac Address */
 static int ncsi_rsp_handler_oem_bcm_gma(struct ncsi_request *nr)
 {
@@ -638,7 +677,7 @@ static struct ncsi_rsp_oem_handler {
 	unsigned int	mfr_id;
 	int		(*handler)(struct ncsi_request *nr);
 } ncsi_rsp_oem_handlers[] = {
-	{ NCSI_OEM_MFR_MLX_ID, NULL },
+	{ NCSI_OEM_MFR_MLX_ID, ncsi_rsp_handler_oem_mlx },
 	{ NCSI_OEM_MFR_BCM_ID, ncsi_rsp_handler_oem_bcm }
 };
 
-- 
2.17.1

^ permalink raw reply related

* Re: [PATCH net-next 0/9] net: Kernel side filtering for route dumps
From: David Ahern @ 2018-10-11 15:32 UTC (permalink / raw)
  To: Stephen Hemminger, David Ahern; +Cc: netdev, davem
In-Reply-To: <20181011082637.3e7833c9@xeon-e3>

On 10/11/18 9:26 AM, Stephen Hemminger wrote:
>>
> 
> You can do the something like this already with BPF socket filters.
> But writing BPF for multi-part messages is hard.
> 
> Maybe a generic eBPF filter mechanism would be more flexible?
> 

That exists today and does not cover what is needed here:
1. The filters apply *after* the skb has been filled in.

2. an skb will have many routes within it and the user filter could
apply to any of those messages within the skb. It is not efficient to
generate the skb and then re-create it with a bpf filter.

The point here is to not even fill in the skb for something userspace
does not care about.

Route dumps are done for the entire FIB for each address family. As we
approach internet routing tables (700k+ routes for IPv4, currently
around 55k for IPv6) with many VRFs dumping the entire table is grossly
inefficient when for example only a single VRF table is wanted.

^ permalink raw reply

* Re: [PATCH net-next 0/9] net: Kernel side filtering for route dumps
From: Stephen Hemminger @ 2018-10-11 15:26 UTC (permalink / raw)
  To: David Ahern; +Cc: netdev, davem, David Ahern
In-Reply-To: <20181011150627.4010-1-dsahern@kernel.org>

On Thu, 11 Oct 2018 08:06:18 -0700
David Ahern <dsahern@kernel.org> wrote:

> From: David Ahern <dsahern@gmail.com>
> 
> Implement kernel side filtering of route dumps by protocol (e.g., which
> routing daemon installed the route), route type (e.g., unicast), table
> id and nexthop device.
> 
> iproute2 has been doing this filtering in userspace for years; pushing
> the filters to the kernel side reduces the amount of data the kernel
> sends and reduces wasted cycles on both sides processing unwanted data.
> These initial options provide a huge improvement for efficiently
> examining routes on large scale systems.
> 
> David Ahern (9):
>   net: Add struct for fib dump filter
>   net/ipv4: Plumb support for filtering route dumps
>   net/ipv6: Plumb support for filtering route dumps
>   net/mpls: Plumb support for filtering route dumps
>   net: Plumb support for filtering ipv4 and ipv6 multicast route dumps
>   net: Enable kernel side filtering of route dumps
>   net/mpls: Handle kernel side filtering of route dumps
>   net/ipv6: Bail early if user only wants cloned entries
>   net/ipv4: Bail early if user only wants prefix entries
> 
>  include/linux/mroute_base.h |  5 +--
>  include/net/ip6_route.h     |  1 +
>  include/net/ip_fib.h        | 14 ++++++--
>  net/ipv4/fib_frontend.c     | 64 +++++++++++++++++++++++++++------
>  net/ipv4/fib_trie.c         | 37 +++++++++++++------
>  net/ipv4/ipmr.c             |  8 +++--
>  net/ipv4/ipmr_base.c        | 33 ++++++++++++++++-
>  net/ipv6/ip6_fib.c          | 17 +++++++--
>  net/ipv6/ip6mr.c            |  7 ++--
>  net/ipv6/route.c            | 40 ++++++++++++++++-----
>  net/mpls/af_mpls.c          | 86 +++++++++++++++++++++++++++++++++++++++------
>  11 files changed, 262 insertions(+), 50 deletions(-)
> 

You can do the something like this already with BPF socket filters.
But writing BPF for multi-part messages is hard.

Maybe a generic eBPF filter mechanism would be more flexible?

^ permalink raw reply

* Re: [PATCH iproute 2/2] utils: fix get_rtnl_link_stats_rta stats parsing
From: Stephen Hemminger @ 2018-10-11 15:09 UTC (permalink / raw)
  To: Lorenzo Bianconi; +Cc: netdev
In-Reply-To: <20181011122401.GA10363@localhost.localdomain>

On Thu, 11 Oct 2018 14:24:03 +0200
Lorenzo Bianconi <lorenzo.bianconi@redhat.com> wrote:

> > > iproute2 walks through the list of available tunnels using netlink
> > > protocol in order to get device info instead of reading
> > > them from proc filesystem. However the kernel reports device statistics
> > > using IFLA_INET6_STATS/IFLA_INET6_ICMP6STATS attributes nested in
> > > IFLA_PROTINFO one but iproutes expects these info in
> > > IFLA_STATS64/IFLA_STATS attributes.
> > > The issue can be triggered with the following reproducer:
> > > 
> > > $ip link add ip6d0 type ip6tnl mode ip6ip6 local 1111::1 remote 2222::1
> > > $ip -6 -d -s tunnel show ip6d0
> > > ip6d0: ipv6/ipv6 remote 2222::1 local 1111::1 encaplimit 4 hoplimit 64
> > > tclass 0x00 flowlabel 0x00000 (flowinfo 0x00000000)
> > > Dump terminated
> > > 
> > > Fix the issue introducing IFLA_INET6_STATS attribute parsing
> > > 
> > > Fixes: 3e953938717f ("iptunnel/ip6tunnel: Use netlink to walk through
> > > tunnels list")
> > > 
> > > Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>  
> > 
> > Can't we fix the kernel to report statistics properly, rather than
> > starting iproute2 doing more /proc interfaces.
> >   
> 
> Hi Stephen,

> sorry, I did not get what you mean. Current iproute implementation
> walks through tunnels list using netlink protocol and parses device
> statistics in the kernel netlink message. However it does not take
> into account the actual netlink message layout since the statistic
> attribute is nested in IFLA_PROTINFO one.
> Moreover AFAIU the related kernel code has not changed since iproute
> commit 3e953938717f, so I guess we should fix the issue in iproute code
> instead in the kernel one. Do you agree?
> 
> Regards,
> Lorenzo

Sorry, I was confused and the stats code is really about parsing existing
pile of statistics in netlink.  I thought it was doing /proc version
like nstat already does.

I will reset these patches to under review.

^ permalink raw reply

* [PATCH] net: phy: sfp: remove sfp_mutex's definition
From: Sebastian Andrzej Siewior @ 2018-10-11 15:06 UTC (permalink / raw)
  To: netdev
  Cc: tglx, Sebastian Andrzej Siewior, Andrew Lunn, Florian Fainelli,
	David S. Miller

The sfp_mutex variable is defined but never used in this file. Not even
in the commit that introduced that variable.

Remove sfp_mutex, it has no purpose.

Cc: Andrew Lunn <andrew@lunn.ch>
Cc: Florian Fainelli <f.fainelli@gmail.com>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 drivers/net/phy/sfp.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c
index 6e13b8832bc7d..fd8bb998ae52d 100644
--- a/drivers/net/phy/sfp.c
+++ b/drivers/net/phy/sfp.c
@@ -163,8 +163,6 @@ static const enum gpiod_flags gpio_flags[] = {
 /* Give this long for the PHY to reset. */
 #define T_PHY_RESET_MS	50
 
-static DEFINE_MUTEX(sfp_mutex);
-
 struct sff_data {
 	unsigned int gpios;
 	bool (*module_supported)(const struct sfp_eeprom_id *id);
-- 
2.19.1

^ permalink raw reply related

* [PATCH net-next 9/9] net/ipv4: Bail early if user only wants prefix entries
From: David Ahern @ 2018-10-11 15:06 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern
In-Reply-To: <20181011150627.4010-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Unlike IPv6, IPv4 does not have routes marked with RTF_PREFIX_RT. If the
flag is set in the dump request, just return.

In the process of this change, move the CLONE check to use the new
filter flags.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 net/ipv4/fib_frontend.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index a99f2c7ba4e6..3e5e3c380dbb 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -884,10 +884,14 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 		err = ip_valid_fib_dump_req(net, nlh, &filter, cb->extack);
 		if (err < 0)
 			return err;
+	} else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
+		struct rtmsg *rtm = nlmsg_data(nlh);
+
+		filter.flags = rtm->rtm_flags & (RTM_F_PREFIX | RTM_F_CLONED);
 	}
 
-	if (nlmsg_len(nlh) >= sizeof(struct rtmsg) &&
-	    ((struct rtmsg *)nlmsg_data(nlh))->rtm_flags & RTM_F_CLONED)
+	/* fib entries are never clones and ipv4 does not use prefix flag */
+	if (filter.flags & (RTM_F_PREFIX | RTM_F_CLONED))
 		return skb->len;
 
 	s_h = cb->args[0];
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 3/9] net/ipv6: Plumb support for filtering route dumps
From: David Ahern @ 2018-10-11 15:06 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern
In-Reply-To: <20181011150627.4010-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Implement kernel side filtering of routes by table id, egress device
index, protocol, and route type.

Move the existing route flags check for prefix only routes to the new
filter.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 net/ipv6/ip6_fib.c |  9 +++++++++
 net/ipv6/route.c   | 40 ++++++++++++++++++++++++++++++++--------
 2 files changed, 41 insertions(+), 8 deletions(-)

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 6a169794a674..dd6a43874192 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -580,6 +580,11 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 		err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb->extack);
 		if (err < 0)
 			return err;
+	} else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
+		struct rtmsg *rtm = nlmsg_data(nlh);
+
+		if (rtm->rtm_flags & RTM_F_PREFIX)
+			arg.filter.flags = RTM_F_PREFIX;
 	}
 
 	s_h = cb->args[0];
@@ -616,6 +621,10 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 		hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
 			if (e < s_e)
 				goto next;
+			if (arg.filter.table_id &&
+			    arg.filter.table_id != tb->tb6_id)
+				goto next;
+
 			res = fib6_dump_table(tb, skb, cb);
 			if (res != 0)
 				goto out;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index bf4cd647d8b8..8ed2e7462657 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -4763,28 +4763,52 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 	return -EMSGSIZE;
 }
 
+static bool fib6_info_uses_dev(const struct fib6_info *f6i,
+			       const struct net_device *dev)
+{
+	if (f6i->fib6_nh.nh_dev == dev)
+		return true;
+
+	if (f6i->fib6_nsiblings) {
+		struct fib6_info *sibling, *next_sibling;
+
+		list_for_each_entry_safe(sibling, next_sibling,
+					 &f6i->fib6_siblings, fib6_siblings) {
+			if (sibling->fib6_nh.nh_dev == dev)
+				return true;
+		}
+	}
+
+	return false;
+}
+
 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
 {
 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
+	struct fib_dump_filter *filter = &arg->filter;
+	unsigned int flags = NLM_F_MULTI;
 	struct net *net = arg->net;
 
 	if (rt == net->ipv6.fib6_null_entry)
 		return 0;
 
-	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
-		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
-
-		/* user wants prefix routes only */
-		if (rtm->rtm_flags & RTM_F_PREFIX &&
-		    !(rt->fib6_flags & RTF_PREFIX_RT)) {
-			/* success since this is not a prefix route */
+	if ((filter->flags & RTM_F_PREFIX) &&
+	    !(rt->fib6_flags & RTF_PREFIX_RT)) {
+		/* success since this is not a prefix route */
+		return 1;
+	}
+	if (filter->filter_set) {
+		if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
+		    (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
+		    (filter->protocol && rt->fib6_protocol != filter->protocol)) {
 			return 1;
 		}
+		flags |= NLM_F_DUMP_FILTERED;
 	}
 
 	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
 			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
-			     arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
+			     arg->cb->nlh->nlmsg_seq, flags);
 }
 
 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 7/9] net/mpls: Handle kernel side filtering of route dumps
From: David Ahern @ 2018-10-11 15:06 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern
In-Reply-To: <20181011150627.4010-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Update the dump request parsing in MPLS for the non-INET case to
enable kernel side filtering. If INET is disabled the other filters
that make sense for MPLS are protocol and nexthop device.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 net/mpls/af_mpls.c | 32 +++++++++++++++++++++++++++-----
 1 file changed, 27 insertions(+), 5 deletions(-)

diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 48f4cbd9fb38..b256de02251b 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -2043,7 +2043,9 @@ static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 				   struct fib_dump_filter *filter,
 				   struct netlink_ext_ack *extack)
 {
+	struct nlattr *tb[RTA_MAX + 1];
 	struct rtmsg *rtm;
+	int err, i;
 
 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
 		NL_SET_ERR_MSG_MOD(extack, "Invalid header for FIB dump request");
@@ -2052,15 +2054,35 @@ static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 
 	rtm = nlmsg_data(nlh);
 	if (rtm->rtm_dst_len || rtm->rtm_src_len  || rtm->rtm_tos   ||
-	    rtm->rtm_table   || rtm->rtm_protocol || rtm->rtm_scope ||
-	    rtm->rtm_type    || rtm->rtm_flags) {
+	    rtm->rtm_table   || rtm->rtm_scope    || rtm->rtm_type  ||
+	    rtm->rtm_flags) {
 		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for FIB dump request");
 		return -EINVAL;
 	}
 
-	if (nlmsg_attrlen(nlh, sizeof(*rtm))) {
-		NL_SET_ERR_MSG_MOD(extack, "Invalid data after header in FIB dump request");
-		return -EINVAL;
+	if (rtm->rtm_protocol) {
+		filter->protocol = rtm->rtm_protocol;
+		filter->filter_set = 1;
+	}
+
+	err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
+				 rtm_mpls_policy, extack);
+	if (err < 0)
+		return err;
+
+	for (i = 0; i <= RTA_MAX; ++i) {
+		int ifindex;
+
+		if (i == RTA_OIF) {
+			ifindex = nla_get_u32(tb[i]);
+			filter->dev = __dev_get_by_index(net, ifindex);
+			if (!filter->dev)
+				return -ENODEV;
+			filter->filter_set = 1;
+		} else if (tb[i]) {
+			NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request");
+			return -EINVAL;
+		}
 	}
 
 	return 0;
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 4/9] net/mpls: Plumb support for filtering route dumps
From: David Ahern @ 2018-10-11 15:06 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern
In-Reply-To: <20181011150627.4010-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Implement kernel side filtering of routes by egress device index and
protocol. MPLS uses only a single table and route type.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 net/mpls/af_mpls.c | 42 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 41 insertions(+), 1 deletion(-)

diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index bfcb4759c9ee..48f4cbd9fb38 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -2067,12 +2067,35 @@ static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 }
 #endif
 
+static bool mpls_rt_uses_dev(struct mpls_route *rt,
+			     const struct net_device *dev)
+{
+	struct net_device *nh_dev;
+
+	if (rt->rt_nhn == 1) {
+		struct mpls_nh *nh = rt->rt_nh;
+
+		nh_dev = rtnl_dereference(nh->nh_dev);
+		if (dev == nh_dev)
+			return true;
+	} else {
+		for_nexthops(rt) {
+			nh_dev = rtnl_dereference(nh->nh_dev);
+			if (nh_dev == dev)
+				return true;
+		} endfor_nexthops(rt);
+	}
+
+	return false;
+}
+
 static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	const struct nlmsghdr *nlh = cb->nlh;
 	struct net *net = sock_net(skb->sk);
 	struct mpls_route __rcu **platform_label;
 	struct fib_dump_filter filter = {};
+	unsigned int flags = NLM_F_MULTI;
 	size_t platform_labels;
 	unsigned int index;
 
@@ -2084,6 +2107,14 @@ static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb)
 		err = mpls_valid_fib_dump_req(net, nlh, &filter, cb->extack);
 		if (err < 0)
 			return err;
+
+		/* for MPLS, there is only 1 table with fixed type and flags.
+		 * If either are set in the filter then return nothing.
+		 */
+		if ((filter.table_id && filter.table_id != RT_TABLE_MAIN) ||
+		    (filter.rt_type && filter.rt_type != RTN_UNICAST) ||
+		     filter.flags)
+			return skb->len;
 	}
 
 	index = cb->args[0];
@@ -2092,15 +2123,24 @@ static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb)
 
 	platform_label = rtnl_dereference(net->mpls.platform_label);
 	platform_labels = net->mpls.platform_labels;
+
+	if (filter.filter_set)
+		flags |= NLM_F_DUMP_FILTERED;
+
 	for (; index < platform_labels; index++) {
 		struct mpls_route *rt;
+
 		rt = rtnl_dereference(platform_label[index]);
 		if (!rt)
 			continue;
 
+		if ((filter.dev && !mpls_rt_uses_dev(rt, filter.dev)) ||
+		    (filter.protocol && rt->rt_protocol != filter.protocol))
+			continue;
+
 		if (mpls_dump_route(skb, NETLINK_CB(cb->skb).portid,
 				    cb->nlh->nlmsg_seq, RTM_NEWROUTE,
-				    index, rt, NLM_F_MULTI) < 0)
+				    index, rt, flags) < 0)
 			break;
 	}
 	cb->args[0] = index;
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 8/9] net/ipv6: Bail early if user only wants cloned entries
From: David Ahern @ 2018-10-11 15:06 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern
In-Reply-To: <20181011150627.4010-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Similar to IPv4, IPv6 fib no longer contains cloned routes. If a user
requests a route dump for only cloned entries, no sense walking the FIB
and returning everything.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 net/ipv6/ip6_fib.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index dd6a43874192..0399fafc5136 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -583,10 +583,13 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 	} else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
 		struct rtmsg *rtm = nlmsg_data(nlh);
 
-		if (rtm->rtm_flags & RTM_F_PREFIX)
-			arg.filter.flags = RTM_F_PREFIX;
+		arg.filter.flags = rtm->rtm_flags & (RTM_F_PREFIX | RTM_F_CLONED);
 	}
 
+	/* fib entries are never clones */
+	if (arg.filter.flags & RTM_F_CLONED)
+		return skb->len;
+
 	s_h = cb->args[0];
 	s_e = cb->args[1];
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 6/9] net: Enable kernel side filtering of route dumps
From: David Ahern @ 2018-10-11 15:06 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern
In-Reply-To: <20181011150627.4010-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Update parsing of route dump request to enable kernel side filtering.
Allow filtering results by protocol (e.g., which routing daemon installed
the route), route type (e.g., unicast), table id and nexthop device. These
amount to the low hanging fruit, yet a huge improvement, for dumping
routes.

ip_valid_fib_dump_req is called with RTNL held, so __dev_get_by_index can
be used to look up the device index without taking a reference. From
there filter->dev is only used during dump loops with the lock still held.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 net/ipv4/fib_frontend.c | 45 ++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 40 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 1528b0919951..a99f2c7ba4e6 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -806,7 +806,11 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 			  struct fib_dump_filter *filter,
 			  struct netlink_ext_ack *extack)
 {
+	struct nlattr *tb[RTA_MAX + 1];
 	struct rtmsg *rtm;
+	int err, i;
+
+	ASSERT_RTNL();
 
 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
 		NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request");
@@ -815,8 +819,7 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 
 	rtm = nlmsg_data(nlh);
 	if (rtm->rtm_dst_len || rtm->rtm_src_len  || rtm->rtm_tos   ||
-	    rtm->rtm_table   || rtm->rtm_protocol || rtm->rtm_scope ||
-	    rtm->rtm_type) {
+	    rtm->rtm_scope) {
 		NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request");
 		return -EINVAL;
 	}
@@ -825,9 +828,41 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 		return -EINVAL;
 	}
 
-	if (nlmsg_attrlen(nlh, sizeof(*rtm))) {
-		NL_SET_ERR_MSG(extack, "Invalid data after header in FIB dump request");
-		return -EINVAL;
+	filter->flags    = rtm->rtm_flags;
+	filter->protocol = rtm->rtm_protocol;
+	filter->rt_type  = rtm->rtm_type;
+	filter->table_id = rtm->rtm_table;
+
+	err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
+				 rtm_ipv4_policy, extack);
+	if (err < 0)
+		return err;
+
+	for (i = 0; i <= RTA_MAX; ++i) {
+		int ifindex;
+
+		if (!tb[i])
+			continue;
+
+		switch (i) {
+		case RTA_TABLE:
+			filter->table_id = nla_get_u32(tb[i]);
+			break;
+		case RTA_OIF:
+			ifindex = nla_get_u32(tb[i]);
+			filter->dev = __dev_get_by_index(net, ifindex);
+			if (!filter->dev)
+				return -ENODEV;
+			break;
+		default:
+			NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request");
+			return -EINVAL;
+		}
+	}
+
+	if (filter->flags || filter->protocol || filter->rt_type ||
+	    filter->table_id || filter->dev) {
+		filter->filter_set = 1;
 	}
 
 	return 0;
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 5/9] net: Plumb support for filtering ipv4 and ipv6 multicast route dumps
From: David Ahern @ 2018-10-11 15:06 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern
In-Reply-To: <20181011150627.4010-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Implement kernel side filtering of routes by egress device index and
table id.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/linux/mroute_base.h |  5 +++--
 net/ipv4/ipmr.c             |  2 +-
 net/ipv4/ipmr_base.c        | 33 ++++++++++++++++++++++++++++++++-
 net/ipv6/ip6mr.c            |  2 +-
 4 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index 6675b9f81979..8fc516c47a64 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -7,6 +7,7 @@
 #include <net/net_namespace.h>
 #include <net/sock.h>
 #include <net/fib_notifier.h>
+#include <net/ip_fib.h>
 
 /**
  * struct vif_device - interface representor for multicast routing
@@ -290,7 +291,7 @@ int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
 				 struct sk_buff *skb,
 				 u32 portid, u32 seq, struct mr_mfc *c,
 				 int cmd, int flags),
-		     spinlock_t *lock);
+		     spinlock_t *lock, struct fib_dump_filter *filter);
 
 int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family,
 	    int (*rules_dump)(struct net *net,
@@ -340,7 +341,7 @@ mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
 			     struct sk_buff *skb,
 			     u32 portid, u32 seq, struct mr_mfc *c,
 			     int cmd, int flags),
-		 spinlock_t *lock)
+		 spinlock_t *lock, struct fib_dump_filter *filter)
 {
 	return -EINVAL;
 }
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 44d777058960..f6ad4ef1d3c7 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2539,7 +2539,7 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 	}
 
 	return mr_rtm_dumproute(skb, cb, ipmr_mr_table_iter,
-				_ipmr_fill_mroute, &mfc_unres_lock);
+				_ipmr_fill_mroute, &mfc_unres_lock, &filter);
 }
 
 static const struct nla_policy rtm_ipmr_policy[RTA_MAX + 1] = {
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index 1ad9aa62a97b..647300a55f42 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -268,6 +268,24 @@ int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 }
 EXPORT_SYMBOL(mr_fill_mroute);
 
+static bool mr_mfc_uses_dev(const struct mr_table *mrt,
+			    const struct mr_mfc *c,
+			    const struct net_device *dev)
+{
+	int ct;
+
+	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
+		if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
+			const struct vif_device *vif;
+
+			vif = &mrt->vif_table[ct];
+			if (vif->dev == dev)
+				return true;
+		}
+	}
+	return false;
+}
+
 int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
 		     struct mr_table *(*iter)(struct net *net,
 					      struct mr_table *mrt),
@@ -275,17 +293,26 @@ int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
 				 struct sk_buff *skb,
 				 u32 portid, u32 seq, struct mr_mfc *c,
 				 int cmd, int flags),
-		     spinlock_t *lock)
+		     spinlock_t *lock, struct fib_dump_filter *filter)
 {
 	unsigned int t = 0, e = 0, s_t = cb->args[0], s_e = cb->args[1];
 	struct net *net = sock_net(skb->sk);
 	struct mr_table *mrt;
 	struct mr_mfc *mfc;
 
+	/* multicast does not track protocol or have route type other
+	 * than RTN_MULTICAST
+	 */
+	if (filter->protocol || filter->flags ||
+	    (filter->rt_type && filter->rt_type != RTN_MULTICAST))
+		return 0;
+
 	rcu_read_lock();
 	for (mrt = iter(net, NULL); mrt; mrt = iter(net, mrt)) {
 		if (t < s_t)
 			goto next_table;
+		if (filter->table_id && filter->table_id != mrt->id)
+			goto next_table;
 		list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
 			if (e < s_e)
 				goto next_entry;
@@ -303,6 +330,10 @@ int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
 		list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
 			if (e < s_e)
 				goto next_entry2;
+			if (filter->dev &&
+			    !mr_mfc_uses_dev(mrt, mfc, filter->dev))
+				goto next_entry2;
+
 			if (fill(mrt, skb, NETLINK_CB(cb->skb).portid,
 				 cb->nlh->nlmsg_seq, mfc,
 				 RTM_NEWROUTE, NLM_F_MULTI) < 0) {
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index dbd5166c5599..a7593d1c372c 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -2470,5 +2470,5 @@ static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 	}
 
 	return mr_rtm_dumproute(skb, cb, ip6mr_mr_table_iter,
-				_ip6mr_fill_mroute, &mfc_unres_lock);
+				_ip6mr_fill_mroute, &mfc_unres_lock, &filter);
 }
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 1/9] net: Add struct for fib dump filter
From: David Ahern @ 2018-10-11 15:06 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern
In-Reply-To: <20181011150627.4010-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Add struct fib_dump_filter for options on limiting which routes are
returned in a dump request. The current list is table id, protocol,
route type, rtm_flags and nexthop device index. struct net is needed
to lookup the net_device from the index.

Plumb the new arguments from dump handlers to ip_valid_fib_dump_req.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/ip6_route.h |  1 +
 include/net/ip_fib.h    | 12 +++++++++++-
 net/ipv4/fib_frontend.c |  6 ++++--
 net/ipv4/ipmr.c         |  6 +++++-
 net/ipv6/ip6_fib.c      |  5 +++--
 net/ipv6/ip6mr.c        |  5 ++++-
 net/mpls/af_mpls.c      | 12 ++++++++----
 7 files changed, 36 insertions(+), 11 deletions(-)

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index cef186dbd2ce..7ab119936e69 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -174,6 +174,7 @@ struct rt6_rtnl_dump_arg {
 	struct sk_buff *skb;
 	struct netlink_callback *cb;
 	struct net *net;
+	struct fib_dump_filter filter;
 };
 
 int rt6_dump_route(struct fib6_info *f6i, void *p_arg);
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 9846b79c9ee1..9dde41ad02a1 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -222,6 +222,15 @@ struct fib_table {
 	unsigned long		__data[0];
 };
 
+struct fib_dump_filter {
+	bool			filter_set;
+	unsigned char		protocol;
+	unsigned char		rt_type;
+	u32			table_id;
+	unsigned int		flags;
+	struct net_device	*dev;
+};
+
 int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 		     struct fib_result *res, int fib_flags);
 int fib_table_insert(struct net *, struct fib_table *, struct fib_config *,
@@ -452,6 +461,7 @@ static inline void fib_proc_exit(struct net *net)
 
 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr);
 
-int ip_valid_fib_dump_req(const struct nlmsghdr *nlh,
+int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
+			  struct fib_dump_filter *filter,
 			  struct netlink_ext_ack *extack);
 #endif  /* _NET_FIB_H */
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 038f511c73fa..d0fb9b7efa27 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -802,7 +802,8 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
 	return err;
 }
 
-int ip_valid_fib_dump_req(const struct nlmsghdr *nlh,
+int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
+			  struct fib_dump_filter *filter,
 			  struct netlink_ext_ack *extack)
 {
 	struct rtmsg *rtm;
@@ -837,6 +838,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	const struct nlmsghdr *nlh = cb->nlh;
 	struct net *net = sock_net(skb->sk);
+	struct fib_dump_filter filter = {};
 	unsigned int h, s_h;
 	unsigned int e = 0, s_e;
 	struct fib_table *tb;
@@ -844,7 +846,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 	int dumped = 0, err;
 
 	if (cb->strict_check) {
-		err = ip_valid_fib_dump_req(nlh, cb->extack);
+		err = ip_valid_fib_dump_req(net, nlh, &filter, cb->extack);
 		if (err < 0)
 			return err;
 	}
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 91b0d5671649..44d777058960 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2527,9 +2527,13 @@ static int ipmr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 
 static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 {
+	struct fib_dump_filter filter = {};
+
 	if (cb->strict_check) {
-		int err = ip_valid_fib_dump_req(cb->nlh, cb->extack);
+		int err;
 
+		err = ip_valid_fib_dump_req(sock_net(skb->sk), cb->nlh,
+					    &filter, cb->extack);
 		if (err < 0)
 			return err;
 	}
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index e14d244c551f..6a169794a674 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -566,17 +566,18 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	const struct nlmsghdr *nlh = cb->nlh;
 	struct net *net = sock_net(skb->sk);
+	struct rt6_rtnl_dump_arg arg = {};
 	unsigned int h, s_h;
 	unsigned int e = 0, s_e;
-	struct rt6_rtnl_dump_arg arg;
 	struct fib6_walker *w;
 	struct fib6_table *tb;
 	struct hlist_head *head;
 	int res = 0;
 
 	if (cb->strict_check) {
-		int err = ip_valid_fib_dump_req(nlh, cb->extack);
+		int err;
 
+		err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb->extack);
 		if (err < 0)
 			return err;
 	}
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index d7563ef76518..dbd5166c5599 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -2458,10 +2458,13 @@ static void mrt6msg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt)
 static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	const struct nlmsghdr *nlh = cb->nlh;
+	struct fib_dump_filter filter = {};
 
 	if (cb->strict_check) {
-		int err = ip_valid_fib_dump_req(nlh, cb->extack);
+		int err;
 
+		err = ip_valid_fib_dump_req(sock_net(skb->sk), nlh,
+					    &filter, cb->extack);
 		if (err < 0)
 			return err;
 	}
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 5fe274c47c41..bfcb4759c9ee 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -2032,13 +2032,15 @@ static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event,
 }
 
 #if IS_ENABLED(CONFIG_INET)
-static int mpls_valid_fib_dump_req(const struct nlmsghdr *nlh,
+static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
+				   struct fib_dump_filter *filter,
 				   struct netlink_ext_ack *extack)
 {
-	return ip_valid_fib_dump_req(nlh, extack);
+	return ip_valid_fib_dump_req(net, nlh, filter, extack);
 }
 #else
-static int mpls_valid_fib_dump_req(const struct nlmsghdr *nlh,
+static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
+				   struct fib_dump_filter *filter,
 				   struct netlink_ext_ack *extack)
 {
 	struct rtmsg *rtm;
@@ -2070,14 +2072,16 @@ static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb)
 	const struct nlmsghdr *nlh = cb->nlh;
 	struct net *net = sock_net(skb->sk);
 	struct mpls_route __rcu **platform_label;
+	struct fib_dump_filter filter = {};
 	size_t platform_labels;
 	unsigned int index;
 
 	ASSERT_RTNL();
 
 	if (cb->strict_check) {
-		int err = mpls_valid_fib_dump_req(nlh, cb->extack);
+		int err;
 
+		err = mpls_valid_fib_dump_req(net, nlh, &filter, cb->extack);
 		if (err < 0)
 			return err;
 	}
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 0/9] net: Kernel side filtering for route dumps
From: David Ahern @ 2018-10-11 15:06 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern

From: David Ahern <dsahern@gmail.com>

Implement kernel side filtering of route dumps by protocol (e.g., which
routing daemon installed the route), route type (e.g., unicast), table
id and nexthop device.

iproute2 has been doing this filtering in userspace for years; pushing
the filters to the kernel side reduces the amount of data the kernel
sends and reduces wasted cycles on both sides processing unwanted data.
These initial options provide a huge improvement for efficiently
examining routes on large scale systems.

David Ahern (9):
  net: Add struct for fib dump filter
  net/ipv4: Plumb support for filtering route dumps
  net/ipv6: Plumb support for filtering route dumps
  net/mpls: Plumb support for filtering route dumps
  net: Plumb support for filtering ipv4 and ipv6 multicast route dumps
  net: Enable kernel side filtering of route dumps
  net/mpls: Handle kernel side filtering of route dumps
  net/ipv6: Bail early if user only wants cloned entries
  net/ipv4: Bail early if user only wants prefix entries

 include/linux/mroute_base.h |  5 +--
 include/net/ip6_route.h     |  1 +
 include/net/ip_fib.h        | 14 ++++++--
 net/ipv4/fib_frontend.c     | 64 +++++++++++++++++++++++++++------
 net/ipv4/fib_trie.c         | 37 +++++++++++++------
 net/ipv4/ipmr.c             |  8 +++--
 net/ipv4/ipmr_base.c        | 33 ++++++++++++++++-
 net/ipv6/ip6_fib.c          | 17 +++++++--
 net/ipv6/ip6mr.c            |  7 ++--
 net/ipv6/route.c            | 40 ++++++++++++++++-----
 net/mpls/af_mpls.c          | 86 +++++++++++++++++++++++++++++++++++++++------
 11 files changed, 262 insertions(+), 50 deletions(-)

-- 
2.11.0

^ permalink raw reply

* [PATCH net-next 2/9] net/ipv4: Plumb support for filtering route dumps
From: David Ahern @ 2018-10-11 15:06 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern
In-Reply-To: <20181011150627.4010-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Implement kernel side filtering of routes by table id, egress device index,
protocol and route type.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/ip_fib.h    |  2 +-
 net/ipv4/fib_frontend.c |  5 ++++-
 net/ipv4/fib_trie.c     | 37 ++++++++++++++++++++++++++-----------
 3 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 9dde41ad02a1..68967f4bd024 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -238,7 +238,7 @@ int fib_table_insert(struct net *, struct fib_table *, struct fib_config *,
 int fib_table_delete(struct net *, struct fib_table *, struct fib_config *,
 		     struct netlink_ext_ack *extack);
 int fib_table_dump(struct fib_table *table, struct sk_buff *skb,
-		   struct netlink_callback *cb);
+		   struct netlink_callback *cb, struct fib_dump_filter *filter);
 int fib_table_flush(struct net *net, struct fib_table *table);
 struct fib_table *fib_trie_unmerge(struct fib_table *main_tb);
 void fib_table_flush_external(struct fib_table *table);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index d0fb9b7efa27..1528b0919951 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -866,10 +866,13 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
 			if (e < s_e)
 				goto next;
+			if (filter.table_id && filter.table_id != tb->tb_id)
+				goto next;
+
 			if (dumped)
 				memset(&cb->args[2], 0, sizeof(cb->args) -
 						 2 * sizeof(cb->args[0]));
-			err = fib_table_dump(tb, skb, cb);
+			err = fib_table_dump(tb, skb, cb, &filter);
 			if (err < 0) {
 				if (likely(skb->len))
 					goto out;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 5bc0c89e81e4..237c9f72b265 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -2003,12 +2003,17 @@ void fib_free_table(struct fib_table *tb)
 }
 
 static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
-			     struct sk_buff *skb, struct netlink_callback *cb)
+			     struct sk_buff *skb, struct netlink_callback *cb,
+			     struct fib_dump_filter *filter)
 {
+	unsigned int flags = NLM_F_MULTI;
 	__be32 xkey = htonl(l->key);
 	struct fib_alias *fa;
 	int i, s_i;
 
+	if (filter->filter_set)
+		flags |= NLM_F_DUMP_FILTERED;
+
 	s_i = cb->args[4];
 	i = 0;
 
@@ -2016,25 +2021,35 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
 	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
 		int err;
 
-		if (i < s_i) {
-			i++;
-			continue;
-		}
+		if (i < s_i)
+			goto next;
 
-		if (tb->tb_id != fa->tb_id) {
-			i++;
-			continue;
+		if (tb->tb_id != fa->tb_id)
+			goto next;
+
+		if (filter->filter_set) {
+			if (filter->rt_type && fa->fa_type != filter->rt_type)
+				goto next;
+
+			if ((filter->protocol &&
+			     fa->fa_info->fib_protocol != filter->protocol))
+				goto next;
+
+			if (filter->dev &&
+			    !fib_info_nh_uses_dev(fa->fa_info, filter->dev))
+				goto next;
 		}
 
 		err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
 				    cb->nlh->nlmsg_seq, RTM_NEWROUTE,
 				    tb->tb_id, fa->fa_type,
 				    xkey, KEYLENGTH - fa->fa_slen,
-				    fa->fa_tos, fa->fa_info, NLM_F_MULTI);
+				    fa->fa_tos, fa->fa_info, flags);
 		if (err < 0) {
 			cb->args[4] = i;
 			return err;
 		}
+next:
 		i++;
 	}
 
@@ -2044,7 +2059,7 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
 
 /* rcu_read_lock needs to be hold by caller from readside */
 int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
-		   struct netlink_callback *cb)
+		   struct netlink_callback *cb, struct fib_dump_filter *filter)
 {
 	struct trie *t = (struct trie *)tb->tb_data;
 	struct key_vector *l, *tp = t->kv;
@@ -2057,7 +2072,7 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
 	while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
 		int err;
 
-		err = fn_trie_dump_leaf(l, tb, skb, cb);
+		err = fn_trie_dump_leaf(l, tb, skb, cb, filter);
 		if (err < 0) {
 			cb->args[3] = key;
 			cb->args[2] = count;
-- 
2.11.0

^ permalink raw reply related

* Re: [iproute2-next] tipc: support interface name when activating UDP bearer
From: Stephen Hemminger @ 2018-10-11 15:04 UTC (permalink / raw)
  To: Hoang Le; +Cc: jon.maloy, maloy, ying.xue, netdev, tipc-discussion
In-Reply-To: <20181011020708.7585-1-hoang.h.le@dektech.com.au>

On Thu, 11 Oct 2018 09:07:08 +0700
Hoang Le <hoang.h.le@dektech.com.au> wrote:

This looks fine.

> +static int cmd_bearer_validate_and_get_addr(const char *name, char *straddr)
> +{
> +	struct ifreq ifc;
> +	struct sockaddr_in *ip4addr;
> +	struct sockaddr_in6 *ip6addr;
> +	int fd = 0;
> +
> +	if (!name || !straddr)
> +		return 0;
> +
> +	fd = socket(PF_INET, SOCK_DGRAM, 0);

Will goahead and apply but minor nits.

The initialization of fd to zero is unnecessary.
This function is return 0, -1, or -EINVAL but only caller only cares about
zero or non-zero.

^ permalink raw reply

* RE: [PATCH net-next] hv_netvsc: fix vf serial matching with pci slot info
From: Haiyang Zhang @ 2018-10-11 22:30 UTC (permalink / raw)
  To: Haiyang Zhang, davem@davemloft.net, netdev@vger.kernel.org
  Cc: KY Srinivasan, Stephen Hemminger, olaf@aepfle.de, vkuznets,
	devel@linuxdriverproject.org, linux-kernel@vger.kernel.org
In-Reply-To: <20181011201434.30737-1-haiyangz@linuxonhyperv.com>



> -----Original Message-----
> From: Haiyang Zhang <haiyangz@linuxonhyperv.com>
> Sent: Thursday, October 11, 2018 4:15 PM
> To: davem@davemloft.net; netdev@vger.kernel.org
> Cc: Haiyang Zhang <haiyangz@microsoft.com>; KY Srinivasan
> <kys@microsoft.com>; Stephen Hemminger <sthemmin@microsoft.com>;
> olaf@aepfle.de; vkuznets <vkuznets@redhat.com>;
> devel@linuxdriverproject.org; linux-kernel@vger.kernel.org
> Subject: [PATCH net-next] hv_netvsc: fix vf serial matching with pci slot info
> 
> From: Haiyang Zhang <haiyangz@microsoft.com>
> 
> The VF device's serial number is saved as a string in PCI slot's kobj name, not
> the slot->number. This patch corrects the netvsc driver, so the VF device can be
> successfully paired with synthetic NIC.
> 
> Fixes: 00d7ddba1143 ("hv_netvsc: pair VF based on serial number")
> Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>

Thanks Stephen for the reminder -- I added the "reported-by" here:

Reported-by: Vitaly Kuznetsov <vkuznets@redhat.com>

^ permalink raw reply

* Re: [PATCH v2 1/2] net/ncsi: Add NCSI Broadcom OEM command
From: Vijay Khemka @ 2018-10-11 22:12 UTC (permalink / raw)
  To: Samuel Mendoza-Jonas, David S. Miller, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org
  Cc: openbmc @ lists . ozlabs . org, Justin.Lee1@Dell.com,
	joel@jms.id.au, linux-aspeed@lists.ozlabs.org
In-Reply-To: <7893ec3aa673b6009d00df1646b2820be2fc33a6.camel@mendozajonas.com>

Thanks Sam,
I will take care of this and generate new patch.

Regards
-Vijay

On 10/9/18, 9:40 PM, "Samuel Mendoza-Jonas" <sam@mendozajonas.com> wrote:

    On Tue, 2018-10-09 at 10:48 -0700, Vijay Khemka wrote:
    > This patch adds OEM Broadcom commands and response handling. It also
    > defines OEM Get MAC Address handler to get and configure the device.
    > 
    > ncsi_oem_gma_handler_bcm: This handler send NCSI broadcom command for
    > getting mac address.
    > ncsi_rsp_handler_oem_bcm: This handles response received for all
    > broadcom OEM commands.
    > ncsi_rsp_handler_oem_bcm_gma: This handles get mac address response and
    > set it to device.
    > 
    > Signed-off-by: Vijay Khemka <vijaykhemka@fb.com>
    > ---
    >  net/ncsi/Kconfig       |  6 ++++
    >  net/ncsi/internal.h    |  8 +++++
    >  net/ncsi/ncsi-manage.c | 70 +++++++++++++++++++++++++++++++++++++++++-
    >  net/ncsi/ncsi-pkt.h    |  8 +++++
    >  net/ncsi/ncsi-rsp.c    | 40 +++++++++++++++++++++++-
    >  5 files changed, 130 insertions(+), 2 deletions(-)
    > 
    
    <snip>
    
    >  static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
    >  {
    >  	struct ncsi_dev *nd = &ndp->ndev;
    > @@ -643,9 +676,10 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
    >  	struct ncsi_channel *nc = ndp->active_channel;
    >  	struct ncsi_channel *hot_nc = NULL;
    >  	struct ncsi_cmd_arg nca;
    > +	struct ncsi_oem_gma_handler *nch = NULL;
    >  	unsigned char index;
    >  	unsigned long flags;
    > -	int ret;
    > +	int ret, i;
    >  
    
    I just noticed that if CONFIG_NCSI_OEM_CMD_GET_MAC is not set then this
    generates unused variable warnings for i and nch. Otherwise all looking good!
    
    Regards,
    Sam
    
    
    


^ permalink raw reply

* RE: [RFC PATCH 2/2] net/ncsi: Configure multi-package, multi-channel modes with failover
From: Justin.Lee1 @ 2018-10-11 22:09 UTC (permalink / raw)
  To: sam, netdev; +Cc: davem, linux-kernel, openbmc
In-Reply-To: <8548f1872badb35588d5506da21f0d89bea71127.camel@mendozajonas.com>

Hi Sam,

Please see my comments below.

Thanks,
Justin
 
 
> On Wed, 2018-10-10 at 22:36 +0000, Justin.Lee1@Dell.com wrote:
> > Hi Samuel,
> > 
> > I am still testing your change and have some comments below.
> > 
> > Thanks,
> > Justin
> > 
> > 
> > > This patch extends the ncsi-netlink interface with two new commands and
> > > three new attributes to configure multiple packages and/or channels at
> > > once, and configure specific failover modes.
> > > 
> > > NCSI_CMD_SET_PACKAGE mask and NCSI_CMD_SET_CHANNEL_MASK set a whitelist
> > > of packages or channels allowed to be configured with the
> > > NCSI_ATTR_PACKAGE_MASK and NCSI_ATTR_CHANNEL_MASK attributes
> > > respectively. If one of these whitelists is set only packages or
> > > channels matching the whitelist are considered for the channel queue in
> > > ncsi_choose_active_channel().
> > > 
> > > These commands may also use the NCSI_ATTR_MULTI_FLAG to signal that
> > > multiple packages or channels may be configured simultaneously. NCSI
> > > hardware arbitration (HWA) must be available in order to enable
> > > multi-package mode. Multi-channel mode is always available.
> > > 
> > > If the NCSI_ATTR_CHANNEL_ID attribute is present in the
> > > NCSI_CMD_SET_CHANNEL_MASK command the it sets the preferred channel as
> > > with the NCSI_CMD_SET_INTERFACE command. The combination of preferred
> > > channel and channel whitelist defines a primary channel and the allowed
> > > failover channels.
> > > If the NCSI_ATTR_MULTI_FLAG attribute is also present then the preferred
> > > channel is configured for Tx/Rx and the other channels are enabled only
> > > for Rx.
> > > 
> > > Signed-off-by: Samuel Mendoza-Jonas <sam@mendozajonas.com>
> > > ---
> > >  include/uapi/linux/ncsi.h |  16 +++
> > >  net/ncsi/internal.h       |  11 +-
> > >  net/ncsi/ncsi-aen.c       |   2 +-
> > >  net/ncsi/ncsi-manage.c    | 138 ++++++++++++++++--------
> > >  net/ncsi/ncsi-netlink.c   | 217 +++++++++++++++++++++++++++++++++-----
> > >  net/ncsi/ncsi-rsp.c       |   2 +-
> > >  6 files changed, 312 insertions(+), 74 deletions(-)
> > > 
> > > diff --git a/include/uapi/linux/ncsi.h b/include/uapi/linux/ncsi.h
> > > index 4c292ecbb748..035fba1693f9 100644
> > > --- a/include/uapi/linux/ncsi.h
> > > +++ b/include/uapi/linux/ncsi.h
> > > @@ -23,6 +23,13 @@
> > >   *	optionally the preferred NCSI_ATTR_CHANNEL_ID.
> > >   * @NCSI_CMD_CLEAR_INTERFACE: clear any preferred package/channel combination.
> > >   *	Requires NCSI_ATTR_IFINDEX.
> > > + * @NCSI_CMD_SET_PACKAGE_MASK: set a whitelist of allowed packages.
> > > + * @NCSI_CMD_SET_PACKAGE_MASK: set a whitelist of allowed channels.
> > > + *	Requires NCSI_ATTR_IFINDEX and NCSI_ATTR_PACKAGE_MASK.
> > > + * @NCSI_CMD_SET_PACKAGE_MASK: set a whitelist of allowed channels.
> > > + *	Requires NCSI_ATTR_IFINDEX, NCSI_ATTR_PACKAGE_ID, and
> > > + *	NCSI_ATTR_CHANNEL_MASK. If NCSI_ATTR_CHANNEL_ID is present it sets
> > > + *	the primary channel.
> > >   * @NCSI_CMD_MAX: highest command number
> > >   */
> > 
> > There are some typo in the description.
> > * @NCSI_CMD_SET_PACKAGE_MASK: set a whitelist of allowed packages.
> >  *	Requires NCSI_ATTR_IFINDEX and NCSI_ATTR_PACKAGE_MASK.
> >  * @NCSI_CMD_SET_CHANNEL_MASK: set a whitelist of allowed channels.
> >  *	Requires NCSI_ATTR_IFINDEX, NCSI_ATTR_PACKAGE_ID, and
> >  *	NCSI_ATTR_CHANNEL_MASK. If NCSI_ATTR_CHANNEL_ID is present it sets
> >  *	the primary channel.
> 
> Haha, yes I threw that in at the end, thanks for catching it.
> 
> > 
> > >  enum ncsi_nl_commands {
> > > @@ -30,6 +37,8 @@ enum ncsi_nl_commands {
> > >  	NCSI_CMD_PKG_INFO,
> > >  	NCSI_CMD_SET_INTERFACE,
> > >  	NCSI_CMD_CLEAR_INTERFACE,
> > > +	NCSI_CMD_SET_PACKAGE_MASK,
> > > +	NCSI_CMD_SET_CHANNEL_MASK,
> > >  
> > >  	__NCSI_CMD_AFTER_LAST,
> > >  	NCSI_CMD_MAX = __NCSI_CMD_AFTER_LAST - 1
> > > @@ -43,6 +52,10 @@ enum ncsi_nl_commands {
> > >   * @NCSI_ATTR_PACKAGE_LIST: nested array of NCSI_PKG_ATTR attributes
> > >   * @NCSI_ATTR_PACKAGE_ID: package ID
> > >   * @NCSI_ATTR_CHANNEL_ID: channel ID
> > > + * @NCSI_ATTR_MULTI_FLAG: flag to signal that multi-mode should be enabled with
> > > + *	NCSI_CMD_SET_PACKAGE_MASK or NCSI_CMD_SET_CHANNEL_MASK.
> > > + * @NCSI_ATTR_PACKAGE_MASK: 32-bit mask of allowed packages.
> > > + * @NCSI_ATTR_CHANNEL_MASK: 32-bit mask of allowed channels.
> > >   * @NCSI_ATTR_MAX: highest attribute number
> > >   */
> > >  enum ncsi_nl_attrs {
> > > @@ -51,6 +64,9 @@ enum ncsi_nl_attrs {
> > >  	NCSI_ATTR_PACKAGE_LIST,
> > >  	NCSI_ATTR_PACKAGE_ID,
> > >  	NCSI_ATTR_CHANNEL_ID,
> > > +	NCSI_ATTR_MULTI_FLAG,
> > > +	NCSI_ATTR_PACKAGE_MASK,
> > > +	NCSI_ATTR_CHANNEL_MASK,
> > 
> > Is there a case that we might set these two masks at the same time?
> > If not, maybe we can just have one generic MASK attribute.
> > 
> 
> I thought of this too: not yet, but I wonder if we might in the future.
> I'll have a think about it.
> 
> > >  
> > >  	__NCSI_ATTR_AFTER_LAST,
> > >  	NCSI_ATTR_MAX = __NCSI_ATTR_AFTER_LAST - 1
> > > diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h
> > > index 3d0a33b874f5..8437474d0a78 100644
> > > --- a/net/ncsi/internal.h
> > > +++ b/net/ncsi/internal.h
> > > @@ -213,6 +213,10 @@ struct ncsi_package {
> > >  	unsigned int         channel_num; /* Number of channels     */
> > >  	struct list_head     channels;    /* List of chanels        */
> > >  	struct list_head     node;        /* Form list of packages  */
> > > +
> > > +	bool                 multi_channel; /* Enable multiple channels  */
> > > +	u32                  channel_whitelist; /* Channels to configure */
> > > +	struct ncsi_channel  *preferred_channel; /* Primary channel      */
> > >  };
> > >  
> > >  struct ncsi_request {
> > > @@ -280,8 +284,6 @@ struct ncsi_dev_priv {
> > >  	unsigned int        package_num;     /* Number of packages         */
> > >  	struct list_head    packages;        /* List of packages           */
> > >  	struct ncsi_channel *hot_channel;    /* Channel was ever active    */
> > > -	struct ncsi_package *force_package;  /* Force a specific package   */
> > > -	struct ncsi_channel *force_channel;  /* Force a specific channel   */
> > >  	struct ncsi_request requests[256];   /* Request table              */
> > >  	unsigned int        request_id;      /* Last used request ID       */
> > >  #define NCSI_REQ_START_IDX	1
> > > @@ -294,6 +296,9 @@ struct ncsi_dev_priv {
> > >  	struct list_head    node;            /* Form NCSI device list      */
> > >  #define NCSI_MAX_VLAN_VIDS	15
> > >  	struct list_head    vlan_vids;       /* List of active VLAN IDs */
> > > +
> > > +	bool                multi_package;   /* Enable multiple packages   */
> > > +	u32                 package_whitelist; /* Packages to configure    */
> > >  };
> > >  
> > >  struct ncsi_cmd_arg {
> > > @@ -345,6 +350,8 @@ struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp,
> > >  void ncsi_free_request(struct ncsi_request *nr);
> > >  struct ncsi_dev *ncsi_find_dev(struct net_device *dev);
> > >  int ncsi_process_next_channel(struct ncsi_dev_priv *ndp);
> > > +bool ncsi_channel_is_last(struct ncsi_dev_priv *ndp,
> > > +			  struct ncsi_channel *channel);
> > >  
> > >  /* Packet handlers */
> > >  u32 ncsi_calculate_checksum(unsigned char *data, int len);
> > > diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c
> > > index 65f47a648be3..eac56aee30c4 100644
> > > --- a/net/ncsi/ncsi-aen.c
> > > +++ b/net/ncsi/ncsi-aen.c
> > > @@ -86,7 +86,7 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp,
> > >  	    !(state == NCSI_CHANNEL_ACTIVE && !(data & 0x1)))
> > >  		return 0;
> > >  
> > > -	if (state == NCSI_CHANNEL_ACTIVE)
> > > +	if (state == NCSI_CHANNEL_ACTIVE && ncsi_channel_is_last(ndp, nc))
> > >  		ndp->flags |= NCSI_DEV_RESHUFFLE;
> > >  
> > >  	ncsi_stop_channel_monitor(nc);
> > > diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
> > > index 665bee25ec44..6a55df700bcb 100644
> > > --- a/net/ncsi/ncsi-manage.c
> > > +++ b/net/ncsi/ncsi-manage.c
> > > @@ -27,6 +27,24 @@
> > >  LIST_HEAD(ncsi_dev_list);
> > >  DEFINE_SPINLOCK(ncsi_dev_lock);
> > >  
> > > +/* Returns true if the given channel is the last channel available */
> > > +bool ncsi_channel_is_last(struct ncsi_dev_priv *ndp,
> > > +			  struct ncsi_channel *channel)
> > > +{
> > > +	struct ncsi_package *np;
> > > +	struct ncsi_channel *nc;
> > > +
> > > +	NCSI_FOR_EACH_PACKAGE(ndp, np)
> > > +		NCSI_FOR_EACH_CHANNEL(np, nc) {
> > > +			if (nc == channel)
> > > +				continue;
> > > +			if (nc->state == NCSI_CHANNEL_ACTIVE)
> > > +				return false;
> > > +		}
> > > +
> > > +	return true;
> > > +}
> > > +
> > >  static void ncsi_report_link(struct ncsi_dev_priv *ndp, bool force_down)
> > >  {
> > >  	struct ncsi_dev *nd = &ndp->ndev;
> > > @@ -266,6 +284,7 @@ struct ncsi_package *ncsi_add_package(struct ncsi_dev_priv *ndp,
> > >  	np->ndp = ndp;
> > >  	spin_lock_init(&np->lock);
> > >  	INIT_LIST_HEAD(&np->channels);
> > > +	np->channel_whitelist = UINT_MAX;
> > >  
> > >  	spin_lock_irqsave(&ndp->lock, flags);
> > >  	tmp = ncsi_find_package(ndp, id);
> > > @@ -633,6 +652,34 @@ static int set_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc,
> > >  	return 0;
> > >  }
> > >  
> > > +/* Determine if a given channel should be the Tx channel */
> > > +bool ncsi_channel_is_tx(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc)
> > > +{
> > > +	struct ncsi_package *np = nc->package;
> > > +	struct ncsi_channel *channel;
> > > +	struct ncsi_channel_mode *ncm;

Learn from Dave. All local variable declarations from longest to shortest
line. :)

> > > +
> > > +	NCSI_FOR_EACH_CHANNEL(np, channel) {
> > > +		ncm = &channel->modes[NCSI_MODE_TX_ENABLE];
> > > +		/* Another channel is already Tx */
> > > +		if (ncm->enable)
> > > +			return false;
> > > +	}
> > > +
> > > +	if (!np->preferred_channel)
> > > +		return true;
> > > +
> > > +	if (np->preferred_channel == nc)
> > > +		return true;
> > > +
> > > +	/* The preferred channel is not in the queue and not active */
> > > +	if (list_empty(&np->preferred_channel->link) &&
> > > +	    np->preferred_channel->state != NCSI_CHANNEL_ACTIVE)
> > > +		return true;
> > > +
> > > +	return false;
> > > +}
> > > +
> > >  static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
> > >  {
> > >  	struct ncsi_dev *nd = &ndp->ndev;
> > > @@ -745,18 +792,22 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
> > >  		} else if (nd->state == ncsi_dev_state_config_ebf) {
> > >  			nca.type = NCSI_PKT_CMD_EBF;
> > >  			nca.dwords[0] = nc->caps[NCSI_CAP_BC].cap;
> > > -			nd->state = ncsi_dev_state_config_ecnt;
> > > +			if (ncsi_channel_is_tx(ndp, nc))
> > > +				nd->state = ncsi_dev_state_config_ecnt;
> > > +			else
> > > +				nd->state = ncsi_dev_state_config_ec;
> > >  #if IS_ENABLED(CONFIG_IPV6)
> > >  			if (ndp->inet6_addr_num > 0 &&
> > >  			    (nc->caps[NCSI_CAP_GENERIC].cap &
> > >  			     NCSI_CAP_GENERIC_MC))
> > >  				nd->state = ncsi_dev_state_config_egmf;
> > > -			else
> > > -				nd->state = ncsi_dev_state_config_ecnt;
> > >  		} else if (nd->state == ncsi_dev_state_config_egmf) {
> > >  			nca.type = NCSI_PKT_CMD_EGMF;
> > >  			nca.dwords[0] = nc->caps[NCSI_CAP_MC].cap;
> > > -			nd->state = ncsi_dev_state_config_ecnt;
> > > +			if (ncsi_channel_is_tx(ndp, nc))
> > > +				nd->state = ncsi_dev_state_config_ecnt;
> > > +			else
> > > +				nd->state = ncsi_dev_state_config_ec;
> > >  #endif /* CONFIG_IPV6 */
> > >  		} else if (nd->state == ncsi_dev_state_config_ecnt) {
> > >  			nca.type = NCSI_PKT_CMD_ECNT;
> > > @@ -840,43 +891,35 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
> > >  
> > >  static int ncsi_choose_active_channel(struct ncsi_dev_priv *ndp)
> > >  {
> > > -	struct ncsi_package *np, *force_package;
> > > -	struct ncsi_channel *nc, *found, *hot_nc, *force_channel;
> > > +	struct ncsi_package *np;
> > > +	struct ncsi_channel *nc, *found, *hot_nc;
> > >  	struct ncsi_channel_mode *ncm;
> > > -	unsigned long flags;
> > > +	unsigned long flags, cflags;
> > > +	bool with_link;

All local variable declarations from longest to shortest
line.

> > >  
> > >  	spin_lock_irqsave(&ndp->lock, flags);
> > >  	hot_nc = ndp->hot_channel;
> > > -	force_channel = ndp->force_channel;
> > > -	force_package = ndp->force_package;
> > >  	spin_unlock_irqrestore(&ndp->lock, flags);
> > >  
> > > -	/* Force a specific channel whether or not it has link if we have been
> > > -	 * configured to do so
> > > -	 */
> > > -	if (force_package && force_channel) {
> > > -		found = force_channel;
> > > -		ncm = &found->modes[NCSI_MODE_LINK];
> > > -		if (!(ncm->data[2] & 0x1))
> > > -			netdev_info(ndp->ndev.dev,
> > > -				    "NCSI: Channel %u forced, but it is link down\n",
> > > -				    found->id);
> > > -		goto out;
> > > -	}
> > > -
> > > -	/* The search is done once an inactive channel with up
> > > -	 * link is found.
> > > +	/* By default the search is done once an inactive channel with up
> > > +	 * link is found, unless a preferred channel is set.
> > > +	 * If multi_package or multi_channel are configured all channels in the
> > > +	 * whitelist with link are added to the channel queue.
> > >  	 */
> > >  	found = NULL;
> > > +	with_link = false;
> > >  	NCSI_FOR_EACH_PACKAGE(ndp, np) {
> > > -		if (ndp->force_package && np != ndp->force_package)
> > > +		if (!(ndp->package_whitelist & (0x1 << np->id)))
> > >  			continue;
> > >  		NCSI_FOR_EACH_CHANNEL(np, nc) {
> > > -			spin_lock_irqsave(&nc->lock, flags);
> > > +			if (!(np->channel_whitelist & (0x1 << nc->id)))
> > > +				continue;
> > > +
> > > +			spin_lock_irqsave(&nc->lock, cflags);
> > >  
> > >  			if (!list_empty(&nc->link) ||
> > >  			    nc->state != NCSI_CHANNEL_INACTIVE) {
> > > -				spin_unlock_irqrestore(&nc->lock, flags);
> > > +				spin_unlock_irqrestore(&nc->lock, cflags);
> > >  				continue;
> > >  			}
> > >  
> > > @@ -888,32 +931,42 @@ static int ncsi_choose_active_channel(struct ncsi_dev_priv *ndp)
> > >  
> > >  			ncm = &nc->modes[NCSI_MODE_LINK];
> > >  			if (ncm->data[2] & 0x1) {
> > 
> > This data will not be updated if the channel monitor for it is not running.
> > If I move the cable from the current configured channel to the other channel,
> > NC-SI module will not detect the link status as the other channel is not configured
> > and AEN will not happen.
> > Is it per design that NC-SI module will always use the first interface with the link?
> 
> We'll check the link state of every channel at init, and per-package on
> suspend events, but you're right that we only get AENs right now from
> configured channels. There's probably no reason why we could at least
> enable AENs on all channels even if we don't use them for network, maybe
> during the package probe step.
> 

To receive the AEN packet, the channel (RX) needs to be enabled.
It seems that we need to send Get Link Status command to all channels before choosing
The active channel.

> > 
> > > -				spin_unlock_irqrestore(&nc->lock, flags);
> > >  				found = nc;
> > > -				goto out;
> > > +				with_link = true;
> > > +
> > > +				spin_lock_irqsave(&ndp->lock, flags);
> > > +				list_add_tail_rcu(&found->link,
> > > +						  &ndp->channel_queue);
> > > +				spin_unlock_irqrestore(&ndp->lock, flags);
> > > +
> > > +				netdev_dbg(ndp->ndev.dev,
> > > +					   "NCSI: Channel %u added to queue (link %s)\n",
> > > +					   found->id,
> > > +					   ncm->data[2] & 0x1 ? "up" : "down");
> > >  			}

If multi_channel is enabled, the interface without link still needs to be processed.
Something likes below?
			} else if (np->multi_channel) {
				found = nc;

				spin_lock_irqsave(&ndp->lock, flags);
				list_add_tail_rcu(&found->link,
						  &ndp->channel_queue);
				spin_unlock_irqrestore(&ndp->lock, flags);

				netdev_dbg(ndp->ndev.dev,
					   "NCSI: pkg %u ch %u added to queue (link %s)\n",
					   found->package->id,
					   found->id,
					   ncm->data[2] & 0x1 ? "up" : "down");
			}

> > > +			spin_unlock_irqrestore(&nc->lock, cflags);
> > >  
> > > -			spin_unlock_irqrestore(&nc->lock, flags);
> > > +			if (with_link && !np->multi_channel)
> > > +				break;
> > >  		}
> > > +		if (with_link && !ndp->multi_package)
> > > +			break;
> > >  	}
> > >  
> > > -	if (!found) {
> > > +	if (!with_link && found) {
> > > +		netdev_info(ndp->ndev.dev,
> > > +			    "NCSI: No channel with link found, configuring channel %u\n",
> > > +			    found->id);
> > > +		spin_lock_irqsave(&ndp->lock, flags);
> > > +		list_add_tail_rcu(&found->link, &ndp->channel_queue);
> > > +		spin_unlock_irqrestore(&ndp->lock, flags);
> > > +	} else if (!found) {
> > >  		netdev_warn(ndp->ndev.dev,
> > > -			    "NCSI: No channel found with link\n");
> > > +			    "NCSI: No channel found to configure!\n");
> > >  		ncsi_report_link(ndp, true);
> > >  		return -ENODEV;
> > >  	}
> > >  
> > > -	ncm = &found->modes[NCSI_MODE_LINK];
> > > -	netdev_dbg(ndp->ndev.dev,
> > > -		   "NCSI: Channel %u added to queue (link %s)\n",
> > > -		   found->id, ncm->data[2] & 0x1 ? "up" : "down");
> > > -
> > > -out:
> > > -	spin_lock_irqsave(&ndp->lock, flags);
> > > -	list_add_tail_rcu(&found->link, &ndp->channel_queue);
> > > -	spin_unlock_irqrestore(&ndp->lock, flags);
> > > -
> > >  	return ncsi_process_next_channel(ndp);
> > >  }
> > >  
> > > @@ -1428,6 +1481,7 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev,
> > >  	INIT_LIST_HEAD(&ndp->channel_queue);
> > >  	INIT_LIST_HEAD(&ndp->vlan_vids);
> > >  	INIT_WORK(&ndp->work, ncsi_dev_work);
> > > +	ndp->package_whitelist = UINT_MAX;
> > >  
> > >  	/* Initialize private NCSI device */
> > >  	spin_lock_init(&ndp->lock);
> > > diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c
> > > index 32cb7751d216..33a091e6f466 100644
> > > --- a/net/ncsi/ncsi-netlink.c
> > > +++ b/net/ncsi/ncsi-netlink.c
> > 
> > Is the following missed in the patch?
> > static const struct nla_policy ncsi_genl_policy[NCSI_ATTR_MAX + 1] = {
> > ...
> > 	[NCSI_ATTR_MULTI_FLAG] =	{ .type = NLA_FLAG },
> > 	[NCSI_ATTR_PACKAGE_MASK] =	{ .type = NLA_U32 },
> > 	[NCSI_ATTR_CHANNEL_MASK] =	{ .type = NLA_U32 },
> 
> Oops, added.
> 
> > 
> > > @@ -67,7 +67,7 @@ static int ncsi_write_channel_info(struct sk_buff *skb,
> > >  	nla_put_u32(skb, NCSI_CHANNEL_ATTR_LINK_STATE, m->data[2]);
> > >  	if (nc->state == NCSI_CHANNEL_ACTIVE)
> > >  		nla_put_flag(skb, NCSI_CHANNEL_ATTR_ACTIVE);
> > > -	if (ndp->force_channel == nc)
> > > +	if (nc == nc->package->preferred_channel)
> > >  		nla_put_flag(skb, NCSI_CHANNEL_ATTR_FORCED);
> > >  
> > >  	nla_put_u32(skb, NCSI_CHANNEL_ATTR_VERSION_MAJOR, nc->version.version);
> > > @@ -112,7 +112,7 @@ static int ncsi_write_package_info(struct sk_buff *skb,
> > >  		if (!pnest)
> > >  			return -ENOMEM;
> > >  		nla_put_u32(skb, NCSI_PKG_ATTR_ID, np->id);
> > > -		if (ndp->force_package == np)
> > > +		if ((0x1 << np->id) == ndp->package_whitelist)
> > >  			nla_put_flag(skb, NCSI_PKG_ATTR_FORCED);
> > >  		cnest = nla_nest_start(skb, NCSI_PKG_ATTR_CHANNEL_LIST);
> > >  		if (!cnest) {
> > > @@ -288,45 +288,54 @@ static int ncsi_set_interface_nl(struct sk_buff *msg, struct genl_info *info)
> > >  	package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]);
> > >  	package = NULL;
> > >  
> > > -	spin_lock_irqsave(&ndp->lock, flags);
> > > -
> > >  	NCSI_FOR_EACH_PACKAGE(ndp, np)
> > >  		if (np->id == package_id)
> > >  			package = np;
> > >  	if (!package) {
> > >  		/* The user has set a package that does not exist */
> > > -		spin_unlock_irqrestore(&ndp->lock, flags);
> > >  		return -ERANGE;
> > >  	}
> > >  
> > >  	channel = NULL;
> > > -	if (!info->attrs[NCSI_ATTR_CHANNEL_ID]) {
> > > -		/* Allow any channel */
> > > -		channel_id = NCSI_RESERVED_CHANNEL;
> > > -	} else {
> > > +	if (info->attrs[NCSI_ATTR_CHANNEL_ID]) {
> > >  		channel_id = nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_ID]);
> > >  		NCSI_FOR_EACH_CHANNEL(package, nc)
> > > -			if (nc->id == channel_id)
> > > +			if (nc->id == channel_id) {
> > >  				channel = nc;
> > > +				break;
> > > +			}
> > > +		if (!channel) {
> > > +			netdev_info(ndp->ndev.dev,
> > > +				    "NCSI: Channel %u does not exist!\n",
> > > +				    channel_id);
> > > +			return -ERANGE;
> > > +		}
> > >  	}
> > >  
> > > -	if (channel_id != NCSI_RESERVED_CHANNEL && !channel) {
> > > -		/* The user has set a channel that does not exist on this
> > > -		 * package
> > > -		 */
> > > -		spin_unlock_irqrestore(&ndp->lock, flags);
> > > -		netdev_info(ndp->ndev.dev, "NCSI: Channel %u does not exist!\n",
> > > -			    channel_id);
> > > -		return -ERANGE;
> > > -	}
> > > -
> > > -	ndp->force_package = package;
> > > -	ndp->force_channel = channel;
> > > +	spin_lock_irqsave(&ndp->lock, flags);
> > > +	ndp->package_whitelist = 0x1 << package->id;
> > > +	ndp->multi_package = false;
> > >  	spin_unlock_irqrestore(&ndp->lock, flags);
> > >  
> > > -	netdev_info(ndp->ndev.dev, "Set package 0x%x, channel 0x%x%s as preferred\n",
> > > -		    package_id, channel_id,
> > > -		    channel_id == NCSI_RESERVED_CHANNEL ? " (any)" : "");
> > > +	spin_lock_irqsave(&package->lock, flags);
> > > +	package->multi_channel = false;
> > > +	if (channel) {
> > > +		package->channel_whitelist = 0x1 << channel->id;
> > > +		package->preferred_channel = channel;
> > > +	} else {
> > > +		/* Allow any channel */
> > > +		package->channel_whitelist = UINT_MAX;
> > > +		package->preferred_channel = NULL;
> > > +	}
> > > +	spin_unlock_irqrestore(&package->lock, flags);
> > > +
> > > +	if (channel)
> > > +		netdev_info(ndp->ndev.dev,
> > > +			    "Set package 0x%x, channel 0x%x as preferred\n",
> > > +			    package_id, channel_id);
> > > +	else
> > > +		netdev_info(ndp->ndev.dev, "Set package 0x%x as preferred\n",
> > > +			    package_id);
> > >  
> > >  	/* Bounce the NCSI channel to set changes */
> > >  	ncsi_stop_dev(&ndp->ndev);
> > > @@ -338,6 +347,7 @@ static int ncsi_set_interface_nl(struct sk_buff *msg, struct genl_info *info)
> > >  static int ncsi_clear_interface_nl(struct sk_buff *msg, struct genl_info *info)
> > >  {
> > >  	struct ncsi_dev_priv *ndp;
> > > +	struct ncsi_package *np;
> > >  	unsigned long flags;
> > >  
> > >  	if (!info || !info->attrs)
> > > @@ -351,11 +361,19 @@ static int ncsi_clear_interface_nl(struct sk_buff *msg, struct genl_info *info)
> > >  	if (!ndp)
> > >  		return -ENODEV;
> > >  
> > > -	/* Clear any override */
> > > +	/* Reset any whitelists and disable multi mode */
> > >  	spin_lock_irqsave(&ndp->lock, flags);
> > > -	ndp->force_package = NULL;
> > > -	ndp->force_channel = NULL;
> > > +	ndp->package_whitelist = UINT_MAX;
> > > +	ndp->multi_package = false;
> > >  	spin_unlock_irqrestore(&ndp->lock, flags);
> > > +
> > > +	NCSI_FOR_EACH_PACKAGE(ndp, np) {
> > > +		spin_lock_irqsave(&np->lock, flags);
> > > +		np->multi_channel = false;
> > > +		np->channel_whitelist = UINT_MAX;
> > > +		np->preferred_channel = NULL;
> > > +		spin_unlock_irqrestore(&np->lock, flags);
> > > +	}
> > >  	netdev_info(ndp->ndev.dev, "NCSI: Cleared preferred package/channel\n");
> > >  
> > >  	/* Bounce the NCSI channel to set changes */
> > > @@ -365,6 +383,137 @@ static int ncsi_clear_interface_nl(struct sk_buff *msg, struct genl_info *info)
> > >  	return 0;
> > >  }
> > >  
> > > +static int ncsi_set_package_mask_nl(struct sk_buff *msg,
> > > +				    struct genl_info *info)
> > > +{
> > > +	struct ncsi_dev_priv *ndp;
> > > +	unsigned long flags;
> > > +	int rc;
> > > +
> > > +	if (!info || !info->attrs)
> > > +		return -EINVAL;
> > > +
> > > +	if (!info->attrs[NCSI_ATTR_IFINDEX])
> > > +		return -EINVAL;
> > > +
> > > +	if (!info->attrs[NCSI_ATTR_PACKAGE_MASK])
> > > +		return -EINVAL;
> > > +
> > > +	ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)),
> > > +			       nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX]));
> > > +	if (!ndp)
> > > +		return -ENODEV;
> > > +
> > > +	spin_lock_irqsave(&ndp->lock, flags);
> > > +	ndp->package_whitelist =
> > > +		nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_MASK]);
> > > +
> > > +	if (nla_get_flag(info->attrs[NCSI_ATTR_MULTI_FLAG])) {
> > > +		if (ndp->flags & NCSI_DEV_HWA) {
> > > +			ndp->multi_package = true;
> > > +			rc = 0;
> > > +		} else {
> > > +			netdev_err(ndp->ndev.dev,
> > > +				   "NCSI: Can't use multiple packages without HWA\n");
> > > +			rc = -EPERM;
> > > +		}
> > > +	} else {
> > > +		rc = 0;
> > > +	}

If the flag is not set, do we need to clear the multi_package flag? It is possible that it is
user's intension to disable the multi-mode.
	} else {
		ndp->multi_package = false;
		rc = 0;
	}

> > > +
> > > +	spin_unlock_irqrestore(&ndp->lock, flags);
> > > +
> > > +	if (!rc) {
> > > +		/* Bounce the NCSI channel to set changes */
> > > +		ncsi_stop_dev(&ndp->ndev);
> > > +		ncsi_start_dev(&ndp->ndev);
> > 
> > Is it possible to delay the restart? If we have two packages, we might send
> > set_package_mask command once and set_channel_mask command twice.
> > We will see the unnecessary reconfigurations in a very short period time.
> 
> We could probably do with a better way of bouncing the link here too. I
> suppose we could schedule the actual link 'bounce' to occur a second or
> two later, and delay if we receive new configuration in that time.
> Perhaps something outside of the scope of this patch though.
> 
> > 
> > > +	}
> > > +
> > > +	return rc;
> > > +}
> > > +
> > > +static int ncsi_set_channel_mask_nl(struct sk_buff *msg,
> > > +				    struct genl_info *info)
> > > +{
> > > +	struct ncsi_package *np, *package;
> > > +	struct ncsi_channel *nc, *channel;
> > > +	struct ncsi_dev_priv *ndp;
> > > +	unsigned long flags;
> > > +	u32 package_id, channel_id;

All local variable declarations from longest to shortest
line.

> > > +
> > > +	if (!info || !info->attrs)
> > > +		return -EINVAL;
> > > +
> > > +	if (!info->attrs[NCSI_ATTR_IFINDEX])
> > > +		return -EINVAL;
> > > +
> > > +	if (!info->attrs[NCSI_ATTR_PACKAGE_ID])
> > > +		return -EINVAL;
> > > +
> > > +	if (!info->attrs[NCSI_ATTR_CHANNEL_MASK])
> > > +		return -EINVAL;
> > > +
> > > +	ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)),
> > > +			       nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX]));
> > > +	if (!ndp)
> > > +		return -ENODEV;
> > > +
> > > +	package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]);
> > > +	package = NULL;
> > > +	NCSI_FOR_EACH_PACKAGE(ndp, np)
> > > +		if (np->id == package_id) {
> > > +			package = np;
> > > +			break;
> > > +		}
> > > +	if (!package)
> > > +		return -ERANGE;
> > > +
> > > +	spin_lock_irqsave(&package->lock, flags);
> > > +
> > > +	channel = NULL;
> > > +	if (info->attrs[NCSI_ATTR_CHANNEL_ID]) {
> > > +		channel_id = nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_ID]);
> > > +		NCSI_FOR_EACH_CHANNEL(np, nc)
> > > +			if (nc->id == channel_id) {
> > > +				channel = nc;
> > > +				break;
> > > +			}
> > > +		if (!channel) {
> > > +			spin_unlock_irqrestore(&package->lock, flags);
> > > +			return -ERANGE;
> > > +		}
> > > +		netdev_dbg(ndp->ndev.dev,
> > > +			   "NCSI: Channel %u set as preferred channel\n",
> > > +			   channel->id);
> > > +	}
> > > +
> > > +	package->channel_whitelist =
> > > +		nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_MASK]);
> > > +	if (package->channel_whitelist == 0)
> > > +		netdev_dbg(ndp->ndev.dev,
> > > +			   "NCSI: Package %u set to all channels disabled\n",
> > > +			   package->id);
> > > +
> > > +	package->preferred_channel = channel;
> > > +
> > > +	if (nla_get_flag(info->attrs[NCSI_ATTR_MULTI_FLAG])) {
> > > +		package->multi_channel = true;
> > > +		netdev_info(ndp->ndev.dev,
> > > +			    "NCSI: Multi-channel enabled on package %u\n",
> > > +			    package_id);
> > > +	} else {
> > > +		package->multi_channel = false;
> > > +	}
> > > +
> > > +	spin_unlock_irqrestore(&package->lock, flags);
> > > +
> > > +	/* Bounce the NCSI channel to set changes */
> > > +	ncsi_stop_dev(&ndp->ndev);
> > > +	ncsi_start_dev(&ndp->ndev);
> > 
> > Same question as set_package_mask function.
> > Is it possible to delay the restart? If we have two packages, we might send
> > set_package_mask command once and set_channel_mask command twice.
> > We will see the unnecessary reconfigurations in a very short period time.
> > 
> > > +
> > > +	return 0;
> > > +}
> > > +
> > >  static const struct genl_ops ncsi_ops[] = {
> > >  	{
> > >  		.cmd = NCSI_CMD_PKG_INFO,
> > > @@ -385,6 +534,18 @@ static const struct genl_ops ncsi_ops[] = {
> > >  		.doit = ncsi_clear_interface_nl,
> > >  		.flags = GENL_ADMIN_PERM,
> > >  	},
> > > +	{
> > > +		.cmd = NCSI_CMD_SET_PACKAGE_MASK,
> > > +		.policy = ncsi_genl_policy,
> > > +		.doit = ncsi_set_package_mask_nl,
> > > +		.flags = GENL_ADMIN_PERM,
> > > +	},
> > > +	{
> > > +		.cmd = NCSI_CMD_SET_CHANNEL_MASK,
> > > +		.policy = ncsi_genl_policy,
> > > +		.doit = ncsi_set_channel_mask_nl,
> > > +		.flags = GENL_ADMIN_PERM,
> > > +	},
> > >  };
> > >  
> > >  static struct genl_family ncsi_genl_family __ro_after_init = {
> > > diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c
> > > index d66b34749027..02ce7626b579 100644
> > > --- a/net/ncsi/ncsi-rsp.c
> > > +++ b/net/ncsi/ncsi-rsp.c
> > > @@ -241,7 +241,7 @@ static int ncsi_rsp_handler_dcnt(struct ncsi_request *nr)
> > >  	if (!ncm->enable)
> > >  		return 0;
> > >  
> > > -	ncm->enable = 1;
> > > +	ncm->enable = 0;
> > >  	return 0;
> > >  }
> > >  
> > > -- 
> > > 2.19.0
> > 
> > 


^ permalink raw reply

* [PATCH net] net: bcmgenet: Poll internal PHY for GENETv5
From: Florian Fainelli @ 2018-10-11 22:06 UTC (permalink / raw)
  To: netdev; +Cc: Florian Fainelli, Doug Berger, David S. Miller, open list

On GENETv5, there is a hardware issue which prevents the GENET hardware
from generating a link UP interrupt when the link is operating at
10Mbits/sec. Since we do not have any way to configure the link
detection logic, fallback to polling in that case.

Fixes: 421380856d9c ("net: bcmgenet: add support for the GENETv5 hardware")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/ethernet/broadcom/genet/bcmmii.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/genet/bcmmii.c b/drivers/net/ethernet/broadcom/genet/bcmmii.c
index 4241ae928d4a..34af5f1569c8 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmmii.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmmii.c
@@ -321,9 +321,12 @@ int bcmgenet_mii_probe(struct net_device *dev)
 	phydev->advertising = phydev->supported;
 
 	/* The internal PHY has its link interrupts routed to the
-	 * Ethernet MAC ISRs
+	 * Ethernet MAC ISRs. On GENETv5 there is a hardware issue
+	 * that prevents the signaling of link UP interrupts when
+	 * the link operates at 10Mbps, so fallback to polling for
+	 * those versions of GENET.
 	 */
-	if (priv->internal_phy)
+	if (priv->internal_phy && !GENET_IS_V5(priv))
 		dev->phydev->irq = PHY_IGNORE_INTERRUPT;
 
 	return 0;
-- 
2.17.1

^ permalink raw reply related

* Re: [virtio-dev] Re: [PATCH net-next v2 0/5] virtio: support packed ring
From: Tiwei Bie @ 2018-10-11 14:13 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Jason Wang, virtualization, linux-kernel, netdev, virtio-dev,
	wexu, jfreimann, maxime.coquelin, zhihong.wang
In-Reply-To: <20181011094705-mutt-send-email-mst@kernel.org>

On Thu, Oct 11, 2018 at 09:48:48AM -0400, Michael S. Tsirkin wrote:
> On Thu, Oct 11, 2018 at 08:12:21PM +0800, Tiwei Bie wrote:
> > > > But if it's not too late, I second for a OUT_OF_ORDER feature.
> > > > Starting from in order can have much simpler code in driver.
> > > > 
> > > > Thanks
> > > 
> > > It's tricky to change the flag polarity because of compatibility
> > > with legacy interfaces. Why is this such a big deal?
> > > 
> > > Let's teach drivers about IN_ORDER, then if devices
> > > are in order it will get enabled by default.
> > 
> > Yeah, make sense.
> > 
> > Besides, I have done some further profiling and debugging
> > both in kernel driver and DPDK vhost. Previously I was mislead
> > by a bug in vhost code. I will send a patch to fix that bug.
> > With that bug fixed, the performance of packed ring in the
> > test between kernel driver and DPDK vhost is better now.
> 
> OK, if we get a performance gain on the virtio side, we can finally
> upstream it. If you see that please re-post ASAP so we can
> put it in the next kernel release.

Got it, I will re-post ASAP.

Thanks!


> 
> -- 
> MST

^ permalink raw reply

* Re: Possible bug in traffic control?
From: Josh Coombs @ 2018-10-11 14:05 UTC (permalink / raw)
  To: xiyou.wangcong; +Cc: netdev
In-Reply-To: <CAM_iQpUqwM8vP+_qxXHJnEZ0AttBWF4nTckRiP0nY=Eq+=gUpg@mail.gmail.com>

I'm actually leaning towards macsec now.  I'm at 6TB transferred in a
double hop, no macsec over the bridge setup without triggering the
fault.  I'm going to let it continue to churn and setup a second
testbed that JUST uses macsec without traffic control bridging to see
if I can trip the issue there.    That should determine if it's macsec
itself, or an interaction between macsec and traffic control.

Joshua Coombs
GWI

office 207-494-2140
www.gwi.net

On Wed, Oct 10, 2018 at 12:39 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>
> On Wed, Oct 10, 2018 at 8:54 AM Josh Coombs <jcoombs@staff.gwi.net> wrote:
> >
> > 2.3 billion 1 byte packets failed to re-create the bug.  To try and
> > simplify the setup I removed macsec from the equation, using a single
> > host in the middle as the bridge.  Interestingly, rather than 1.3Gbits
> > a second in both directions, it ran around 8Mbits a second.  Switching
> > the filter from u32 to matchall didn't change the performance.  Going
> > back to the four machine test bed, again removing macsec and just
> > bridging through radically decreased the throughput to around 8Mbits.
> > Flip on macsec for the bridge and 1.3Gbits?
>
> This is a great narrow down! We can rule out macsec for guilty.
>
> Can you share a minimum reproducer for this problem? If so I can take
> a look.

^ permalink raw reply

* [PATCH net] rxrpc: use correct kvec num when sending BUSY response packet
From: David Howells @ 2018-10-11 21:32 UTC (permalink / raw)
  Cc: netdev, dhowells, linux-afs, linux-kernel

From: YueHaibing <yuehaibing@huawei.com>

Fixes gcc '-Wunused-but-set-variable' warning:

net/rxrpc/output.c: In function 'rxrpc_reject_packets':
net/rxrpc/output.c:527:11: warning:
 variable 'ioc' set but not used [-Wunused-but-set-variable]

'ioc' is the correct kvec num when sending a BUSY (or an ABORT) response
packet.

Fixes: ece64fec164f ("rxrpc: Emit BUSY packets when supposed to rather than ABORTs")
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: David Howells <dhowells@redhat.com>
---

 net/rxrpc/output.c |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
index e8fb8922bca8..a141ee3ab812 100644
--- a/net/rxrpc/output.c
+++ b/net/rxrpc/output.c
@@ -572,7 +572,8 @@ void rxrpc_reject_packets(struct rxrpc_local *local)
 			whdr.flags	^= RXRPC_CLIENT_INITIATED;
 			whdr.flags	&= RXRPC_CLIENT_INITIATED;
 
-			ret = kernel_sendmsg(local->socket, &msg, iov, 2, size);
+			ret = kernel_sendmsg(local->socket, &msg,
+					     iov, ioc, size);
 			if (ret < 0)
 				trace_rxrpc_tx_fail(local->debug_id, 0, ret,
 						    rxrpc_tx_point_reject);

^ permalink raw reply related

* [PATCH net] rxrpc: Fix an uninitialised variable
From: David Howells @ 2018-10-11 21:32 UTC (permalink / raw)
  Cc: netdev, dhowells, linux-afs, linux-kernel

Fix an uninitialised variable introduced by the last patch.  This can cause
a crash when a new call comes in to a local service, such as when an AFS
fileserver calls back to the local cache manager.

Fixes: c1e15b4944c9 ("rxrpc: Fix the packet reception routine")
Signed-off-by: David Howells <dhowells@redhat.com>
---

 net/rxrpc/call_accept.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index 652e314de38e..8079aacaecac 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -337,7 +337,7 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local,
 {
 	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
 	struct rxrpc_connection *conn;
-	struct rxrpc_peer *peer;
+	struct rxrpc_peer *peer = NULL;
 	struct rxrpc_call *call;
 
 	_enter("");

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox