Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next v4] net/ncsi: Add NCSI Broadcom OEM command
From: Vijay Khemka @ 2018-10-12 18:20 UTC (permalink / raw)
  To: Samuel Mendoza-Jonas, David S. Miller, netdev, linux-kernel
  Cc: vijaykhemka, linux-aspeed, openbmc

This patch adds OEM Broadcom commands and response handling. It also
defines OEM Get MAC Address handler to get and configure the device.

ncsi_oem_gma_handler_bcm: This handler send NCSI broadcom command for
getting mac address.
ncsi_rsp_handler_oem_bcm: This handles response received for all
broadcom OEM commands.
ncsi_rsp_handler_oem_bcm_gma: This handles get mac address response and
set it to device.

Signed-off-by: Vijay Khemka <vijaykhemka@fb.com>
---
 v4: updated as per comment from Sam, I was just wondering if I can remove
 NCSI_OEM_CMD_GET_MAC config option and let this code be valid always and
 it will configure mac address if there is get mac address handler for given 
 manufacture id.

 net/ncsi/Kconfig       |  6 ++++
 net/ncsi/internal.h    |  8 +++++
 net/ncsi/ncsi-manage.c | 75 ++++++++++++++++++++++++++++++++++++++++++
 net/ncsi/ncsi-pkt.h    |  8 +++++
 net/ncsi/ncsi-rsp.c    | 44 +++++++++++++++++++++++--
 5 files changed, 139 insertions(+), 2 deletions(-)

diff --git a/net/ncsi/Kconfig b/net/ncsi/Kconfig
index 08a8a6031fd7..7f2b46108a24 100644
--- a/net/ncsi/Kconfig
+++ b/net/ncsi/Kconfig
@@ -10,3 +10,9 @@ config NET_NCSI
 	  support. Enable this only if your system connects to a network
 	  device via NCSI and the ethernet driver you're using supports
 	  the protocol explicitly.
+config NCSI_OEM_CMD_GET_MAC
+	bool "Get NCSI OEM MAC Address"
+	depends on NET_NCSI
+	---help---
+	  This allows to get MAC address from NCSI firmware and set them back to
+		controller.
diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h
index 3d0a33b874f5..45883b32790e 100644
--- a/net/ncsi/internal.h
+++ b/net/ncsi/internal.h
@@ -71,6 +71,13 @@ enum {
 /* OEM Vendor Manufacture ID */
 #define NCSI_OEM_MFR_MLX_ID             0x8119
 #define NCSI_OEM_MFR_BCM_ID             0x113d
+/* Broadcom specific OEM Command */
+#define NCSI_OEM_BCM_CMD_GMA            0x01   /* CMD ID for Get MAC */
+/* OEM Command payload lengths*/
+#define NCSI_OEM_BCM_CMD_GMA_LEN        12
+/* Mac address offset in OEM response */
+#define BCM_MAC_ADDR_OFFSET             28
+
 
 struct ncsi_channel_version {
 	u32 version;		/* Supported BCD encoded NCSI version */
@@ -240,6 +247,7 @@ enum {
 	ncsi_dev_state_probe_dp,
 	ncsi_dev_state_config_sp	= 0x0301,
 	ncsi_dev_state_config_cis,
+	ncsi_dev_state_config_oem_gma,
 	ncsi_dev_state_config_clear_vids,
 	ncsi_dev_state_config_svf,
 	ncsi_dev_state_config_ev,
diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index 091284760d21..e58bf51ff685 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -635,6 +635,65 @@ static int set_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc,
 	return 0;
 }
 
+#if IS_ENABLED(CONFIG_NCSI_OEM_CMD_GET_MAC)
+
+/* NCSI OEM Command APIs */
+static void ncsi_oem_gma_handler_bcm(struct ncsi_cmd_arg *nca)
+{
+	unsigned char data[NCSI_OEM_BCM_CMD_GMA_LEN];
+	int ret = 0;
+
+	nca->payload = NCSI_OEM_BCM_CMD_GMA_LEN;
+
+	memset(data, 0, NCSI_OEM_BCM_CMD_GMA_LEN);
+	*(unsigned int *)data = ntohl(NCSI_OEM_MFR_BCM_ID);
+	data[5] = NCSI_OEM_BCM_CMD_GMA;
+
+	nca->data = data;
+
+	ret = ncsi_xmit_cmd(nca);
+	if (ret)
+		netdev_err(nca->ndp->ndev.dev,
+			   "NCSI: Failed to transmit cmd 0x%x during configure\n",
+			   nca->type);
+}
+
+/* OEM Command handlers initialization */
+static struct ncsi_oem_gma_handler {
+	unsigned int	mfr_id;
+	void		(*handler)(struct ncsi_cmd_arg *nca);
+} ncsi_oem_gma_handlers[] = {
+	{ NCSI_OEM_MFR_BCM_ID, ncsi_oem_gma_handler_bcm }
+};
+
+static int ncsi_oem_handler(struct ncsi_cmd_arg *nca, unsigned int mf_id)
+{
+	struct ncsi_oem_gma_handler *nch = NULL;
+	int i;
+
+	/* Find gma handler for given manufacturer id */
+	for (i = 0; i < ARRAY_SIZE(ncsi_oem_gma_handlers); i++) {
+		if (ncsi_oem_gma_handlers[i].mfr_id == mf_id) {
+			if (ncsi_oem_gma_handlers[i].handler)
+				nch = &ncsi_oem_gma_handlers[i];
+			break;
+			}
+	}
+
+	if (!nch) {
+		netdev_err(nca->ndp->ndev.dev,
+			   "NCSI: No GMA handler available for MFR-ID (0x%x)\n",
+			   mf_id);
+		return -1;
+	}
+
+	/* Get Mac address from NCSI device */
+	nch->handler(nca);
+	return 0;
+}
+
+#endif /* CONFIG_NCSI_OEM_CMD_GET_MAC */
+
 static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
 {
 	struct ncsi_dev *nd = &ndp->ndev;
@@ -685,7 +744,23 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
 			goto error;
 		}
 
+		nd->state = ncsi_dev_state_config_oem_gma;
+		break;
+	case ncsi_dev_state_config_oem_gma:
 		nd->state = ncsi_dev_state_config_clear_vids;
+		ret = -1;
+
+#if IS_ENABLED(CONFIG_NCSI_OEM_CMD_GET_MAC)
+		nca.type = NCSI_PKT_CMD_OEM;
+		nca.package = np->id;
+		nca.channel = nc->id;
+		ndp->pending_req_num = 1;
+		ret = ncsi_oem_handler(&nca, nc->version.mf_id);
+#endif /* CONFIG_NCSI_OEM_CMD_GET_MAC */
+
+		if (ret < 0)
+			schedule_work(&ndp->work);
+
 		break;
 	case ncsi_dev_state_config_clear_vids:
 	case ncsi_dev_state_config_svf:
diff --git a/net/ncsi/ncsi-pkt.h b/net/ncsi/ncsi-pkt.h
index 0f2087c8d42a..4d3f06be38bd 100644
--- a/net/ncsi/ncsi-pkt.h
+++ b/net/ncsi/ncsi-pkt.h
@@ -165,6 +165,14 @@ struct ncsi_rsp_oem_pkt {
 	unsigned char           data[];      /* Payload data      */
 };
 
+/* Broadcom Response Data */
+struct ncsi_rsp_oem_bcm_pkt {
+	unsigned char           ver;         /* Payload Version   */
+	unsigned char           type;        /* OEM Command type  */
+	__be16                  len;         /* Payload Length    */
+	unsigned char           data[];      /* Cmd specific Data */
+};
+
 /* Get Link Status */
 struct ncsi_rsp_gls_pkt {
 	struct ncsi_rsp_pkt_hdr rsp;        /* Response header   */
diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c
index d66b34749027..d052a3cafed4 100644
--- a/net/ncsi/ncsi-rsp.c
+++ b/net/ncsi/ncsi-rsp.c
@@ -596,19 +596,59 @@ static int ncsi_rsp_handler_snfc(struct ncsi_request *nr)
 	return 0;
 }
 
+/* Response handler for Broadcom command Get Mac Address */
+static int ncsi_rsp_handler_oem_bcm_gma(struct ncsi_request *nr)
+{
+	struct ncsi_dev_priv *ndp = nr->ndp;
+	struct net_device *ndev = ndp->ndev.dev;
+	const struct net_device_ops *ops = ndev->netdev_ops;
+	struct ncsi_rsp_oem_pkt *rsp;
+	struct sockaddr saddr;
+	int ret = 0;
+
+	/* Get the response header */
+	rsp = (struct ncsi_rsp_oem_pkt *)skb_network_header(nr->rsp);
+
+	saddr.sa_family = ndev->type;
+	ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+	memcpy(saddr.sa_data, &rsp->data[BCM_MAC_ADDR_OFFSET], ETH_ALEN);
+	/* Increase mac address by 1 for BMC's address */
+	saddr.sa_data[ETH_ALEN - 1]++;
+	ret = ops->ndo_set_mac_address(ndev, &saddr);
+	if (ret < 0)
+		netdev_warn(ndev, "NCSI: 'Writing mac address to device failed\n");
+
+	return ret;
+}
+
+/* Response handler for Broadcom card */
+static int ncsi_rsp_handler_oem_bcm(struct ncsi_request *nr)
+{
+	struct ncsi_rsp_oem_bcm_pkt *bcm;
+	struct ncsi_rsp_oem_pkt *rsp;
+
+	/* Get the response header */
+	rsp = (struct ncsi_rsp_oem_pkt *)skb_network_header(nr->rsp);
+	bcm = (struct ncsi_rsp_oem_bcm_pkt *)(rsp->data);
+
+	if (bcm->type == NCSI_OEM_BCM_CMD_GMA)
+		return ncsi_rsp_handler_oem_bcm_gma(nr);
+	return 0;
+}
+
 static struct ncsi_rsp_oem_handler {
 	unsigned int	mfr_id;
 	int		(*handler)(struct ncsi_request *nr);
 } ncsi_rsp_oem_handlers[] = {
 	{ NCSI_OEM_MFR_MLX_ID, NULL },
-	{ NCSI_OEM_MFR_BCM_ID, NULL }
+	{ NCSI_OEM_MFR_BCM_ID, ncsi_rsp_handler_oem_bcm }
 };
 
 /* Response handler for OEM command */
 static int ncsi_rsp_handler_oem(struct ncsi_request *nr)
 {
-	struct ncsi_rsp_oem_pkt *rsp;
 	struct ncsi_rsp_oem_handler *nrh = NULL;
+	struct ncsi_rsp_oem_pkt *rsp;
 	unsigned int mfr_id, i;
 
 	/* Get the response header */
-- 
2.17.1

^ permalink raw reply related

* Re: [PATCH net-next v3 1/2] net/ncsi: Add NCSI Broadcom OEM command
From: Vijay Khemka @ 2018-10-12 18:23 UTC (permalink / raw)
  To: Samuel Mendoza-Jonas, David S. Miller, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org
  Cc: openbmc @ lists . ozlabs . org, Justin.Lee1@Dell.com,
	joel@jms.id.au, linux-aspeed@lists.ozlabs.org
In-Reply-To: <0291dc2447720438a25c9c922252ab71e92985dd.camel@mendozajonas.com>



On 10/11/18, 5:42 PM, "Samuel Mendoza-Jonas" <sam@mendozajonas.com> wrote:

    > diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
    > index 091284760d21..75504ccd1b95 100644
    > --- a/net/ncsi/ncsi-manage.c
    > +++ b/net/ncsi/ncsi-manage.c
    > @@ -635,6 +635,39 @@ static int set_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc,
    >  	return 0;
    >  }
    >  
    > +#if IS_ENABLED(CONFIG_NCSI_OEM_CMD_GET_MAC)
    > +
    > +/* NCSI OEM Command APIs */
    > +static void ncsi_oem_gma_handler_bcm(struct ncsi_cmd_arg *nca)
    > +{
    > +	int ret = 0;
    > +	unsigned char data[NCSI_OEM_BCM_CMD_GMA_LEN];
    > +
    > +	nca->payload = NCSI_OEM_BCM_CMD_GMA_LEN;
    > +
    > +	memset(data, 0, NCSI_OEM_BCM_CMD_GMA_LEN);
    > +	*(unsigned int *)data = ntohl(NCSI_OEM_MFR_BCM_ID);
    > +	data[5] = NCSI_OEM_BCM_CMD_GMA;
    > +
    > +	nca->data = data;
    > +
    > +	ret = ncsi_xmit_cmd(nca);
    > +	if (ret)
    > +		netdev_err(nca->ndp->ndev.dev,
    > +			   "NCSI: Failed to transmit cmd 0x%x during configure\n",
    > +			   nca->type);
    > +}
    > +
    > +/* OEM Command handlers initialization */
    > +static struct ncsi_oem_gma_handler {
    > +	unsigned int	mfr_id;
    > +	void		(*handler)(struct ncsi_cmd_arg *nca);
    > +} ncsi_oem_gma_handlers[] = {
    > +	{ NCSI_OEM_MFR_BCM_ID, ncsi_oem_gma_handler_bcm }
    > +};
    > +
    > +#endif /* CONFIG_NCSI_OEM_CMD_GET_MAC */
    > +
    >  static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
    >  {
    >  	struct ncsi_dev *nd = &ndp->ndev;
    > @@ -685,6 +718,43 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
    >  			goto error;
    >  		}
    >  
    > +#if IS_ENABLED(CONFIG_NCSI_OEM_CMD_GET_MAC)
    > +		nd->state = ncsi_dev_state_config_oem_gma;
    > +		break;
    > +	case ncsi_dev_state_config_oem_gma:
    > +		nca.type = NCSI_PKT_CMD_OEM;
    > +		nca.package = np->id;
    > +		nca.channel = nc->id;
    > +		ndp->pending_req_num = 1;
    > +
    > +		/* Check for manufacturer id and Find the handler */
    > +		struct ncsi_oem_gma_handler *nch = NULL;
    > +		int i;
    > +
    
    This has the opposite affect, now if we do compile with
    CONFIG_NCSI_OEM_CMD_GET_MAC we get:
    
    ../net/ncsi/ncsi-manage.c: In function ‘ncsi_configure_channel’:
    ../net/ncsi/ncsi-manage.c:769:3: warning: ISO C90 forbids mixed declarations and code [-Wdeclaration-after-statement]
       struct ncsi_oem_gma_handler *nch = NULL;
       ^~~~~~
    
    Perhaps we should lay this out slightly differently. For example we could go
    through the ncsi_dev_state_config_oem_gma state regardless and call some other
    function that finds and calls the handler, or does nothing if the config option
    isn't set.
    
    Regards,
    Sam
    
  Sam,
  I have created a new patch v4 and introduced new function. I was just wondering if I can remove
  CONFIG_NCSI_OEM_CMD_GET_MAC config all together. And it always get and set mac address if
  Handler is available. Looking for your thought here.
 
  -Vijay


^ permalink raw reply

* Re: [PATCH net-next 0/3] Fixes & small enhancements related to the promisc mode in HNS3
From: David Miller @ 2018-10-12 18:24 UTC (permalink / raw)
  To: salil.mehta
  Cc: yisen.zhuang, lipeng321, mehta.salil, netdev, linux-kernel,
	linuxarm
In-Reply-To: <20181012143406.22600-1-salil.mehta@huawei.com>

From: Salil Mehta <salil.mehta@huawei.com>
Date: Fri, 12 Oct 2018 15:34:03 +0100

> This patch-set presents some fixes and enhancements related to promiscuous
> mode and MAC VLAN Table full condition in HNS3 Ethernet Driver.

Series applied.

^ permalink raw reply

* [PATCH v2] netlink: replace __NLA_ENSURE implementation
From: Johannes Berg @ 2018-10-12 10:53 UTC (permalink / raw)
  To: linux-wireless, netdev; +Cc: John Garry, Johannes Berg

From: Johannes Berg <johannes.berg@intel.com>

We already have BUILD_BUG_ON_ZERO() which I just hadn't found
before, so we should use it here instead of open-coding another
implementation thereof.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
v2: remove all the language about -Wvla, I misunderstood John
    and he was just referring to *other* VLA warnings in the
    wifi stack (which we know about and are being fixed by the
    crypto tree)
---
 include/net/netlink.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/net/netlink.h b/include/net/netlink.h
index 589683091f16..094012174b6f 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -311,7 +311,7 @@ struct nla_policy {
 #define NLA_POLICY_NESTED_ARRAY(maxattr, policy) \
 	{ .type = NLA_NESTED_ARRAY, .validation_data = policy, .len = maxattr }
 
-#define __NLA_ENSURE(condition) (sizeof(char[1 - 2*!(condition)]) - 1)
+#define __NLA_ENSURE(condition) BUILD_BUG_ON_ZERO(!(condition))
 #define NLA_ENSURE_INT_TYPE(tp)				\
 	(__NLA_ENSURE(tp == NLA_S8 || tp == NLA_U8 ||	\
 		      tp == NLA_S16 || tp == NLA_U16 ||	\
-- 
2.14.4

^ permalink raw reply related

* [PATCH] igb: shorten maximum PHC timecounter update interval
From: Miroslav Lichvar @ 2018-10-12 11:13 UTC (permalink / raw)
  To: intel-wired-lan, netdev
  Cc: Miroslav Lichvar, Jacob Keller, Richard Cochran, Thomas Gleixner

The timecounter needs to be updated at least once per ~550 seconds in
order to avoid a 40-bit SYSTIM timestamp to be misinterpreted as an old
timestamp.

Since commit 500462a9d ("timers: Switch to a non-cascading wheel"),
scheduling of delayed work seems to be less accurate and a requested
delay of 540 seconds may actually be longer than 550 seconds. Shorten
the delay to 480 seconds to be sure the timecounter is updated in time.

This fixes an issue with HW timestamps on 82580/I350/I354 being off by
~1100 seconds for few seconds every ~9 minutes.

Cc: Jacob Keller <jacob.e.keller@intel.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Miroslav Lichvar <mlichvar@redhat.com>
---
 drivers/net/ethernet/intel/igb/igb_ptp.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_ptp.c b/drivers/net/ethernet/intel/igb/igb_ptp.c
index 9f4d700e09df..29ced6b74d36 100644
--- a/drivers/net/ethernet/intel/igb/igb_ptp.c
+++ b/drivers/net/ethernet/intel/igb/igb_ptp.c
@@ -51,9 +51,15 @@
  *
  * The 40 bit 82580 SYSTIM overflows every
  *   2^40 * 10^-9 /  60  = 18.3 minutes.
+ *
+ * SYSTIM is converted to real time using a timecounter. As
+ * timecounter_cyc2time() allows old timestamps, the timecounter
+ * needs to be updated at least once per half of the SYSTIM interval.
+ * Scheduling of delayed work is not very accurate, so we aim for 8
+ * minutes to be sure the actual interval is shorter than 9.16 minutes.
  */
 
-#define IGB_SYSTIM_OVERFLOW_PERIOD	(HZ * 60 * 9)
+#define IGB_SYSTIM_OVERFLOW_PERIOD	(HZ * 60 * 8)
 #define IGB_PTP_TX_TIMEOUT		(HZ * 15)
 #define INCPERIOD_82576			BIT(E1000_TIMINCA_16NS_SHIFT)
 #define INCVALUE_82576_MASK		GENMASK(E1000_TIMINCA_16NS_SHIFT - 1, 0)
-- 
2.17.1

^ permalink raw reply related

* [PATCH iproute2 net-next] bridge: add support for backup port
From: Nikolay Aleksandrov @ 2018-10-12 11:42 UTC (permalink / raw)
  To: netdev; +Cc: roopa, dsahern, Nikolay Aleksandrov

This patch adds support for the new backup port option that can be set
on a bridge port. If the port's carrier goes down all of the traffic
gets redirected to the configured backup port. We add the following new
arguments:
$ ip link set dev brport type bridge_slave backup_port brport2
$ ip link set dev brport type bridge_slave nobackup_port

$ bridge link set dev brport backup_port brport2
$ bridge link set dev brport nobackup_port

The man pages are updated respectively.
Also 2 minor style adjustments:
- add missing space to bridge man page's state argument
- use lower starting case for vlan_tunnel in ip-link man page (to be
consistent with the rest)

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
---
 bridge/link.c            | 26 ++++++++++++++++++++++++++
 ip/iplink_bridge_slave.c | 18 ++++++++++++++++++
 man/man8/bridge.8        | 13 ++++++++++++-
 man/man8/ip-link.8.in    | 14 ++++++++++++--
 4 files changed, 68 insertions(+), 3 deletions(-)

diff --git a/bridge/link.c b/bridge/link.c
index 09df489b26ab..4a14845da591 100644
--- a/bridge/link.c
+++ b/bridge/link.c
@@ -152,6 +152,16 @@ static void print_protinfo(FILE *fp, struct rtattr *attr)
 		if (prtb[IFLA_BRPORT_VLAN_TUNNEL])
 			print_onoff(fp, "vlan_tunnel",
 				    rta_getattr_u8(prtb[IFLA_BRPORT_VLAN_TUNNEL]));
+
+		if (prtb[IFLA_BRPORT_BACKUP_PORT]) {
+			int ifidx;
+
+			ifidx = rta_getattr_u32(prtb[IFLA_BRPORT_BACKUP_PORT]);
+			print_string(PRINT_ANY,
+				     "backup_port", "backup_port %s ",
+				     ll_index_to_name(ifidx));
+		}
+
 		if (prtb[IFLA_BRPORT_ISOLATED])
 			print_onoff(fp, "isolated",
 				    rta_getattr_u8(prtb[IFLA_BRPORT_ISOLATED]));
@@ -255,6 +265,7 @@ static void usage(void)
 	fprintf(stderr,	"                               [ vlan_tunnel {on | off} ]\n");
 	fprintf(stderr,	"                               [ isolated {on | off} ]\n");
 	fprintf(stderr, "                               [ hwmode {vepa | veb} ]\n");
+	fprintf(stderr,	"                               [ backup_port DEVICE ] [ nobackup_port ]\n");
 	fprintf(stderr, "                               [ self ] [ master ]\n");
 	fprintf(stderr, "       bridge link show [dev DEV]\n");
 	exit(-1);
@@ -289,6 +300,7 @@ static int brlink_modify(int argc, char **argv)
 		.ifm.ifi_family = PF_BRIDGE,
 	};
 	char *d = NULL;
+	int backup_port_idx = -1;
 	__s8 neigh_suppress = -1;
 	__s8 learning = -1;
 	__s8 learning_sync = -1;
@@ -395,6 +407,16 @@ static int brlink_modify(int argc, char **argv)
 			NEXT_ARG();
 			if (!on_off("isolated", &isolated, *argv))
 				return -1;
+		} else if (strcmp(*argv, "backup_port") == 0) {
+			NEXT_ARG();
+			backup_port_idx = ll_name_to_index(*argv);
+			if (!backup_port_idx) {
+				fprintf(stderr, "Error: device %s does not exist\n",
+					*argv);
+				return -1;
+			}
+		} else if (strcmp(*argv, "nobackup_port") == 0) {
+			backup_port_idx = 0;
 		} else {
 			usage();
 		}
@@ -456,6 +478,10 @@ static int brlink_modify(int argc, char **argv)
 	if (isolated != -1)
 		addattr8(&req.n, sizeof(req), IFLA_BRPORT_ISOLATED, isolated);
 
+	if (backup_port_idx != -1)
+		addattr32(&req.n, sizeof(req), IFLA_BRPORT_BACKUP_PORT,
+			  backup_port_idx);
+
 	addattr_nest_end(&req.n, nest);
 
 	/* IFLA_AF_SPEC nested attribute. Contains IFLA_BRIDGE_FLAGS that
diff --git a/ip/iplink_bridge_slave.c b/ip/iplink_bridge_slave.c
index 5a6e48559781..8b4f93f265be 100644
--- a/ip/iplink_bridge_slave.c
+++ b/ip/iplink_bridge_slave.c
@@ -41,6 +41,7 @@ static void print_explain(FILE *f)
 		"                        [ neigh_suppress {on | off} ]\n"
 		"                        [ vlan_tunnel {on | off} ]\n"
 		"                        [ isolated {on | off} ]\n"
+		"                        [ backup_port DEVICE ] [ nobackup_port ]\n"
 	);
 }
 
@@ -279,6 +280,13 @@ static void bridge_slave_print_opt(struct link_util *lu, FILE *f,
 	if (tb[IFLA_BRPORT_ISOLATED])
 		_print_onoff(f, "isolated", "isolated",
 			     rta_getattr_u8(tb[IFLA_BRPORT_ISOLATED]));
+
+	if (tb[IFLA_BRPORT_BACKUP_PORT]) {
+		int backup_p = rta_getattr_u32(tb[IFLA_BRPORT_BACKUP_PORT]);
+
+		print_string(PRINT_ANY, "backup_port", "backup_port %s ",
+			     ll_index_to_name(backup_p));
+	}
 }
 
 static void bridge_slave_parse_on_off(char *arg_name, char *arg_val,
@@ -388,6 +396,16 @@ static int bridge_slave_parse_opt(struct link_util *lu, int argc, char **argv,
 			NEXT_ARG();
 			bridge_slave_parse_on_off("isolated", *argv, n,
 						  IFLA_BRPORT_ISOLATED);
+		} else if (matches(*argv, "backup_port") == 0) {
+			int ifindex;
+
+			NEXT_ARG();
+			ifindex = ll_name_to_index(*argv);
+			if (!ifindex)
+				invarg("Device does not exist\n", *argv);
+			addattr32(n, 1024, IFLA_BRPORT_BACKUP_PORT, ifindex);
+		} else if (matches(*argv, "nobackup_port") == 0) {
+			addattr32(n, 1024, IFLA_BRPORT_BACKUP_PORT, 0);
 		} else if (matches(*argv, "help") == 0) {
 			explain();
 			return -1;
diff --git a/man/man8/bridge.8 b/man/man8/bridge.8
index c0415bc646df..72210f625e55 100644
--- a/man/man8/bridge.8
+++ b/man/man8/bridge.8
@@ -37,7 +37,7 @@ bridge \- show / manipulate bridge addresses and devices
 .B priority
 .IR PRIO " ] [ "
 .B state
-.IR STATE "] ["
+.IR STATE " ] [ "
 .BR guard " { " on " | " off " } ] [ "
 .BR hairpin " { " on " | " off " } ] [ "
 .BR fastleave " { " on " | " off " } ] [ "
@@ -50,6 +50,9 @@ bridge \- show / manipulate bridge addresses and devices
 .BR neigh_suppress " { " on " | " off " } ] [ "
 .BR vlan_tunnel " { " on " | " off " } ] [ "
 .BR isolated " { " on " | " off " } ] [ "
+.B backup_port
+.IR  DEVICE " ] ["
+.BR nobackup_port " ] [ "
 .BR self " ] [ " master " ]"
 
 .ti -8
@@ -373,6 +376,14 @@ Controls whether vlan to tunnel mapping is enabled on the port. By default this
 Controls whether a given port will be isolated, which means it will be able to communicate with non-isolated ports only.
 By default this flag is off.
 
+.TP
+.BI backup_port " DEVICE"
+If the port loses carrier all traffic will be redirected to the configured backup port
+
+.TP
+.BR nobackup_port
+Removes the currently configured backup port
+
 .TP
 .BI self
 link setting is configured on specified physical device
diff --git a/man/man8/ip-link.8.in b/man/man8/ip-link.8.in
index 9f345f96fab1..ecbbd4f499e7 100644
--- a/man/man8/ip-link.8.in
+++ b/man/man8/ip-link.8.in
@@ -2076,7 +2076,11 @@ the following additional arguments are supported:
 ] [
 .BR vlan_tunnel " { " on " | " off " }"
 ] [
-.BR isolated " { " on " | " off " } ]"
+.BR isolated " { " on " | " off " }"
+] [
+.BR backup_port " DEVICE"
+] [
+.BR nobackup_port " ]"
 
 .in +8
 .sp
@@ -2158,7 +2162,13 @@ option above.
 - controls whether neigh discovery (arp and nd) proxy and suppression is enabled on the port. By default this flag is off.
 
 .BR vlan_tunnel " { " on " | " off " }"
-- Controls whether vlan to tunnel mapping is enabled on the port. By default this flag is off.
+- controls whether vlan to tunnel mapping is enabled on the port. By default this flag is off.
+
+.BI backup_port " DEVICE"
+- if the port loses carrier all traffic will be redirected to the configured backup port
+
+.BR nobackup_port
+- removes the currently configured backup port
 
 .in -8
 
-- 
2.17.2

^ permalink raw reply related

* RE: [RFC PATCH 2/2] net/ncsi: Configure multi-package, multi-channel modes with failover
From: Justin.Lee1 @ 2018-10-12 19:16 UTC (permalink / raw)
  To: sam, netdev; +Cc: davem, linux-kernel, openbmc
In-Reply-To: <cdf6fa4a6d7a506f84c9d9925d8f621b398a2fff.camel@mendozajonas.com>

> On Thu, 2018-10-11 at 22:09 +0000, Justin.Lee1@Dell.com wrote:
> > Hi Sam,
> > 
> > Please see my comments below.
> > 
> > Thanks,
> > Justin
> >  
> >  
> > > On Wed, 2018-10-10 at 22:36 +0000, Justin.Lee1@Dell.com wrote:
> > > > Hi Samuel,
> > > > 
> > > > I am still testing your change and have some comments below.
> > > > 
> > > > Thanks,
> > > > Justin
> > > > 
> > > > 
> > > > > This patch extends the ncsi-netlink interface with two new commands and
> > > > > three new attributes to configure multiple packages and/or channels at
> > > > > once, and configure specific failover modes.
> > > > > 
> > > > > NCSI_CMD_SET_PACKAGE mask and NCSI_CMD_SET_CHANNEL_MASK set a whitelist
> > > > > of packages or channels allowed to be configured with the
> > > > > NCSI_ATTR_PACKAGE_MASK and NCSI_ATTR_CHANNEL_MASK attributes
> > > > > respectively. If one of these whitelists is set only packages or
> > > > > channels matching the whitelist are considered for the channel queue in
> > > > > ncsi_choose_active_channel().
> > > > > 
> > > > > These commands may also use the NCSI_ATTR_MULTI_FLAG to signal that
> > > > > multiple packages or channels may be configured simultaneously. NCSI
> > > > > hardware arbitration (HWA) must be available in order to enable
> > > > > multi-package mode. Multi-channel mode is always available.
> > > > > 
> > > > > If the NCSI_ATTR_CHANNEL_ID attribute is present in the
> > > > > NCSI_CMD_SET_CHANNEL_MASK command the it sets the preferred channel as
> > > > > with the NCSI_CMD_SET_INTERFACE command. The combination of preferred
> > > > > channel and channel whitelist defines a primary channel and the allowed
> > > > > failover channels.
> > > > > If the NCSI_ATTR_MULTI_FLAG attribute is also present then the preferred
> > > > > channel is configured for Tx/Rx and the other channels are enabled only
> > > > > for Rx.
> > > > > 
> > > > > Signed-off-by: Samuel Mendoza-Jonas <sam@mendozajonas.com>
> > > > > ---
> > > > >  include/uapi/linux/ncsi.h |  16 +++
> > > > >  net/ncsi/internal.h       |  11 +-
> > > > >  net/ncsi/ncsi-aen.c       |   2 +-
> > > > >  net/ncsi/ncsi-manage.c    | 138 ++++++++++++++++--------
> > > > >  net/ncsi/ncsi-netlink.c   | 217 +++++++++++++++++++++++++++++++++-----
> > > > >  net/ncsi/ncsi-rsp.c       |   2 +-
> > > > >  6 files changed, 312 insertions(+), 74 deletions(-)
> > > > > 
> > > > > diff --git a/include/uapi/linux/ncsi.h b/include/uapi/linux/ncsi.h
> > > > > index 4c292ecbb748..035fba1693f9 100644
> > > > > --- a/include/uapi/linux/ncsi.h
> > > > > +++ b/include/uapi/linux/ncsi.h
> > > > > @@ -23,6 +23,13 @@
> > > > >   *	optionally the preferred NCSI_ATTR_CHANNEL_ID.
> > > > >   * @NCSI_CMD_CLEAR_INTERFACE: clear any preferred package/channel combination.
> > > > >   *	Requires NCSI_ATTR_IFINDEX.
> > > > > + * @NCSI_CMD_SET_PACKAGE_MASK: set a whitelist of allowed packages.
> > > > > + * @NCSI_CMD_SET_PACKAGE_MASK: set a whitelist of allowed channels.
> > > > > + *	Requires NCSI_ATTR_IFINDEX and NCSI_ATTR_PACKAGE_MASK.
> > > > > + * @NCSI_CMD_SET_PACKAGE_MASK: set a whitelist of allowed channels.
> > > > > + *	Requires NCSI_ATTR_IFINDEX, NCSI_ATTR_PACKAGE_ID, and
> > > > > + *	NCSI_ATTR_CHANNEL_MASK. If NCSI_ATTR_CHANNEL_ID is present it sets
> > > > > + *	the primary channel.
> > > > >   * @NCSI_CMD_MAX: highest command number
> > > > >   */
> > > > 
> > > > There are some typo in the description.
> > > > * @NCSI_CMD_SET_PACKAGE_MASK: set a whitelist of allowed packages.
> > > >  *	Requires NCSI_ATTR_IFINDEX and NCSI_ATTR_PACKAGE_MASK.
> > > >  * @NCSI_CMD_SET_CHANNEL_MASK: set a whitelist of allowed channels.
> > > >  *	Requires NCSI_ATTR_IFINDEX, NCSI_ATTR_PACKAGE_ID, and
> > > >  *	NCSI_ATTR_CHANNEL_MASK. If NCSI_ATTR_CHANNEL_ID is present it sets
> > > >  *	the primary channel.
> > > 
> > > Haha, yes I threw that in at the end, thanks for catching it.
> > > 
> > > > >  enum ncsi_nl_commands {
> > > > > @@ -30,6 +37,8 @@ enum ncsi_nl_commands {
> > > > >  	NCSI_CMD_PKG_INFO,
> > > > >  	NCSI_CMD_SET_INTERFACE,
> > > > >  	NCSI_CMD_CLEAR_INTERFACE,
> > > > > +	NCSI_CMD_SET_PACKAGE_MASK,
> > > > > +	NCSI_CMD_SET_CHANNEL_MASK,
> > > > >  
> > > > >  	__NCSI_CMD_AFTER_LAST,
> > > > >  	NCSI_CMD_MAX = __NCSI_CMD_AFTER_LAST - 1
> > > > > @@ -43,6 +52,10 @@ enum ncsi_nl_commands {
> > > > >   * @NCSI_ATTR_PACKAGE_LIST: nested array of NCSI_PKG_ATTR attributes
> > > > >   * @NCSI_ATTR_PACKAGE_ID: package ID
> > > > >   * @NCSI_ATTR_CHANNEL_ID: channel ID
> > > > > + * @NCSI_ATTR_MULTI_FLAG: flag to signal that multi-mode should be enabled with
> > > > > + *	NCSI_CMD_SET_PACKAGE_MASK or NCSI_CMD_SET_CHANNEL_MASK.
> > > > > + * @NCSI_ATTR_PACKAGE_MASK: 32-bit mask of allowed packages.
> > > > > + * @NCSI_ATTR_CHANNEL_MASK: 32-bit mask of allowed channels.
> > > > >   * @NCSI_ATTR_MAX: highest attribute number
> > > > >   */
> > > > >  enum ncsi_nl_attrs {
> > > > > @@ -51,6 +64,9 @@ enum ncsi_nl_attrs {
> > > > >  	NCSI_ATTR_PACKAGE_LIST,
> > > > >  	NCSI_ATTR_PACKAGE_ID,
> > > > >  	NCSI_ATTR_CHANNEL_ID,
> > > > > +	NCSI_ATTR_MULTI_FLAG,
> > > > > +	NCSI_ATTR_PACKAGE_MASK,
> > > > > +	NCSI_ATTR_CHANNEL_MASK,
> > > > 
> > > > Is there a case that we might set these two masks at the same time?
> > > > If not, maybe we can just have one generic MASK attribute.
> > > > 
> > > 
> > > I thought of this too: not yet, but I wonder if we might in the future.
> > > I'll have a think about it.
> > > 
> > > > >  
> > > > >  	__NCSI_ATTR_AFTER_LAST,
> > > > >  	NCSI_ATTR_MAX = __NCSI_ATTR_AFTER_LAST - 1
> > > > > diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h
> > > > > index 3d0a33b874f5..8437474d0a78 100644
> > > > > --- a/net/ncsi/internal.h
> > > > > +++ b/net/ncsi/internal.h
> > > > > @@ -213,6 +213,10 @@ struct ncsi_package {
> > > > >  	unsigned int         channel_num; /* Number of channels     */
> > > > >  	struct list_head     channels;    /* List of chanels        */
> > > > >  	struct list_head     node;        /* Form list of packages  */
> > > > > +
> > > > > +	bool                 multi_channel; /* Enable multiple channels  */
> > > > > +	u32                  channel_whitelist; /* Channels to configure */
> > > > > +	struct ncsi_channel  *preferred_channel; /* Primary channel      */
> > > > >  };
> > > > >  
> > > > >  struct ncsi_request {
> > > > > @@ -280,8 +284,6 @@ struct ncsi_dev_priv {
> > > > >  	unsigned int        package_num;     /* Number of packages         */
> > > > >  	struct list_head    packages;        /* List of packages           */
> > > > >  	struct ncsi_channel *hot_channel;    /* Channel was ever active    */
> > > > > -	struct ncsi_package *force_package;  /* Force a specific package   */
> > > > > -	struct ncsi_channel *force_channel;  /* Force a specific channel   */
> > > > >  	struct ncsi_request requests[256];   /* Request table              */
> > > > >  	unsigned int        request_id;      /* Last used request ID       */
> > > > >  #define NCSI_REQ_START_IDX	1
> > > > > @@ -294,6 +296,9 @@ struct ncsi_dev_priv {
> > > > >  	struct list_head    node;            /* Form NCSI device list      */
> > > > >  #define NCSI_MAX_VLAN_VIDS	15
> > > > >  	struct list_head    vlan_vids;       /* List of active VLAN IDs */
> > > > > +
> > > > > +	bool                multi_package;   /* Enable multiple packages   */
> > > > > +	u32                 package_whitelist; /* Packages to configure    */
> > > > >  };
> > > > >  
> > > > >  struct ncsi_cmd_arg {
> > > > > @@ -345,6 +350,8 @@ struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp,
> > > > >  void ncsi_free_request(struct ncsi_request *nr);
> > > > >  struct ncsi_dev *ncsi_find_dev(struct net_device *dev);
> > > > >  int ncsi_process_next_channel(struct ncsi_dev_priv *ndp);
> > > > > +bool ncsi_channel_is_last(struct ncsi_dev_priv *ndp,
> > > > > +			  struct ncsi_channel *channel);
> > > > >  
> > > > >  /* Packet handlers */
> > > > >  u32 ncsi_calculate_checksum(unsigned char *data, int len);
> > > > > diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c
> > > > > index 65f47a648be3..eac56aee30c4 100644
> > > > > --- a/net/ncsi/ncsi-aen.c
> > > > > +++ b/net/ncsi/ncsi-aen.c
> > > > > @@ -86,7 +86,7 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp,
> > > > >  	    !(state == NCSI_CHANNEL_ACTIVE && !(data & 0x1)))
> > > > >  		return 0;
> > > > >  
> > > > > -	if (state == NCSI_CHANNEL_ACTIVE)
> > > > > +	if (state == NCSI_CHANNEL_ACTIVE && ncsi_channel_is_last(ndp, nc))
> > > > >  		ndp->flags |= NCSI_DEV_RESHUFFLE;
> > > > >  
> > > > >  	ncsi_stop_channel_monitor(nc);
> > > > > diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
> > > > > index 665bee25ec44..6a55df700bcb 100644
> > > > > --- a/net/ncsi/ncsi-manage.c
> > > > > +++ b/net/ncsi/ncsi-manage.c
> > > > > @@ -27,6 +27,24 @@
> > > > >  LIST_HEAD(ncsi_dev_list);
> > > > >  DEFINE_SPINLOCK(ncsi_dev_lock);
> > > > >  
> > > > > +/* Returns true if the given channel is the last channel available */
> > > > > +bool ncsi_channel_is_last(struct ncsi_dev_priv *ndp,
> > > > > +			  struct ncsi_channel *channel)
> > > > > +{
> > > > > +	struct ncsi_package *np;
> > > > > +	struct ncsi_channel *nc;
> > > > > +
> > > > > +	NCSI_FOR_EACH_PACKAGE(ndp, np)
> > > > > +		NCSI_FOR_EACH_CHANNEL(np, nc) {
> > > > > +			if (nc == channel)
> > > > > +				continue;
> > > > > +			if (nc->state == NCSI_CHANNEL_ACTIVE)
> > > > > +				return false;
> > > > > +		}
> > > > > +
> > > > > +	return true;
> > > > > +}
> > > > > +
> > > > >  static void ncsi_report_link(struct ncsi_dev_priv *ndp, bool force_down)
> > > > >  {
> > > > >  	struct ncsi_dev *nd = &ndp->ndev;
> > > > > @@ -266,6 +284,7 @@ struct ncsi_package *ncsi_add_package(struct ncsi_dev_priv *ndp,
> > > > >  	np->ndp = ndp;
> > > > >  	spin_lock_init(&np->lock);
> > > > >  	INIT_LIST_HEAD(&np->channels);
> > > > > +	np->channel_whitelist = UINT_MAX;
> > > > >  
> > > > >  	spin_lock_irqsave(&ndp->lock, flags);
> > > > >  	tmp = ncsi_find_package(ndp, id);
> > > > > @@ -633,6 +652,34 @@ static int set_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc,
> > > > >  	return 0;
> > > > >  }
> > > > >  
> > > > > +/* Determine if a given channel should be the Tx channel */
> > > > > +bool ncsi_channel_is_tx(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc)
> > > > > +{
> > > > > +	struct ncsi_package *np = nc->package;
> > > > > +	struct ncsi_channel *channel;
> > > > > +	struct ncsi_channel_mode *ncm;
> > 
> > Learn from Dave. All local variable declarations from longest to shortest
> > line. :)
> > 
> > > > > +
> > > > > +	NCSI_FOR_EACH_CHANNEL(np, channel) {
> > > > > +		ncm = &channel->modes[NCSI_MODE_TX_ENABLE];
> > > > > +		/* Another channel is already Tx */
> > > > > +		if (ncm->enable)
> > > > > +			return false;
> > > > > +	}

As we don't suspend the old channel when we call the ncsi_stop_dev() function,
this will always be false and we will not set it to the right channel.
If mutli_channel is enabled, suppose that we only need to send TX enable/disable
when  the link is changed.

> > > > > +
> > > > > +	if (!np->preferred_channel)
> > > > > +		return true;
> > > > > +
> > > > > +	if (np->preferred_channel == nc)
> > > > > +		return true;
> > > > > +
> > > > > +	/* The preferred channel is not in the queue and not active */
> > > > > +	if (list_empty(&np->preferred_channel->link) &&
> > > > > +	    np->preferred_channel->state != NCSI_CHANNEL_ACTIVE)
> > > > > +		return true;
> > > > > +
> > > > > +	return false;
> > > > > +}
> > > > > +
> > > > >  static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
> > > > >  {
> > > > >  	struct ncsi_dev *nd = &ndp->ndev;
> > > > > @@ -745,18 +792,22 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
> > > > >  		} else if (nd->state == ncsi_dev_state_config_ebf) {
> > > > >  			nca.type = NCSI_PKT_CMD_EBF;
> > > > >  			nca.dwords[0] = nc->caps[NCSI_CAP_BC].cap;
> > > > > -			nd->state = ncsi_dev_state_config_ecnt;
> > > > > +			if (ncsi_channel_is_tx(ndp, nc))
> > > > > +				nd->state = ncsi_dev_state_config_ecnt;
> > > > > +			else
> > > > > +				nd->state = ncsi_dev_state_config_ec;
> > > > >  #if IS_ENABLED(CONFIG_IPV6)
> > > > >  			if (ndp->inet6_addr_num > 0 &&
> > > > >  			    (nc->caps[NCSI_CAP_GENERIC].cap &
> > > > >  			     NCSI_CAP_GENERIC_MC))
> > > > >  				nd->state = ncsi_dev_state_config_egmf;
> > > > > -			else
> > > > > -				nd->state = ncsi_dev_state_config_ecnt;
> > > > >  		} else if (nd->state == ncsi_dev_state_config_egmf) {
> > > > >  			nca.type = NCSI_PKT_CMD_EGMF;
> > > > >  			nca.dwords[0] = nc->caps[NCSI_CAP_MC].cap;
> > > > > -			nd->state = ncsi_dev_state_config_ecnt;
> > > > > +			if (ncsi_channel_is_tx(ndp, nc))
> > > > > +				nd->state = ncsi_dev_state_config_ecnt;
> > > > > +			else
> > > > > +				nd->state = ncsi_dev_state_config_ec;
> > > > >  #endif /* CONFIG_IPV6 */
> > > > >  		} else if (nd->state == ncsi_dev_state_config_ecnt) {
> > > > >  			nca.type = NCSI_PKT_CMD_ECNT;
> > > > > @@ -840,43 +891,35 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
> > > > >  
> > > > >  static int ncsi_choose_active_channel(struct ncsi_dev_priv *ndp)
> > > > >  {
> > > > > -	struct ncsi_package *np, *force_package;
> > > > > -	struct ncsi_channel *nc, *found, *hot_nc, *force_channel;
> > > > > +	struct ncsi_package *np;
> > > > > +	struct ncsi_channel *nc, *found, *hot_nc;
> > > > >  	struct ncsi_channel_mode *ncm;
> > > > > -	unsigned long flags;
> > > > > +	unsigned long flags, cflags;
> > > > > +	bool with_link;
> > 
> > All local variable declarations from longest to shortest
> > line.
> > 
> > > > >  
> > > > >  	spin_lock_irqsave(&ndp->lock, flags);
> > > > >  	hot_nc = ndp->hot_channel;
> > > > > -	force_channel = ndp->force_channel;
> > > > > -	force_package = ndp->force_package;
> > > > >  	spin_unlock_irqrestore(&ndp->lock, flags);
> > > > >  
> > > > > -	/* Force a specific channel whether or not it has link if we have been
> > > > > -	 * configured to do so
> > > > > -	 */
> > > > > -	if (force_package && force_channel) {
> > > > > -		found = force_channel;
> > > > > -		ncm = &found->modes[NCSI_MODE_LINK];
> > > > > -		if (!(ncm->data[2] & 0x1))
> > > > > -			netdev_info(ndp->ndev.dev,
> > > > > -				    "NCSI: Channel %u forced, but it is link down\n",
> > > > > -				    found->id);
> > > > > -		goto out;
> > > > > -	}
> > > > > -
> > > > > -	/* The search is done once an inactive channel with up
> > > > > -	 * link is found.
> > > > > +	/* By default the search is done once an inactive channel with up
> > > > > +	 * link is found, unless a preferred channel is set.
> > > > > +	 * If multi_package or multi_channel are configured all channels in the
> > > > > +	 * whitelist with link are added to the channel queue.
> > > > >  	 */
> > > > >  	found = NULL;
> > > > > +	with_link = false;
> > > > >  	NCSI_FOR_EACH_PACKAGE(ndp, np) {
> > > > > -		if (ndp->force_package && np != ndp->force_package)
> > > > > +		if (!(ndp->package_whitelist & (0x1 << np->id)))
> > > > >  			continue;
> > > > >  		NCSI_FOR_EACH_CHANNEL(np, nc) {
> > > > > -			spin_lock_irqsave(&nc->lock, flags);
> > > > > +			if (!(np->channel_whitelist & (0x1 << nc->id)))
> > > > > +				continue;
> > > > > +
> > > > > +			spin_lock_irqsave(&nc->lock, cflags);
> > > > >  
> > > > >  			if (!list_empty(&nc->link) ||
> > > > >  			    nc->state != NCSI_CHANNEL_INACTIVE) {
> > > > > -				spin_unlock_irqrestore(&nc->lock, flags);
> > > > > +				spin_unlock_irqrestore(&nc->lock, cflags);
> > > > >  				continue;
> > > > >  			}
> > > > >  
> > > > > @@ -888,32 +931,42 @@ static int ncsi_choose_active_channel(struct ncsi_dev_priv *ndp)
> > > > >  
> > > > >  			ncm = &nc->modes[NCSI_MODE_LINK];
> > > > >  			if (ncm->data[2] & 0x1) {
> > > > 
> > > > This data will not be updated if the channel monitor for it is not running.
> > > > If I move the cable from the current configured channel to the other channel,
> > > > NC-SI module will not detect the link status as the other channel is not configured
> > > > and AEN will not happen.
> > > > Is it per design that NC-SI module will always use the first interface with the link?
> > > 
> > > We'll check the link state of every channel at init, and per-package on
> > > suspend events, but you're right that we only get AENs right now from
> > > configured channels. There's probably no reason why we could at least
> > > enable AENs on all channels even if we don't use them for network, maybe
> > > during the package probe step.
> > > 
> > 
> > To receive the AEN packet, the channel (RX) needs to be enabled.
> > It seems that we need to send Get Link Status command to all channels before choosing
> > The active channel.
> 
> We mostly already do a GLS before this; either we've just started the
> NCSI device in which case we sent a GLS to every package as part of
> probing, or some other event has occured and we've gone through
> ncsi_suspend_channel() which will do ncsi_dev_state_suspend_gls.
> However there are some gaps here, for example the
> ncsi_stop_dev()/ncsi_start_dev() path won't cause GLS commands to be sent
> so we'll have stale information. Continued below..
> 
> > 
> > > > > -				spin_unlock_irqrestore(&nc->lock, flags);
> > > > >  				found = nc;
> > > > > -				goto out;
> > > > > +				with_link = true;
> > > > > +
> > > > > +				spin_lock_irqsave(&ndp->lock, flags);
> > > > > +				list_add_tail_rcu(&found->link,
> > > > > +						  &ndp->channel_queue);
> > > > > +				spin_unlock_irqrestore(&ndp->lock, flags);
> > > > > +
> > > > > +				netdev_dbg(ndp->ndev.dev,
> > > > > +					   "NCSI: Channel %u added to queue (link %s)\n",
> > > > > +					   found->id,
> > > > > +					   ncm->data[2] & 0x1 ? "up" : "down");
> > > > >  			}
> > 
> > If multi_channel is enabled, the interface without link still needs to be processed.
> > Something likes below?
> > 			} else if (np->multi_channel) {
> > 				found = nc;
> > 
> > 				spin_lock_irqsave(&ndp->lock, flags);
> > 				list_add_tail_rcu(&found->link,
> > 						  &ndp->channel_queue);
> > 				spin_unlock_irqrestore(&ndp->lock, flags);
> > 
> > 				netdev_dbg(ndp->ndev.dev,
> > 					   "NCSI: pkg %u ch %u added to queue (link %s)\n",
> > 					   found->package->id,
> > 					   found->id,
> > 					   ncm->data[2] & 0x1 ? "up" : "down");
> > 			}
> 
> Yes we should just configure every channel in the whitelist if
> multi_channel is set - this gives us AENs for that channel and we'll
> recognise when it goes link up.
> 
> It would be good to have a better way of "bouncing" the NCSI
> configuration in these cases though, especially to include a re-probe of
> channel states. I'll include something like that for this series.
> 
> > 
> > > > > +			spin_unlock_irqrestore(&nc->lock, cflags);
> > > > >  
> > > > > -			spin_unlock_irqrestore(&nc->lock, flags);
> > > > > +			if (with_link && !np->multi_channel)
> > > > > +				break;
> > > > >  		}
> > > > > +		if (with_link && !ndp->multi_package)
> > > > > +			break;
> > > > >  	}
> > > > >  
> > > > > -	if (!found) {
> > > > > +	if (!with_link && found) {
> > > > > +		netdev_info(ndp->ndev.dev,
> > > > > +			    "NCSI: No channel with link found, configuring channel %u\n",
> > > > > +			    found->id);
> > > > > +		spin_lock_irqsave(&ndp->lock, flags);
> > > > > +		list_add_tail_rcu(&found->link, &ndp->channel_queue);
> > > > > +		spin_unlock_irqrestore(&ndp->lock, flags);

If multi-channel is enabled and without the link, the last found channel would be added again.

> > > > > +	} else if (!found) {
> > > > >  		netdev_warn(ndp->ndev.dev,
> > > > > -			    "NCSI: No channel found with link\n");
> > > > > +			    "NCSI: No channel found to configure!\n");
> > > > >  		ncsi_report_link(ndp, true);
> > > > >  		return -ENODEV;
> > > > >  	}
> > > > >  
> > > > > -	ncm = &found->modes[NCSI_MODE_LINK];
> > > > > -	netdev_dbg(ndp->ndev.dev,
> > > > > -		   "NCSI: Channel %u added to queue (link %s)\n",
> > > > > -		   found->id, ncm->data[2] & 0x1 ? "up" : "down");
> > > > > -
> > > > > -out:
> > > > > -	spin_lock_irqsave(&ndp->lock, flags);
> > > > > -	list_add_tail_rcu(&found->link, &ndp->channel_queue);
> > > > > -	spin_unlock_irqrestore(&ndp->lock, flags);
> > > > > -
> > > > >  	return ncsi_process_next_channel(ndp);
> > > > >  }
> > > > >  
> > > > > @@ -1428,6 +1481,7 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev,
> > > > >  	INIT_LIST_HEAD(&ndp->channel_queue);
> > > > >  	INIT_LIST_HEAD(&ndp->vlan_vids);
> > > > >  	INIT_WORK(&ndp->work, ncsi_dev_work);
> > > > > +	ndp->package_whitelist = UINT_MAX;
> > > > >  
> > > > >  	/* Initialize private NCSI device */
> > > > >  	spin_lock_init(&ndp->lock);
> > > > > diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c
> > > > > index 32cb7751d216..33a091e6f466 100644
> > > > > --- a/net/ncsi/ncsi-netlink.c
> > > > > +++ b/net/ncsi/ncsi-netlink.c
> > > > 
> > > > Is the following missed in the patch?
> > > > static const struct nla_policy ncsi_genl_policy[NCSI_ATTR_MAX + 1] = {
> > > > ...
> > > > 	[NCSI_ATTR_MULTI_FLAG] =	{ .type = NLA_FLAG },
> > > > 	[NCSI_ATTR_PACKAGE_MASK] =	{ .type = NLA_U32 },
> > > > 	[NCSI_ATTR_CHANNEL_MASK] =	{ .type = NLA_U32 },
> > > 
> > > Oops, added.
> > > 
> > > > > @@ -67,7 +67,7 @@ static int ncsi_write_channel_info(struct sk_buff *skb,
> > > > >  	nla_put_u32(skb, NCSI_CHANNEL_ATTR_LINK_STATE, m->data[2]);
> > > > >  	if (nc->state == NCSI_CHANNEL_ACTIVE)
> > > > >  		nla_put_flag(skb, NCSI_CHANNEL_ATTR_ACTIVE);
> > > > > -	if (ndp->force_channel == nc)
> > > > > +	if (nc == nc->package->preferred_channel)
> > > > >  		nla_put_flag(skb, NCSI_CHANNEL_ATTR_FORCED);
> > > > >  
> > > > >  	nla_put_u32(skb, NCSI_CHANNEL_ATTR_VERSION_MAJOR, nc->version.version);
> > > > > @@ -112,7 +112,7 @@ static int ncsi_write_package_info(struct sk_buff *skb,
> > > > >  		if (!pnest)
> > > > >  			return -ENOMEM;
> > > > >  		nla_put_u32(skb, NCSI_PKG_ATTR_ID, np->id);
> > > > > -		if (ndp->force_package == np)
> > > > > +		if ((0x1 << np->id) == ndp->package_whitelist)
> > > > >  			nla_put_flag(skb, NCSI_PKG_ATTR_FORCED);
> > > > >  		cnest = nla_nest_start(skb, NCSI_PKG_ATTR_CHANNEL_LIST);
> > > > >  		if (!cnest) {
> > > > > @@ -288,45 +288,54 @@ static int ncsi_set_interface_nl(struct sk_buff *msg, struct genl_info *info)
> > > > >  	package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]);
> > > > >  	package = NULL;
> > > > >  
> > > > > -	spin_lock_irqsave(&ndp->lock, flags);
> > > > > -
> > > > >  	NCSI_FOR_EACH_PACKAGE(ndp, np)
> > > > >  		if (np->id == package_id)
> > > > >  			package = np;
> > > > >  	if (!package) {
> > > > >  		/* The user has set a package that does not exist */
> > > > > -		spin_unlock_irqrestore(&ndp->lock, flags);
> > > > >  		return -ERANGE;
> > > > >  	}
> > > > >  
> > > > >  	channel = NULL;
> > > > > -	if (!info->attrs[NCSI_ATTR_CHANNEL_ID]) {
> > > > > -		/* Allow any channel */
> > > > > -		channel_id = NCSI_RESERVED_CHANNEL;
> > > > > -	} else {
> > > > > +	if (info->attrs[NCSI_ATTR_CHANNEL_ID]) {
> > > > >  		channel_id = nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_ID]);
> > > > >  		NCSI_FOR_EACH_CHANNEL(package, nc)
> > > > > -			if (nc->id == channel_id)
> > > > > +			if (nc->id == channel_id) {
> > > > >  				channel = nc;
> > > > > +				break;
> > > > > +			}
> > > > > +		if (!channel) {
> > > > > +			netdev_info(ndp->ndev.dev,
> > > > > +				    "NCSI: Channel %u does not exist!\n",
> > > > > +				    channel_id);
> > > > > +			return -ERANGE;
> > > > > +		}
> > > > >  	}
> > > > >  
> > > > > -	if (channel_id != NCSI_RESERVED_CHANNEL && !channel) {
> > > > > -		/* The user has set a channel that does not exist on this
> > > > > -		 * package
> > > > > -		 */
> > > > > -		spin_unlock_irqrestore(&ndp->lock, flags);
> > > > > -		netdev_info(ndp->ndev.dev, "NCSI: Channel %u does not exist!\n",
> > > > > -			    channel_id);
> > > > > -		return -ERANGE;
> > > > > -	}
> > > > > -
> > > > > -	ndp->force_package = package;
> > > > > -	ndp->force_channel = channel;
> > > > > +	spin_lock_irqsave(&ndp->lock, flags);
> > > > > +	ndp->package_whitelist = 0x1 << package->id;
> > > > > +	ndp->multi_package = false;
> > > > >  	spin_unlock_irqrestore(&ndp->lock, flags);
> > > > >  
> > > > > -	netdev_info(ndp->ndev.dev, "Set package 0x%x, channel 0x%x%s as preferred\n",
> > > > > -		    package_id, channel_id,
> > > > > -		    channel_id == NCSI_RESERVED_CHANNEL ? " (any)" : "");
> > > > > +	spin_lock_irqsave(&package->lock, flags);
> > > > > +	package->multi_channel = false;
> > > > > +	if (channel) {
> > > > > +		package->channel_whitelist = 0x1 << channel->id;
> > > > > +		package->preferred_channel = channel;
> > > > > +	} else {
> > > > > +		/* Allow any channel */
> > > > > +		package->channel_whitelist = UINT_MAX;
> > > > > +		package->preferred_channel = NULL;
> > > > > +	}
> > > > > +	spin_unlock_irqrestore(&package->lock, flags);
> > > > > +
> > > > > +	if (channel)
> > > > > +		netdev_info(ndp->ndev.dev,
> > > > > +			    "Set package 0x%x, channel 0x%x as preferred\n",
> > > > > +			    package_id, channel_id);
> > > > > +	else
> > > > > +		netdev_info(ndp->ndev.dev, "Set package 0x%x as preferred\n",
> > > > > +			    package_id);
> > > > >  
> > > > >  	/* Bounce the NCSI channel to set changes */
> > > > >  	ncsi_stop_dev(&ndp->ndev);
> > > > > @@ -338,6 +347,7 @@ static int ncsi_set_interface_nl(struct sk_buff *msg, struct genl_info *info)
> > > > >  static int ncsi_clear_interface_nl(struct sk_buff *msg, struct genl_info *info)
> > > > >  {
> > > > >  	struct ncsi_dev_priv *ndp;
> > > > > +	struct ncsi_package *np;
> > > > >  	unsigned long flags;
> > > > >  
> > > > >  	if (!info || !info->attrs)
> > > > > @@ -351,11 +361,19 @@ static int ncsi_clear_interface_nl(struct sk_buff *msg, struct genl_info *info)
> > > > >  	if (!ndp)
> > > > >  		return -ENODEV;
> > > > >  
> > > > > -	/* Clear any override */
> > > > > +	/* Reset any whitelists and disable multi mode */
> > > > >  	spin_lock_irqsave(&ndp->lock, flags);
> > > > > -	ndp->force_package = NULL;
> > > > > -	ndp->force_channel = NULL;
> > > > > +	ndp->package_whitelist = UINT_MAX;
> > > > > +	ndp->multi_package = false;
> > > > >  	spin_unlock_irqrestore(&ndp->lock, flags);
> > > > > +
> > > > > +	NCSI_FOR_EACH_PACKAGE(ndp, np) {
> > > > > +		spin_lock_irqsave(&np->lock, flags);
> > > > > +		np->multi_channel = false;
> > > > > +		np->channel_whitelist = UINT_MAX;
> > > > > +		np->preferred_channel = NULL;
> > > > > +		spin_unlock_irqrestore(&np->lock, flags);
> > > > > +	}
> > > > >  	netdev_info(ndp->ndev.dev, "NCSI: Cleared preferred package/channel\n");
> > > > >  
> > > > >  	/* Bounce the NCSI channel to set changes */
> > > > > @@ -365,6 +383,137 @@ static int ncsi_clear_interface_nl(struct sk_buff *msg, struct genl_info *info)
> > > > >  	return 0;
> > > > >  }
> > > > >  
> > > > > +static int ncsi_set_package_mask_nl(struct sk_buff *msg,
> > > > > +				    struct genl_info *info)
> > > > > +{
> > > > > +	struct ncsi_dev_priv *ndp;
> > > > > +	unsigned long flags;
> > > > > +	int rc;
> > > > > +
> > > > > +	if (!info || !info->attrs)
> > > > > +		return -EINVAL;
> > > > > +
> > > > > +	if (!info->attrs[NCSI_ATTR_IFINDEX])
> > > > > +		return -EINVAL;
> > > > > +
> > > > > +	if (!info->attrs[NCSI_ATTR_PACKAGE_MASK])
> > > > > +		return -EINVAL;
> > > > > +
> > > > > +	ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)),
> > > > > +			       nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX]));
> > > > > +	if (!ndp)
> > > > > +		return -ENODEV;
> > > > > +
> > > > > +	spin_lock_irqsave(&ndp->lock, flags);
> > > > > +	ndp->package_whitelist =
> > > > > +		nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_MASK]);
> > > > > +
> > > > > +	if (nla_get_flag(info->attrs[NCSI_ATTR_MULTI_FLAG])) {
> > > > > +		if (ndp->flags & NCSI_DEV_HWA) {
> > > > > +			ndp->multi_package = true;
> > > > > +			rc = 0;
> > > > > +		} else {
> > > > > +			netdev_err(ndp->ndev.dev,
> > > > > +				   "NCSI: Can't use multiple packages without HWA\n");
> > > > > +			rc = -EPERM;
> > > > > +		}
> > > > > +	} else {
> > > > > +		rc = 0;

I mean this one. If NCSI_ATTR_MULTI_FLAG attribute is not set, it means user disables it.

> > > > > +	}
> > 
> > If the flag is not set, do we need to clear the multi_package flag? It is possible that it is
> > user's intension to disable the multi-mode.
> > 	} else {
> > 		ndp->multi_package = false;
> > 		rc = 0;
> > 	}
> 
> Yep, good catch (although technically multi_package could not have been
> set true in the past if HWA is not available).
> 
> > 
> > > > > +
> > > > > +	spin_unlock_irqrestore(&ndp->lock, flags);
> > > > > +
> > > > > +	if (!rc) {
> > > > > +		/* Bounce the NCSI channel to set changes */
> > > > > +		ncsi_stop_dev(&ndp->ndev);

Inside the ncsi_stop_dev() function, do we need to suspend channel?
Disable channel and TX?

> > > > > +		ncsi_start_dev(&ndp->ndev);
> > > > 
> > > > Is it possible to delay the restart? If we have two packages, we might send
> > > > set_package_mask command once and set_channel_mask command twice.
> > > > We will see the unnecessary reconfigurations in a very short period time.
> > > 
> > > We could probably do with a better way of bouncing the link here too. I
> > > suppose we could schedule the actual link 'bounce' to occur a second or
> > > two later, and delay if we receive new configuration in that time.
> > > Perhaps something outside of the scope of this patch though.
> > > 
> > > > > +	}
> > > > > +
> > > > > +	return rc;
> > > > > +}
> > > > > +
> > > > > +static int ncsi_set_channel_mask_nl(struct sk_buff *msg,
> > > > > +				    struct genl_info *info)
> > > > > +{
> > > > > +	struct ncsi_package *np, *package;
> > > > > +	struct ncsi_channel *nc, *channel;
> > > > > +	struct ncsi_dev_priv *ndp;
> > > > > +	unsigned long flags;
> > > > > +	u32 package_id, channel_id;
> > 
> > All local variable declarations from longest to shortest
> > line.
> > 
> > > > > +
> > > > > +	if (!info || !info->attrs)
> > > > > +		return -EINVAL;
> > > > > +
> > > > > +	if (!info->attrs[NCSI_ATTR_IFINDEX])
> > > > > +		return -EINVAL;
> > > > > +
> > > > > +	if (!info->attrs[NCSI_ATTR_PACKAGE_ID])
> > > > > +		return -EINVAL;
> > > > > +
> > > > > +	if (!info->attrs[NCSI_ATTR_CHANNEL_MASK])
> > > > > +		return -EINVAL;
> > > > > +
> > > > > +	ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)),
> > > > > +			       nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX]));
> > > > > +	if (!ndp)
> > > > > +		return -ENODEV;
> > > > > +
> > > > > +	package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]);
> > > > > +	package = NULL;
> > > > > +	NCSI_FOR_EACH_PACKAGE(ndp, np)
> > > > > +		if (np->id == package_id) {
> > > > > +			package = np;
> > > > > +			break;
> > > > > +		}
> > > > > +	if (!package)
> > > > > +		return -ERANGE;
> > > > > +
> > > > > +	spin_lock_irqsave(&package->lock, flags);
> > > > > +
> > > > > +	channel = NULL;
> > > > > +	if (info->attrs[NCSI_ATTR_CHANNEL_ID]) {
> > > > > +		channel_id = nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_ID]);
> > > > > +		NCSI_FOR_EACH_CHANNEL(np, nc)
> > > > > +			if (nc->id == channel_id) {
> > > > > +				channel = nc;
> > > > > +				break;
> > > > > +			}
> > > > > +		if (!channel) {
> > > > > +			spin_unlock_irqrestore(&package->lock, flags);
> > > > > +			return -ERANGE;
> > > > > +		}
> > > > > +		netdev_dbg(ndp->ndev.dev,
> > > > > +			   "NCSI: Channel %u set as preferred channel\n",
> > > > > +			   channel->id);
> > > > > +	}
> > > > > +
> > > > > +	package->channel_whitelist =
> > > > > +		nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_MASK]);
> > > > > +	if (package->channel_whitelist == 0)
> > > > > +		netdev_dbg(ndp->ndev.dev,
> > > > > +			   "NCSI: Package %u set to all channels disabled\n",
> > > > > +			   package->id);
> > > > > +
> > > > > +	package->preferred_channel = channel;
> > > > > +
> > > > > +	if (nla_get_flag(info->attrs[NCSI_ATTR_MULTI_FLAG])) {
> > > > > +		package->multi_channel = true;
> > > > > +		netdev_info(ndp->ndev.dev,
> > > > > +			    "NCSI: Multi-channel enabled on package %u\n",
> > > > > +			    package_id);
> > > > > +	} else {
> > > > > +		package->multi_channel = false;
> > > > > +	}
> > > > > +
> > > > > +	spin_unlock_irqrestore(&package->lock, flags);
> > > > > +
> > > > > +	/* Bounce the NCSI channel to set changes */
> > > > > +	ncsi_stop_dev(&ndp->ndev);
> > > > > +	ncsi_start_dev(&ndp->ndev);
> > > > 
> > > > Same question as set_package_mask function.
> > > > Is it possible to delay the restart? If we have two packages, we might send
> > > > set_package_mask command once and set_channel_mask command twice.
> > > > We will see the unnecessary reconfigurations in a very short period time.
> > > > 
> > > > > +
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > >  static const struct genl_ops ncsi_ops[] = {
> > > > >  	{
> > > > >  		.cmd = NCSI_CMD_PKG_INFO,
> > > > > @@ -385,6 +534,18 @@ static const struct genl_ops ncsi_ops[] = {
> > > > >  		.doit = ncsi_clear_interface_nl,
> > > > >  		.flags = GENL_ADMIN_PERM,
> > > > >  	},
> > > > > +	{
> > > > > +		.cmd = NCSI_CMD_SET_PACKAGE_MASK,
> > > > > +		.policy = ncsi_genl_policy,
> > > > > +		.doit = ncsi_set_package_mask_nl,
> > > > > +		.flags = GENL_ADMIN_PERM,
> > > > > +	},
> > > > > +	{
> > > > > +		.cmd = NCSI_CMD_SET_CHANNEL_MASK,
> > > > > +		.policy = ncsi_genl_policy,
> > > > > +		.doit = ncsi_set_channel_mask_nl,
> > > > > +		.flags = GENL_ADMIN_PERM,
> > > > > +	},
> > > > >  };
> > > > >  
> > > > >  static struct genl_family ncsi_genl_family __ro_after_init = {
> > > > > diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c
> > > > > index d66b34749027..02ce7626b579 100644
> > > > > --- a/net/ncsi/ncsi-rsp.c
> > > > > +++ b/net/ncsi/ncsi-rsp.c
> > > > > @@ -241,7 +241,7 @@ static int ncsi_rsp_handler_dcnt(struct ncsi_request *nr)
> > > > >  	if (!ncm->enable)
> > > > >  		return 0;
> > > > >  
> > > > > -	ncm->enable = 1;
> > > > > +	ncm->enable = 0;
> > > > >  	return 0;
> > > > >  }
> > > > >  
> > > > > -- 
> > > > > 2.19.0



^ permalink raw reply

* [PATCH 1/1] crypto:chelsio: Update ntx queue received from cxgb4
From: Harsh Jain @ 2018-10-12 11:46 UTC (permalink / raw)
  To: herbert, atul.gupta, indranil, swise, varun, ganeshgr, netdev,
	linux-crypto
  Cc: Harsh Jain
In-Reply-To: <cover.1539343454.git.harsh@chelsio.com>

Update cxgb4 to send No. of Tx Queue created in lldinfo struct
and use the same ntxq in chcr driver.

This patch depends on following commit
commit	add92a817e60e308a419693413a38d9d1e663aff
"Fix memory corruption in DMA Mapped buffers"

Signed-off-by: Harsh Jain <harsh@chelsio.com>
---
 drivers/crypto/chelsio/chcr_algo.c             |  3 +--
 drivers/crypto/chelsio/chcr_core.c             |  2 +-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c | 19 +++++++++++++++----
 3 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/drivers/crypto/chelsio/chcr_algo.c b/drivers/crypto/chelsio/chcr_algo.c
index 010bbf6..9b937cb 100644
--- a/drivers/crypto/chelsio/chcr_algo.c
+++ b/drivers/crypto/chelsio/chcr_algo.c
@@ -1337,8 +1337,7 @@ static int chcr_device_init(struct chcr_context *ctx)
 		}
 		ctx->dev = u_ctx->dev;
 		adap = padap(ctx->dev);
-		ntxq = min_not_zero((unsigned int)u_ctx->lldi.nrxq,
-				    adap->vres.ncrypto_fc);
+		ntxq = u_ctx->lldi.ntxq;
 		rxq_perchan = u_ctx->lldi.nrxq / u_ctx->lldi.nchan;
 		txq_perchan = ntxq / u_ctx->lldi.nchan;
 		spin_lock(&ctx->dev->lock_chcr_dev);
diff --git a/drivers/crypto/chelsio/chcr_core.c b/drivers/crypto/chelsio/chcr_core.c
index 04f277c..2399ce3 100644
--- a/drivers/crypto/chelsio/chcr_core.c
+++ b/drivers/crypto/chelsio/chcr_core.c
@@ -43,7 +43,7 @@ static chcr_handler_func work_handlers[NUM_CPL_CMDS] = {
 static struct cxgb4_uld_info chcr_uld_info = {
 	.name = DRV_MODULE_NAME,
 	.nrxq = MAX_ULD_QSETS,
-	.ntxq = MAX_ULD_QSETS,
+	/* Max ntxq will be derived from fw config file*/
 	.rxq_size = 1024,
 	.add = chcr_uld_add,
 	.state_change = chcr_uld_state_change,
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c
index 4bc2110..ad4fa0d 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c
@@ -520,10 +520,19 @@ setup_sge_txq_uld(struct adapter *adap, unsigned int uld_type,
 	txq_info = kzalloc(sizeof(*txq_info), GFP_KERNEL);
 	if (!txq_info)
 		return -ENOMEM;
+	if (uld_type == CXGB4_ULD_CRYPTO) {
+		i = min_t(int, adap->vres.ncrypto_fc,
+			  num_online_cpus());
+		txq_info->ntxq = rounddown(i, adap->params.nports);
+		if (txq_info->ntxq <= 0) {
+			dev_warn(adap->pdev_dev, "Crypto Tx Queues can't be zero\n");
+			return -EINVAL;
+		}
 
-	i = min_t(int, uld_info->ntxq, num_online_cpus());
-	txq_info->ntxq = roundup(i, adap->params.nports);
-
+	} else {
+		i = min_t(int, uld_info->ntxq, num_online_cpus());
+		txq_info->ntxq = roundup(i, adap->params.nports);
+	}
 	txq_info->uldtxq = kcalloc(txq_info->ntxq, sizeof(struct sge_uld_txq),
 				   GFP_KERNEL);
 	if (!txq_info->uldtxq) {
@@ -546,11 +555,14 @@ static void uld_queue_init(struct adapter *adap, unsigned int uld_type,
 			   struct cxgb4_lld_info *lli)
 {
 	struct sge_uld_rxq_info *rxq_info = adap->sge.uld_rxq_info[uld_type];
+	int tx_uld_type = TX_ULD(uld_type);
+	struct sge_uld_txq_info *txq_info = adap->sge.uld_txq_info[tx_uld_type];
 
 	lli->rxq_ids = rxq_info->rspq_id;
 	lli->nrxq = rxq_info->nrxq;
 	lli->ciq_ids = rxq_info->rspq_id + rxq_info->nrxq;
 	lli->nciq = rxq_info->nciq;
+	lli->ntxq = txq_info->ntxq;
 }
 
 int t4_uld_mem_alloc(struct adapter *adap)
@@ -634,7 +646,6 @@ static void uld_init(struct adapter *adap, struct cxgb4_lld_info *lld)
 	lld->ports = adap->port;
 	lld->vr = &adap->vres;
 	lld->mtus = adap->params.mtus;
-	lld->ntxq = adap->sge.ofldqsets;
 	lld->nchan = adap->params.nports;
 	lld->nports = adap->params.nports;
 	lld->wr_cred = adap->params.ofldq_wr_cred;
-- 
2.1.4

^ permalink raw reply related

* URGENT RESPONSE NEEDED
From: Sean Kim. @ 2018-10-12 11:59 UTC (permalink / raw)
  To: netdev

Hello my dear.

Did you receive my email message to you? Please, get back to me ASAP as the matter is becoming late.  Expecting your urgent response.

Sean.

^ permalink raw reply

* [PATCH net-next] net: ena: fix unintended sign extension
From: Gustavo A. R. Silva @ 2018-10-12 19:49 UTC (permalink / raw)
  To: Netanel Belgazal, Saeed Bishara, Zorik Machulsky, David S. Miller
  Cc: netdev, linux-kernel, Gustavo A. R. Silva

In the following expression:

372                size = io_sq->bounce_buf_ctrl.buffer_size *
373                         io_sq->bounce_buf_ctrl.buffers_num;

both buffer_size and buffers_num are of type u16 (16 bits, unsigned),
so they are promoted to type int (32 bits, signed) and then
sign-extended to type size_t.

Fix this by casting io_sq->bounce_buf_ctrl.buffer_size to size_t in
order to avoid the sign extension and unintended results.

Addresses-Coverity-ID: 1474187 ("Unintended sign extension")
Addresses-Coverity-ID: 1474189 ("Unintended sign extension")
Fixes: 689b2bdaaa14 ("net: ena: add functions for handling Low Latency Queues in ena_com")
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
---
 drivers/net/ethernet/amazon/ena/ena_com.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/amazon/ena/ena_com.c b/drivers/net/ethernet/amazon/ena/ena_com.c
index 420cede..9a8130e 100644
--- a/drivers/net/ethernet/amazon/ena/ena_com.c
+++ b/drivers/net/ethernet/amazon/ena/ena_com.c
@@ -369,7 +369,7 @@ static int ena_com_init_io_sq(struct ena_com_dev *ena_dev,
 			ENA_COM_BOUNCE_BUFFER_CNTRL_CNT;
 		io_sq->bounce_buf_ctrl.next_to_use = 0;
 
-		size = io_sq->bounce_buf_ctrl.buffer_size *
+		size = (size_t)io_sq->bounce_buf_ctrl.buffer_size *
 			 io_sq->bounce_buf_ctrl.buffers_num;
 
 		dev_node = dev_to_node(ena_dev->dmadev);
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next] octeontx2-af: remove unused cgx_fwi_link_change
From: Arnd Bergmann @ 2018-10-12 19:52 UTC (permalink / raw)
  To: Sunil Goutham, Linu Cherian, Geetha sowjanya, Jerin Jacob,
	David S. Miller
  Cc: Arnd Bergmann, YueHaibing, Nithya Mani, netdev, linux-kernel

The newly added driver causes a warning about a function that is
not used anywhere:

drivers/net/ethernet/marvell/octeontx2/af/cgx.c:320:12: error: 'cgx_fwi_link_change' defined but not used [-Werror=unused-function]

Remove it for now, until a user gets added. If we want to use this
function from another module, we also need a declaration in a header
file, which is currently missing, so it would have to change anyway.

Fixes: 1463f382f58d ("octeontx2-af: Add support for CGX link management")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 drivers/net/ethernet/marvell/octeontx2/af/cgx.c | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c
index 2cf8e402c31e..5328ecc8cea2 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c
@@ -58,9 +58,6 @@ struct cgx {
 
 static LIST_HEAD(cgx_list);
 
-/* CGX PHY management internal APIs */
-static int cgx_fwi_link_change(struct cgx *cgx, int lmac_id, bool en);
-
 /* Supported devices */
 static const struct pci_device_id cgx_id_table[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, PCI_DEVID_OCTEONTX2_CGX) },
@@ -317,20 +314,6 @@ int cgx_lmac_evh_register(struct cgx_event_cb *cb, void *cgxd, int lmac_id)
 }
 EXPORT_SYMBOL(cgx_lmac_evh_register);
 
-static int cgx_fwi_link_change(struct cgx *cgx, int lmac_id, bool enable)
-{
-	u64 req = 0;
-	u64 resp;
-
-	if (enable)
-		req = FIELD_SET(CMDREG_ID, CGX_CMD_LINK_BRING_UP, req);
-	else
-		req = FIELD_SET(CMDREG_ID, CGX_CMD_LINK_BRING_DOWN, req);
-
-	return cgx_fwi_cmd_generic(req, &resp, cgx, lmac_id);
-}
-EXPORT_SYMBOL(cgx_fwi_link_change);
-
 static inline int cgx_fwi_read_version(u64 *resp, struct cgx *cgx)
 {
 	u64 req = 0;
-- 
2.18.0

^ permalink raw reply related

* [PATCH net] ip6_tunnel: Don't update PMTU on tunnels with collect_md
From: Stefano Brivio @ 2018-10-12 12:32 UTC (permalink / raw)
  To: David S. Miller; +Cc: Nicolas Dichtel, Alexei Starovoitov, netdev

Commit 8d79266bc48c ("ip6_tunnel: add collect_md mode to IPv6
tunnels") introduced a check to avoid updating PMTU when
collect_md mode is enabled.

Later, commit f15ca723c1eb ("net: don't call update_pmtu
unconditionally") dropped this check, I guess inadvertently.
Restore it.

Fixes: f15ca723c1eb ("net: don't call update_pmtu unconditionally")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 net/ipv6/ip6_tunnel.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index a0b6932c3afd..6f05e0f74bdf 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1136,7 +1136,8 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
 	mtu = max(mtu, skb->protocol == htons(ETH_P_IPV6) ?
 		       IPV6_MIN_MTU : IPV4_MIN_MTU);
 
-	skb_dst_update_pmtu(skb, mtu);
+	if (!t->parms.collect_md)
+		skb_dst_update_pmtu(skb, mtu);
 	if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) {
 		*pmtu = mtu;
 		err = -EMSGSIZE;
-- 
2.19.1

^ permalink raw reply related

* Re: [PATCH v4] Wait for running BPF programs when updating map-in-map
From: Joel Fernandes @ 2018-10-12 20:54 UTC (permalink / raw)
  To: Daniel Colascione
  Cc: linux-kernel, timmurray, netdev, Alexei Starovoitov,
	Lorenzo Colitti, Chenbo Feng, Mathieu Desnoyers,
	Alexei Starovoitov, Daniel Borkmann, kernel-team
In-Reply-To: <20181012105427.243779-1-dancol@google.com>

On Fri, Oct 12, 2018 at 03:54:27AM -0700, Daniel Colascione wrote:
> The map-in-map frequently serves as a mechanism for atomic
> snapshotting of state that a BPF program might record.  The current
> implementation is dangerous to use in this way, however, since
> userspace has no way of knowing when all programs that might have
> retrieved the "old" value of the map may have completed.
> 
> This change ensures that map update operations on map-in-map map types
> always wait for all references to the old map to drop before returning
> to userspace.
> 
> Signed-off-by: Daniel Colascione <dancol@google.com>
> ---
>  kernel/bpf/syscall.c | 14 ++++++++++++++
>  1 file changed, 14 insertions(+)
> 
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 8339d81cba1d..d7c16ae1e85a 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -741,6 +741,18 @@ static int map_lookup_elem(union bpf_attr *attr)
>  	return err;
>  }
>  
> +static void maybe_wait_bpf_programs(struct bpf_map *map)
> +{
> +	/* Wait for any running BPF programs to complete so that
> +	 * userspace, when we return to it, knows that all programs
> +	 * that could be running use the new map value.
> +	 */
> +	if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS ||
> +	    map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) {
> +		synchronize_rcu();
> +	}
> +}
> +
>  #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
>  
>  static int map_update_elem(union bpf_attr *attr)
> @@ -831,6 +843,7 @@ static int map_update_elem(union bpf_attr *attr)
>  	}
>  	__this_cpu_dec(bpf_prog_active);
>  	preempt_enable();
> +	maybe_wait_bpf_programs(map);
>  out:
>  free_value:
>  	kfree(value);
> @@ -883,6 +896,7 @@ static int map_delete_elem(union bpf_attr *attr)
>  	rcu_read_unlock();
>  	__this_cpu_dec(bpf_prog_active);
>  	preempt_enable();
> +	maybe_wait_bpf_programs(map);

Looks good to me,

Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>

Also I believe that those rcu_read_lock() and rcu_read_unlock() calls in the
existing code are useless. preempt_disable()d code is already an RCU
read-side section, and synchronize_rcu and friends work on those type of
read-side sections as well (as of recent kernel releases) however removing it
may make lockdep unhappy, unless we also replace all rcu_dereference() usages
with rcu_dereference_sched(), so lets leave that alone for now I guess.

thanks,

- Joel

^ permalink raw reply

* Re: [PATCH 4/7] net: qmi_wwan: add usbnet -> priv function
From: Ben Dooks @ 2018-10-12 13:22 UTC (permalink / raw)
  To: Bjørn Mork
  Cc: davem, netdev, linux-usb, linux-kernel, linux-kernel, gregkh,
	steve.glendinning
In-Reply-To: <87woqn30ly.fsf@miraculix.mork.no>

On 12/10/18 11:44, Bjørn Mork wrote:
> Ben Dooks <ben.dooks@codethink.co.uk> writes:
> 
>> +static inline struct qmi_wwan_state *usbnet_to_qmi(struct usbnet *usb)
>> +{
>> +	return (void *) &usb->data;
>> +}
> 
> 
> checkpatch doesn't like this, and it is also inconsistent with the
> coding style elsewhere in the driver.

Thank you for pointing this out, will fix in v2.

> CHECK: No space is necessary after a cast
> #30: FILE: drivers/net/usb/qmi_wwan.c:81:
> +       return (void *) &usb->data;
> 
> total: 0 errors, 0 warnings, 1 checks, 115 lines checked
> 
> 
> And as for consistency:  I realize that "dev" is a very generic name,
> but it's used consistendly for all struct usbnet pointers in the driver:

Ok, I'll change it.

> 
> bjorn@miraculix:/usr/local/src/git/linux$ git grep 'struct usbnet' drivers/net/usb/qmi_wwan.c
> drivers/net/usb/qmi_wwan.c:static struct net_device *qmimux_find_dev(struct usbnet *dev, u8 mux_id)
> drivers/net/usb/qmi_wwan.c:static bool qmimux_has_slaves(struct usbnet *dev)
> drivers/net/usb/qmi_wwan.c:static int qmimux_rx_fixup(struct usbnet *dev, struct sk_buff *skb)
> drivers/net/usb/qmi_wwan.c:     struct usbnet *dev = netdev_priv(net);
> drivers/net/usb/qmi_wwan.c:     struct usbnet *dev = netdev_priv(to_net_dev(d));
> drivers/net/usb/qmi_wwan.c:     struct usbnet *dev = netdev_priv(to_net_dev(d));
> drivers/net/usb/qmi_wwan.c:     struct usbnet *dev = netdev_priv(to_net_dev(d));
> drivers/net/usb/qmi_wwan.c:     struct usbnet *dev = netdev_priv(to_net_dev(d));
> drivers/net/usb/qmi_wwan.c:static int qmi_wwan_rx_fixup(struct usbnet *dev, struct sk_buff *skb)
> drivers/net/usb/qmi_wwan.c:static int qmi_wwan_manage_power(struct usbnet *dev, int on)
> drivers/net/usb/qmi_wwan.c:     struct usbnet *dev = usb_get_intfdata(intf);
> drivers/net/usb/qmi_wwan.c:static int qmi_wwan_register_subdriver(struct usbnet *dev)
> drivers/net/usb/qmi_wwan.c:static int qmi_wwan_change_dtr(struct usbnet *dev, bool on)
> drivers/net/usb/qmi_wwan.c:static int qmi_wwan_bind(struct usbnet *dev, struct usb_interface *intf)
> drivers/net/usb/qmi_wwan.c:     BUILD_BUG_ON((sizeof(((struct usbnet *)0)->data) <
> drivers/net/usb/qmi_wwan.c:static void qmi_wwan_unbind(struct usbnet *dev, struct usb_interface *intf)
> drivers/net/usb/qmi_wwan.c:     struct usbnet *dev = usb_get_intfdata(intf);
> drivers/net/usb/qmi_wwan.c:     struct usbnet *dev = usb_get_intfdata(intf);
> drivers/net/usb/qmi_wwan.c:     struct usbnet *dev = usb_get_intfdata(intf);
> 
> The name was chosen to be consistent with the cdc_ether usage. I'd
> prefer it if this function could use the "dev" name, to avoid confusing
> things unnecessarly with yet another generic name like "usb". Which
> isn't used anywhere else in the driver, and could just as easily refer
> to a struct usb_device or struct usb_interface.

Ok, should be fairly easy to change this.

> (yes, I know there are a couple of "struct usb_device *dev" cases. But
> I'd rather see those renamed TBH)

I'd rather not touch that at the moment, this is already gaining complexity.

> I also don't see why this function should be qmi_wwan specific. No need
> to duplicate the same function in every minidriver, when you can do with
> two generic helpers (maybe with more meaningful names):

I personally prefer the return type to be specifically the private
data type for the driver. It makes it a bit easier to spot when
you don't get the cast right.

The functions below are the idea I am working towards for the
usbnet driver, I was trying to avoid doing everything in one
go. Making the accessor functions means the next round of changes
can be much smaller, touching 1-2 areas per driver.

> static inline void *usbnet_priv(const struct usbnet *dev)
> {
>          return (void *)&dev->data;
> }
> 
> static inline void *usbnet_priv0(const struct usbnet * *dev)
> {
> 	return (void *)dev->data[0];
> }

This is probably the end-game of the patch series, just need to
look at a couple of issues and see if there's anything better
than can be done.

I might also add a usbnet_set_privdata(dev, data) or something
like usbnet_alloc_privdata(dev, sizeof(data).

Note, there's also the fun here of the usbnet having private data
for itself, and these sub-drivers also having their own private
data... this probably needs to be named carefully.

> Please fix the checkpatch warning and consider the other comments.
> 
> Personally I don't really think it makes the code any easier to read.
> But if you want it, then I'm fine this. I guess I've grown too used to
> seeing those casts ;-)

-- 
Ben Dooks				http://www.codethink.co.uk/
Senior Engineer				Codethink - Providing Genius

https://www.codethink.co.uk/privacy.html

^ permalink raw reply

* [PATCH net-next,v2] hv_netvsc: fix vf serial matching with pci slot info
From: Haiyang Zhang @ 2018-10-12 20:55 UTC (permalink / raw)
  To: davem, netdev
  Cc: haiyangz, kys, sthemmin, olaf, vkuznets, devel, linux-kernel

From: Haiyang Zhang <haiyangz@microsoft.com>

The VF device's serial number is saved as a string in PCI slot's
kobj name, not the slot->number. This patch corrects the netvsc
driver, so the VF device can be successfully paired with synthetic
NIC.

Fixes: 00d7ddba1143 ("hv_netvsc: pair VF based on serial number")
Reported-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
---
 drivers/net/hyperv/netvsc_drv.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 9bcaf204a7d4..ded623862003 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -2030,14 +2030,15 @@ static void netvsc_vf_setup(struct work_struct *w)
 	rtnl_unlock();
 }
 
-/* Find netvsc by VMBus serial number.
- * The PCI hyperv controller records the serial number as the slot.
+/* Find netvsc by VF serial number.
+ * The PCI hyperv controller records the serial number as the slot kobj name.
  */
 static struct net_device *get_netvsc_byslot(const struct net_device *vf_netdev)
 {
 	struct device *parent = vf_netdev->dev.parent;
 	struct net_device_context *ndev_ctx;
 	struct pci_dev *pdev;
+	u32 serial;
 
 	if (!parent || !dev_is_pci(parent))
 		return NULL; /* not a PCI device */
@@ -2048,16 +2049,22 @@ static struct net_device *get_netvsc_byslot(const struct net_device *vf_netdev)
 		return NULL;
 	}
 
+	if (kstrtou32(kobject_name(&pdev->slot->kobj), 10, &serial)) {
+		netdev_notice(vf_netdev, "Invalid vf serial:%s\n",
+			      pdev->slot->kobj.name);
+		return NULL;
+	}
+
 	list_for_each_entry(ndev_ctx, &netvsc_dev_list, list) {
 		if (!ndev_ctx->vf_alloc)
 			continue;
 
-		if (ndev_ctx->vf_serial == pdev->slot->number)
+		if (ndev_ctx->vf_serial == serial)
 			return hv_get_drvdata(ndev_ctx->device_ctx);
 	}
 
 	netdev_notice(vf_netdev,
-		      "no netdev found for slot %u\n", pdev->slot->number);
+		      "no netdev found for vf serial:%u\n", serial);
 	return NULL;
 }
 
-- 
2.18.0

^ permalink raw reply related

* Re: [PATCH bpf-next 2/2] bpf, libbpf: simplify perf RB walk and do incremental updates
From: Daniel Borkmann @ 2018-10-12 13:30 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: alexei.starovoitov, netdev
In-Reply-To: <4258c8c5-f18e-65ba-cb14-4a2f2e3187cc@iogearbox.net>

On 10/12/2018 10:39 AM, Daniel Borkmann wrote:
> On 10/12/2018 05:04 AM, Jakub Kicinski wrote:
>> On Thu, 11 Oct 2018 16:02:07 +0200, Daniel Borkmann wrote:
>>> Clean up and improve bpf_perf_event_read_simple() ring walk a bit
>>> to use similar tail update scheme as in perf and bcc allowing the
>>> kernel to make forward progress not only after full timely walk.
>>
>> The extra memory barriers won't impact performance?  If I read the code
>> correctly we now have:
>>
>> 	while (bla) {
>> 		head = HEAD
>> 		rmb()
>>
>> 		...
>>
>> 		mb()
>> 		TAIL = tail
>> 	}
>>
>> Would it make sense to try to piggy back on the mb() for head re-read
>> at least?  Perhaps that's a non-issue, just wondering.
> 
> From the scheme specified in the comment in prior patch my understanding
> would be that they don't pair (see B and C) so there would be no guarantee
> that load of head would happen before load of data. Fwiw, I've been using
> the exact same semantics as user space perf tool walks the perf mmap'ed
> ring buffer (tools/perf/util/mmap.{h,c}) here. Given kernel doesn't stop

On that note, I'll also respin, after some clarification with PeterZ on
why perf is using {rmb,mb}() barriers today as opposed to more lightweight
smp_{rmb,mb}() ones it turns out there is no real reason other than
historic one and perf can be changed and made more efficient as well. ;)

> pushing into ring buffer while user space walks it and indicates how far
> it has consumed data via tail update, it would allow for making room
> successively and not only after full run has complete, so we don't make
> any assumptions in the generic libbpf library helper on how slow/quick
> the callback would be processing resp. how full ring is, etc, and kernel
> pushing new data can be processed in the same run if necessary. One thing
> we could consider is to batch tail updates, say, every 8 elements and a
> final update once we break out walking the ring; probably okay'ish as a
> heuristic..
> 
>>> Also few other improvements to use realloc() instead of free() and
>>> malloc() combination and for the callback use proper perf_event_header
>>> instead of void pointer, so that real applications can use container_of()
>>> macro with proper type checking.
>>
>> FWIW the free() + malloc() was to avoid the the needless copy of the
>> previous event realloc() may do.  It makes sense to use realloc()
>> especially if you want to put extra info in front of the buffer, just
>> sayin' it wasn't a complete braino ;)
> 
> No strong preference from my side, I'd think that it might be sensible in
> any case from applications to call the bpf_perf_event_read_simple() with a
> already preallocated buffer, depending on the expected max element size from
> BPF could e.g. be a buffer of 1 page or so. Given 512 byte stack space from
> the BPF prog and MTU 1500 this would more than suffice to avoid new
> allocations altogether. Anyway, given we only grow the new memory area I
> did some testing on realloc() with an array of pointers to prior malloc()'ed
> buffers, running randomly 10M realloc()s to increase size over them and
> saw <1% where area had to be moved, so we're hitting corner case of a corner
> case, I'm also ok to leave the combination, though. :)
> 
> Thanks,
> Daniel

^ permalink raw reply

* [RFC-resend 0/2] compat: in_compat_syscall() differs on x86
From: Dmitry Safonov @ 2018-10-12 13:42 UTC (permalink / raw)
  To: linux-kernel
  Cc: Dmitry Safonov, Dmitry Safonov, Ard Biesheuvel, Andy Lutomirsky,
	David S. Miller, Herbert Xu, H. Peter Anvin, Ingo Molnar,
	John Stultz, Kirill A. Shutemov, Oleg Nesterov, Steffen Klassert,
	Stephen Boyd, Steven Rostedt, Thomas Gleixner, x86, linux-efi,
	netdev

Reading xfrm (ipsec) code I've found such code:

: #ifdef CONFIG_COMPAT
:         if (in_compat_syscall())
:                 return -EOPNOTSUPP;
: #endif

While I can read that it's false on native i386, it's a bit misleading
and in result it's better to introduce a helper for that.
Grepping other code, I've found that there are already such helpers.
And the uniq behavior of in_compat_syscall() on x86 is disturbing.

Adjusting it to generic with the following..

(on the first non-resend RFC I managed to forget Cc'ing Andy..
 sorry about that, was sure I did add him).

Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Andy Lutomirsky <luto@kernel.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: Stephen Boyd <sboyd@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: x86@kernel.org
Cc: linux-efi@vger.kernel.org
Cc: netdev@vger.kernel.org
Cc: Dmitry Safonov <0x7f454c46@gmail.com>

Dmitry Safonov (2):
  x86/compat: Adjust in_compat_syscall() to generic code under !COMPAT
  compat: Cleanup in_compat_syscall() callers

 arch/x86/include/asm/compat.h  |  9 ++++++++-
 arch/x86/include/asm/ftrace.h  |  4 +---
 arch/x86/kernel/process_64.c   |  4 ++--
 arch/x86/kernel/sys_x86_64.c   | 11 ++++++-----
 arch/x86/mm/hugetlbpage.c      |  4 ++--
 arch/x86/mm/mmap.c             |  2 +-
 drivers/firmware/efi/efivars.c | 16 ++++------------
 include/linux/compat.h         |  4 ++--
 kernel/time/time.c             |  2 +-
 net/xfrm/xfrm_state.c          |  2 --
 net/xfrm/xfrm_user.c           |  2 --
 11 files changed, 27 insertions(+), 33 deletions(-)

-- 
2.13.6

^ permalink raw reply

* [RFC-resend 1/2] x86/compat: Adjust in_compat_syscall() to generic code under !COMPAT
From: Dmitry Safonov @ 2018-10-12 13:42 UTC (permalink / raw)
  To: linux-kernel
  Cc: Dmitry Safonov, Dmitry Safonov, Ard Biesheuvel, Andy Lutomirsky,
	David S. Miller, Herbert Xu, H. Peter Anvin, Ingo Molnar,
	John Stultz, Kirill A. Shutemov, Oleg Nesterov, Steffen Klassert,
	Stephen Boyd, Steven Rostedt, Thomas Gleixner, x86, linux-efi,
	netdev
In-Reply-To: <20181012134253.23266-1-dima@arista.com>

The result of in_compat_syscall() can be pictured as:

x86 platform:
    ---------------------------------------------------
    |  Arch\syscall  |  64-bit  |   ia32   |   x32    |
    |-------------------------------------------------|
    |     x86_64     |  false   |   true   |   true   |
    |-------------------------------------------------|
    |      i686      |          |  <true>  |          |
    ---------------------------------------------------

Other platforms:
    -------------------------------------------
    |  Arch\syscall  |  64-bit  |   compat    |
    |-----------------------------------------|
    |     64-bit     |  false   |    true     |
    |-----------------------------------------|
    |    32-bit(?)   |          |   <false>   |
    -------------------------------------------

As it seen, the result of in_compat_syscall() on generic 32-bit platform
differs from i686.

There is no reason for in_compat_syscall() == true on native i686.
It also easy to misread code if the result on native 32-bit platform
differs between arches.
Because of that non arch-specific code has many places with:
    if (IS_ENABLED(CONFIG_COMPAT) && in_compat_syscall())
in different variations.

It looks-like the only non-x86 code which uses in_compat_syscall() not
under CONFIG_COMPAT guard is in amd/amdkfd. But according to
the commit a18069c132cb ("amdkfd: Disable support for 32-bit user
processes"), it actually should be disabled on native i686.

Rename in_compat_syscall() to in_32bit_syscall() for x86-specific code
and make in_compat_syscall() false under !CONFIG_COMPAT.

With a following patch I'll clean generic users which were forced
to check IS_ENABLED(CONFIG_COMPAT) with in_compat_syscall().

Signed-off-by: Dmitry Safonov <dima@arista.com>
---
 arch/x86/include/asm/compat.h |  9 ++++++++-
 arch/x86/include/asm/ftrace.h |  4 +---
 arch/x86/kernel/process_64.c  |  4 ++--
 arch/x86/kernel/sys_x86_64.c  | 11 ++++++-----
 arch/x86/mm/hugetlbpage.c     |  4 ++--
 arch/x86/mm/mmap.c            |  2 +-
 include/linux/compat.h        |  4 ++--
 7 files changed, 22 insertions(+), 16 deletions(-)

diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index fb97cf7c4137..626bcf1d037d 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -232,11 +232,18 @@ static inline bool in_x32_syscall(void)
 	return false;
 }
 
-static inline bool in_compat_syscall(void)
+static inline bool in_32bit_syscall(void)
 {
 	return in_ia32_syscall() || in_x32_syscall();
 }
+
+#ifdef CONFIG_COMPAT
+static inline bool in_compat_syscall(void)
+{
+	return in_32bit_syscall();
+}
 #define in_compat_syscall in_compat_syscall	/* override the generic impl */
+#endif
 
 struct compat_siginfo;
 int __copy_siginfo_to_user32(struct compat_siginfo __user *to,
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index c18ed65287d5..cf350639e76d 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -76,9 +76,7 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name
 #define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS 1
 static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs)
 {
-	if (in_compat_syscall())
-		return true;
-	return false;
+	return in_32bit_syscall();
 }
 #endif /* CONFIG_FTRACE_SYSCALLS && CONFIG_IA32_EMULATION */
 #endif /* !COMPILE_OFFSETS */
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ea5ea850348d..f96d9ee80086 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -573,10 +573,10 @@ static void __set_personality_x32(void)
 		current->mm->context.ia32_compat = TIF_X32;
 	current->personality &= ~READ_IMPLIES_EXEC;
 	/*
-	 * in_compat_syscall() uses the presence of the x32 syscall bit
+	 * in_32bit_syscall() uses the presence of the x32 syscall bit
 	 * flag to determine compat status.  The x86 mmap() code relies on
 	 * the syscall bitness so set x32 syscall bit right here to make
-	 * in_compat_syscall() work during exec().
+	 * in_32bit_syscall() work during exec().
 	 *
 	 * Pretend to come from a x32 execve.
 	 */
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 6a78d4b36a79..f7476ce23b6e 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -105,7 +105,7 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
 static void find_start_end(unsigned long addr, unsigned long flags,
 		unsigned long *begin, unsigned long *end)
 {
-	if (!in_compat_syscall() && (flags & MAP_32BIT)) {
+	if (!in_32bit_syscall() && (flags & MAP_32BIT)) {
 		/* This is usually used needed to map code in small
 		   model, so it needs to be in the first 31bit. Limit
 		   it to that.  This means we need to move the
@@ -122,7 +122,7 @@ static void find_start_end(unsigned long addr, unsigned long flags,
 	}
 
 	*begin	= get_mmap_base(1);
-	if (in_compat_syscall())
+	if (in_32bit_syscall())
 		*end = task_size_32bit();
 	else
 		*end = task_size_64bit(addr > DEFAULT_MAP_WINDOW);
@@ -193,7 +193,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 		return addr;
 
 	/* for MAP_32BIT mappings we force the legacy mmap base */
-	if (!in_compat_syscall() && (flags & MAP_32BIT))
+	if (!in_32bit_syscall() && (flags & MAP_32BIT))
 		goto bottomup;
 
 	/* requesting a specific address */
@@ -217,9 +217,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 	 * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
 	 * in the full address space.
 	 *
-	 * !in_compat_syscall() check to avoid high addresses for x32.
+	 * !in_32bit_syscall() check to avoid high addresses for x32
+	 * (and make it no op on native i386).
 	 */
-	if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall())
+	if (addr > DEFAULT_MAP_WINDOW && !in_32bit_syscall())
 		info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW;
 
 	info.align_mask = 0;
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 00b296617ca4..92e4c4b85bba 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -92,7 +92,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
 	 * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
 	 * in the full address space.
 	 */
-	info.high_limit = in_compat_syscall() ?
+	info.high_limit = in_32bit_syscall() ?
 		task_size_32bit() : task_size_64bit(addr > DEFAULT_MAP_WINDOW);
 
 	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
@@ -116,7 +116,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
 	 * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
 	 * in the full address space.
 	 */
-	if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall())
+	if (addr > DEFAULT_MAP_WINDOW && !in_32bit_syscall())
 		info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW;
 
 	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 1e95d57760cf..db3165714521 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -166,7 +166,7 @@ unsigned long get_mmap_base(int is_legacy)
 	struct mm_struct *mm = current->mm;
 
 #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
-	if (in_compat_syscall()) {
+	if (in_32bit_syscall()) {
 		return is_legacy ? mm->mmap_compat_legacy_base
 				 : mm->mmap_compat_base;
 	}
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 1a3c4f37e908..ac643a2f5f4b 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -1033,9 +1033,9 @@ int kcompat_sys_fstatfs64(unsigned int fd, compat_size_t sz,
 #else /* !CONFIG_COMPAT */
 
 #define is_compat_task() (0)
-#ifndef in_compat_syscall
+/* Ensure no one redefines in_compat_syscall() under !CONFIG_COMPAT */
+#define in_compat_syscall in_compat_syscall
 static inline bool in_compat_syscall(void) { return false; }
-#endif
 
 #endif /* CONFIG_COMPAT */
 
-- 
2.13.6

^ permalink raw reply related

* [RFC-resend 2/2] compat: Cleanup in_compat_syscall() callers
From: Dmitry Safonov @ 2018-10-12 13:42 UTC (permalink / raw)
  To: linux-kernel
  Cc: Dmitry Safonov, Dmitry Safonov, Ard Biesheuvel, Andy Lutomirsky,
	David S. Miller, Herbert Xu, H. Peter Anvin, Ingo Molnar,
	John Stultz, Kirill A. Shutemov, Oleg Nesterov, Steffen Klassert,
	Stephen Boyd, Steven Rostedt, Thomas Gleixner, x86, linux-efi,
	netdev
In-Reply-To: <20181012134253.23266-1-dima@arista.com>

Now that in_compat_syscall() == false on native i686, it's possible to
remove some ifdeffery and no more needed helpers.

Signed-off-by: Dmitry Safonov <dima@arista.com>
---
 drivers/firmware/efi/efivars.c | 16 ++++------------
 kernel/time/time.c             |  2 +-
 net/xfrm/xfrm_state.c          |  2 --
 net/xfrm/xfrm_user.c           |  2 --
 4 files changed, 5 insertions(+), 17 deletions(-)

diff --git a/drivers/firmware/efi/efivars.c b/drivers/firmware/efi/efivars.c
index 3e626fd9bd4e..8061667a6765 100644
--- a/drivers/firmware/efi/efivars.c
+++ b/drivers/firmware/efi/efivars.c
@@ -229,14 +229,6 @@ sanity_check(struct efi_variable *var, efi_char16_t *name, efi_guid_t vendor,
 	return 0;
 }
 
-static inline bool is_compat(void)
-{
-	if (IS_ENABLED(CONFIG_COMPAT) && in_compat_syscall())
-		return true;
-
-	return false;
-}
-
 static void
 copy_out_compat(struct efi_variable *dst, struct compat_efi_variable *src)
 {
@@ -263,7 +255,7 @@ efivar_store_raw(struct efivar_entry *entry, const char *buf, size_t count)
 	u8 *data;
 	int err;
 
-	if (is_compat()) {
+	if (in_compat_syscall()) {
 		struct compat_efi_variable *compat;
 
 		if (count != sizeof(*compat))
@@ -324,7 +316,7 @@ efivar_show_raw(struct efivar_entry *entry, char *buf)
 			     &entry->var.DataSize, entry->var.Data))
 		return -EIO;
 
-	if (is_compat()) {
+	if (in_compat_syscall()) {
 		compat = (struct compat_efi_variable *)buf;
 
 		size = sizeof(*compat);
@@ -418,7 +410,7 @@ static ssize_t efivar_create(struct file *filp, struct kobject *kobj,
 	struct compat_efi_variable *compat = (struct compat_efi_variable *)buf;
 	struct efi_variable *new_var = (struct efi_variable *)buf;
 	struct efivar_entry *new_entry;
-	bool need_compat = is_compat();
+	bool need_compat = in_compat_syscall();
 	efi_char16_t *name;
 	unsigned long size;
 	u32 attributes;
@@ -495,7 +487,7 @@ static ssize_t efivar_delete(struct file *filp, struct kobject *kobj,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
 
-	if (is_compat()) {
+	if (in_compat_syscall()) {
 		if (count != sizeof(*compat))
 			return -EINVAL;
 
diff --git a/kernel/time/time.c b/kernel/time/time.c
index ccdb351277ee..4638e8cb6378 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -863,7 +863,7 @@ int get_timespec64(struct timespec64 *ts,
 	ts->tv_sec = kts.tv_sec;
 
 	/* Zero out the padding for 32 bit systems or in compat mode */
-	if (IS_ENABLED(CONFIG_64BIT_TIME) && (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall()))
+	if (IS_ENABLED(CONFIG_64BIT_TIME) && in_compat_syscall())
 		kts.tv_nsec &= 0xFFFFFFFFUL;
 
 	ts->tv_nsec = kts.tv_nsec;
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index b669262682c9..dc4a9f1fb941 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -2077,10 +2077,8 @@ int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen
 	struct xfrm_mgr *km;
 	struct xfrm_policy *pol = NULL;
 
-#ifdef CONFIG_COMPAT
 	if (in_compat_syscall())
 		return -EOPNOTSUPP;
-#endif
 
 	if (!optval && !optlen) {
 		xfrm_sk_policy_insert(sk, XFRM_POLICY_IN, NULL);
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index df7ca2dabc48..c3aedf8a99ff 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -2621,10 +2621,8 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
 	const struct xfrm_link *link;
 	int type, err;
 
-#ifdef CONFIG_COMPAT
 	if (in_compat_syscall())
 		return -EOPNOTSUPP;
-#endif
 
 	type = nlh->nlmsg_type;
 	if (type > XFRM_MSG_MAX)
-- 
2.13.6

^ permalink raw reply related

* Re: Possible bug in traffic control?
From: Josh Coombs @ 2018-10-12 13:59 UTC (permalink / raw)
  To: Cong Wang; +Cc: netdev
In-Reply-To: <CACcUnf-HTqht3ogcd4YKdWxe0HvFqo7OYRR8SmkCGWe5d8zDHw@mail.gmail.com>

I've been able to narrow the scope down, the issue is with macsec
itself.  I setup two hosts with a macsec link between them, and let a
couple iperf3 sessions blast traffic across.  At approximately 4.2
billion packets / 6TB data transferred one end stopped transmitting
packets.  Doing a tcpdump on the impacted node's macsec0 interface
shows packets coming in from the remote node, in this case arp
requests, and arp replies from the local host, but watching the
interface counters for macsec0 no packets are being recorded as
transmitting.  Again, nothing in dmesg implying an error.

Deleting the macsec interface via ip link delete macsec0 and
re-creating it gets traffic flowing again without a reboot.

Meanwhile my traffic control bridge without macsec has shuffled 19TB
via 22 billion packets and not skipped a beat, so it appears my
initial assumption of it being the culprit was wrong.

To replicate, setup two hosts with a direct ethernet link between each other.
- Bring up macsec between the two hosts, setup a dedicated /30 on the link.
- Use iperf3 or another traffic generating tool over the /30, one
session for each direction.
- Wait for traffic to stop.

My test bed is on Ubuntu Server 18.04 currently, kernel 4.15.0-36.
I'm going to spin up a vanilla kernel on 4.15 and then -current to see
if this is an Ubuntu-ism from their patches, specific to 4.15, or a
general issue with macsec.

The script I used on each host (keys, rxmacs and IPs updated as appropriate):
#!/bin/bash

# Interfaces:
# dif = Egress physical interface (Dest)
# eif = Encrypted interface
dif=ens224
eif=macsec0

# MACSec Keys:
# txkey = Transmit (Local) key
# rxkey = Receive (Remote) key
# rxmac = Receive (Remote) MAC addy
txkey=60995924232808431491190820961556
rxkey=87345530111733181210202106249824
rxmac=00:0c:29:c5:95:df

# Clear any existing IP config
ifconfig $dif 0.0.0.0

# Bring up macsec:
echo "* Enable MACSec"
modprobe macsec
ip link add link "$dif" "$eif" type macsec
ip macsec add "$eif" tx sa 0 pn 1 on key 02 "$txkey"
ip macsec add "$eif" rx address "$rxmac" port 1
ip macsec add "$eif" rx address "$rxmac" port 1 sa 0 pn 1 on key 01 "$rxkey"
ip link set "$eif" type macsec encrypt on

# Bring up the interfaces:
echo "* Light tunnel NICS"
ip link set "$dif" up
ip link set "$eif" up

# Set IP
ifconfig $eif 192.168.211.1/30

echo " --=[ MACSec Up ]=--"
On Thu, Oct 11, 2018 at 10:05 AM Josh Coombs <jcoombs@staff.gwi.net> wrote:
>
> I'm actually leaning towards macsec now.  I'm at 6TB transferred in a
> double hop, no macsec over the bridge setup without triggering the
> fault.  I'm going to let it continue to churn and setup a second
> testbed that JUST uses macsec without traffic control bridging to see
> if I can trip the issue there.    That should determine if it's macsec
> itself, or an interaction between macsec and traffic control.
>
> Joshua Coombs
> GWI
>
> office 207-494-2140
> www.gwi.net
>
> On Wed, Oct 10, 2018 at 12:39 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > On Wed, Oct 10, 2018 at 8:54 AM Josh Coombs <jcoombs@staff.gwi.net> wrote:
> > >
> > > 2.3 billion 1 byte packets failed to re-create the bug.  To try and
> > > simplify the setup I removed macsec from the equation, using a single
> > > host in the middle as the bridge.  Interestingly, rather than 1.3Gbits
> > > a second in both directions, it ran around 8Mbits a second.  Switching
> > > the filter from u32 to matchall didn't change the performance.  Going
> > > back to the four machine test bed, again removing macsec and just
> > > bridging through radically decreased the throughput to around 8Mbits.
> > > Flip on macsec for the bridge and 1.3Gbits?
> >
> > This is a great narrow down! We can rule out macsec for guilty.
> >
> > Can you share a minimum reproducer for this problem? If so I can take
> > a look.

^ permalink raw reply

* Re: [PATCH] igb: shorten maximum PHC timecounter update interval
From: Richard Cochran @ 2018-10-12 14:05 UTC (permalink / raw)
  To: Miroslav Lichvar; +Cc: intel-wired-lan, netdev, Jacob Keller, Thomas Gleixner
In-Reply-To: <20181012111339.1361-1-mlichvar@redhat.com>

On Fri, Oct 12, 2018 at 01:13:39PM +0200, Miroslav Lichvar wrote:
> Since commit 500462a9d ("timers: Switch to a non-cascading wheel"),
> scheduling of delayed work seems to be less accurate and a requested
> delay of 540 seconds may actually be longer than 550 seconds. Shorten
> the delay to 480 seconds to be sure the timecounter is updated in time.

Good catch.  This timer wheel change will affect other, similar
drivers.  Guess I'll go through and adjust their timeouts, too.

Thanks,
Richard

^ permalink raw reply

* Re: [PATCH] igb: shorten maximum PHC timecounter update interval
From: Richard Cochran @ 2018-10-12 14:08 UTC (permalink / raw)
  To: Miroslav Lichvar; +Cc: intel-wired-lan, netdev, Jacob Keller, Thomas Gleixner
In-Reply-To: <20181012111339.1361-1-mlichvar@redhat.com>

On Fri, Oct 12, 2018 at 01:13:39PM +0200, Miroslav Lichvar wrote:
> This fixes an issue with HW timestamps on 82580/I350/I354 being off by
> ~1100 seconds for few seconds every ~9 minutes.

This patch should go to the stable trees starting with v4.8.

Thanks,
Richard

^ permalink raw reply

* [PATCH net] ipv6: rate-limit probes for neighbourless routes
From: Sabrina Dubroca @ 2018-10-12 14:22 UTC (permalink / raw)
  To: netdev; +Cc: Stefano Brivio, Sabrina Dubroca

When commit 270972554c91 ("[IPV6]: ROUTE: Add Router Reachability
Probing (RFC4191).") introduced router probing, the rt6_probe() function
required that a neighbour entry existed. This neighbour entry is used to
record the timestamp of the last probe via the ->updated field.

Later, commit 2152caea7196 ("ipv6: Do not depend on rt->n in rt6_probe().")
removed the requirement for a neighbour entry. Neighbourless routes skip
the interval check and are not rate-limited.

This patch adds rate-limiting for neighbourless routes, by recording the
timestamp of the last probe in the fib6_info itself.

Fixes: 2152caea7196 ("ipv6: Do not depend on rt->n in rt6_probe().")
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Reviewed-by: Stefano Brivio <sbrivio@redhat.com>
---
 include/net/ip6_fib.h |  4 ++++
 net/ipv6/route.c      | 12 ++++++------
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 3d4930528db0..2d31e22babd8 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -159,6 +159,10 @@ struct fib6_info {
 	struct rt6_info * __percpu	*rt6i_pcpu;
 	struct rt6_exception_bucket __rcu *rt6i_exception_bucket;
 
+#ifdef CONFIG_IPV6_ROUTER_PREF
+	unsigned long			last_probe;
+#endif
+
 	u32				fib6_metric;
 	u8				fib6_protocol;
 	u8				fib6_type;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index a366c05a239d..abcb5ae77319 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -520,10 +520,11 @@ static void rt6_probe_deferred(struct work_struct *w)
 
 static void rt6_probe(struct fib6_info *rt)
 {
-	struct __rt6_probe_work *work;
+	struct __rt6_probe_work *work = NULL;
 	const struct in6_addr *nh_gw;
 	struct neighbour *neigh;
 	struct net_device *dev;
+	struct inet6_dev *idev;
 
 	/*
 	 * Okay, this does not seem to be appropriate
@@ -539,15 +540,12 @@ static void rt6_probe(struct fib6_info *rt)
 	nh_gw = &rt->fib6_nh.nh_gw;
 	dev = rt->fib6_nh.nh_dev;
 	rcu_read_lock_bh();
+	idev = __in6_dev_get(dev);
 	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
 	if (neigh) {
-		struct inet6_dev *idev;
-
 		if (neigh->nud_state & NUD_VALID)
 			goto out;
 
-		idev = __in6_dev_get(dev);
-		work = NULL;
 		write_lock(&neigh->lock);
 		if (!(neigh->nud_state & NUD_VALID) &&
 		    time_after(jiffies,
@@ -557,11 +555,13 @@ static void rt6_probe(struct fib6_info *rt)
 				__neigh_set_probe_once(neigh);
 		}
 		write_unlock(&neigh->lock);
-	} else {
+	} else if (time_after(jiffies, rt->last_probe +
+				       idev->cnf.rtr_probe_interval)) {
 		work = kmalloc(sizeof(*work), GFP_ATOMIC);
 	}
 
 	if (work) {
+		rt->last_probe = jiffies;
 		INIT_WORK(&work->work, rt6_probe_deferred);
 		work->target = *nh_gw;
 		dev_hold(dev);
-- 
2.19.1

^ permalink raw reply related

* Re: [PATCH] rxrpc: add IPV6 dependency
From: David Howells @ 2018-10-12 14:25 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: dhowells, David S. Miller, linux-afs, linux-kernel, netdev
In-Reply-To: <20181012123624.2001922-1-arnd@arndb.de>

Arnd Bergmann <arnd@arndb.de> wrote:

> +	depends on IPV6 || !IPV6

That looks weird.  It looks like it always ought to be true.

David

^ permalink raw reply

* Re: netconsole warning in 4.19.0-rc7
From: Dave Jones @ 2018-10-12 14:32 UTC (permalink / raw)
  To: Cong Wang; +Cc: Meelis Roos, LKML, Linux Kernel Network Developers
In-Reply-To: <CAM_iQpU0+LVwgMea32KQWf8z6Bp1VTmWOGnWNwNSmV0PZzf_ew@mail.gmail.com>

On Wed, Oct 10, 2018 at 10:34:49PM -0700, Cong Wang wrote:
 > (Cc'ing Dave)
 > 
 > On Wed, Oct 10, 2018 at 5:14 AM Meelis Roos <mroos@linux.ee> wrote:
 > >
 > > Thies 4.19-rc7 on a bunch of test machines and got this warning from one.
 > > It is reproducible and I have not noticed it before.
 > >
 > [...]
 > > [    9.914805] WARNING: CPU: 0 PID: 0 at kernel/softirq.c:168 __local_bh_enable_ip+0x2e/0x44
 > > [    9.914806] Modules linked in:
 > > [    9.914808] CPU: 0 PID: 0 Comm: swapper Not tainted 4.19.0-rc7 #210
 > > [    9.914810] Hardware name: MicroLink                       /D850MV                         , BIOS MV85010A.86A.0067.P24.0304081124 04/08/2003
 > > [    9.914811] EIP: __local_bh_enable_ip+0x2e/0x44
 > > [    9.914813] Code: cc 02 5f c8 a9 00 00 0f 00 75 1f 83 ea 01 f7 da 01 15 cc 02 5f c8 a1 cc 02 5f c8 a9 00 ff 1f 00 74 0c ff 0d cc 02 5f c8 5d c3 <0f> 0b eb dd 66 a1 80 cd 5e c8 66 85 c0 74 e9 e8 87 ff ff ff eb e2
 > > [    9.914814] EAX: 80010200 EBX: f602b000 ECX: 36346270 EDX: 00000200
 > > [    9.914815] ESI: f620ecc0 EDI: f620ebac EBP: f600de40 ESP: f600de40
 > > [    9.914816] DS: 007b ES: 007b FS: 0000 GS: 00e0 SS: 0068 EFLAGS: 00010006
 > > [    9.914817] CR0: 80050033 CR2: b7f5f000 CR3: 36389000 CR4: 000006d0
 > > [    9.914818] Call Trace:
 > > [    9.914819]  <IRQ>
 > > [    9.914820]  netpoll_send_skb_on_dev+0xa5/0x1b0
 > 
 > This is exactly what I mentioned in my review here:
 > https://marc.info/?l=linux-netdev&m=153816136624679&w=2
 > 
 > "But irq is disabled here, so not sure if rcu_read_lock_bh()
 > could cause trouble... "

ugh, what a mess.
I'm travelling right now so not going to get to look into this more
for a week or so.  Unless someone has a quick-fix, should we revert ?
We've traded one warning for another, which doesn't really feel like
progress.

	Dave

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox