Netdev List
 help / color / mirror / Atom feed
* [patch net-next RFC 03/12] devlink: introduce a helper to generate physical port names
From: Jiri Pirko @ 2018-03-22 10:55 UTC (permalink / raw)
  To: netdev
  Cc: davem, idosch, jakub.kicinski, mlxsw, andrew, vivien.didelot,
	f.fainelli, michael.chan, ganeshgr, saeedm, simon.horman,
	pieter.jansenvanvuuren, john.hurley, dirk.vandermerwe,
	alexander.h.duyck, ogerlitz, dsahern, vijaya.guvva,
	satananda.burla, raghu.vatsavayi, felix.manlunas, gospo,
	sathya.perla, vasundhara-v.volam, tariqt, eranbe,
	jeffrey.t.kirsher
In-Reply-To: <20180322105522.8186-1-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

Each driver implements physical port name generation by itself. However
as devlink has all needed info, it can easily do the job for all its
users. So implement this helper in devlink.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 include/net/devlink.h |  9 +++++++++
 net/core/devlink.c    | 39 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+)

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 900295afc521..2e5bfe7723b4 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -384,6 +384,8 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port,
 			    enum devlink_port_flavour flavour,
 			    u32 port_number, bool split,
 			    u32 split_subport_number);
+int devlink_port_get_phys_port_name(struct devlink_port *devlink_port,
+				    char *name, size_t len);
 int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
 			u32 size, u16 ingress_pools_count,
 			u16 egress_pools_count, u16 ingress_tc_count,
@@ -484,6 +486,13 @@ static inline void devlink_port_attrs_set(struct devlink_port *devlink_port,
 {
 }
 
+static inline int
+devlink_port_get_phys_port_name(struct devlink_port *devlink_port,
+				char *name, size_t len)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline int devlink_sb_register(struct devlink *devlink,
 				      unsigned int sb_index, u32 size,
 				      u16 ingress_pools_count,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 782476a1ff8f..4ba69383ab58 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -3017,6 +3017,45 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port,
 }
 EXPORT_SYMBOL_GPL(devlink_port_attrs_set);
 
+int devlink_port_get_phys_port_name(struct devlink_port *devlink_port,
+				    char *name, size_t len)
+{
+	struct devlink_port_attrs *attrs = &devlink_port->attrs;
+	int n = 0;
+
+	if (!attrs->set)
+		return -EOPNOTSUPP;
+
+	switch (attrs->flavour) {
+	case DEVLINK_PORT_FLAVOUR_PHYSICAL:
+		if (!attrs->split)
+			n = snprintf(name, len, "p%u", attrs->port_number);
+		else
+			n = snprintf(name, len, "p%us%u", attrs->port_number,
+				     attrs->split_subport_number);
+		break;
+	case DEVLINK_PORT_FLAVOUR_PF_REP:
+		n = snprintf(name, len, "pf%d", attrs->port_number);
+		break;
+	case DEVLINK_PORT_FLAVOUR_VF_REP:
+		n = snprintf(name, len, "vf%d", attrs->port_number);
+		break;
+	case DEVLINK_PORT_FLAVOUR_CPU:
+	case DEVLINK_PORT_FLAVOUR_DSA:
+		/* As CPU and DSA ports do not have a netdevice associated
+		 * case should not ever happen.
+		 */
+		WARN_ON(1);
+		return -EINVAL;
+	}
+
+	if (n >= len)
+		return -EINVAL;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_port_get_phys_port_name);
+
 int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
 			u32 size, u16 ingress_pools_count,
 			u16 egress_pools_count, u16 ingress_tc_count,
-- 
2.14.3

^ permalink raw reply related

* [patch net-next RFC 02/12] devlink: extend attrs_set for setting port flavours
From: Jiri Pirko @ 2018-03-22 10:55 UTC (permalink / raw)
  To: netdev
  Cc: davem, idosch, jakub.kicinski, mlxsw, andrew, vivien.didelot,
	f.fainelli, michael.chan, ganeshgr, saeedm, simon.horman,
	pieter.jansenvanvuuren, john.hurley, dirk.vandermerwe,
	alexander.h.duyck, ogerlitz, dsahern, vijaya.guvva,
	satananda.burla, raghu.vatsavayi, felix.manlunas, gospo,
	sathya.perla, vasundhara-v.volam, tariqt, eranbe,
	jeffrey.t.kirsher
In-Reply-To: <20180322105522.8186-1-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

Devlink ports can have specific flavour according to the purpose of use.
This patch extend attrs_set so the driver can say which flavour port
has. Initial flavours are:
physical, pf_rep, vf_rep, cpu, dsa
User can query this to see right away what is the purpose of each port.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/core.c       |  4 ++--
 drivers/net/ethernet/netronome/nfp/nfp_devlink.c | 12 +++++++++---
 include/net/devlink.h                            |  3 +++
 include/uapi/linux/devlink.h                     | 19 +++++++++++++++++++
 net/core/devlink.c                               |  5 +++++
 5 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index dc924d5fb3b7..0b6e646fed75 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -1721,8 +1721,8 @@ void mlxsw_core_port_eth_set(struct mlxsw_core *mlxsw_core, u8 local_port,
 	struct devlink_port *devlink_port = &mlxsw_core_port->devlink_port;
 
 	mlxsw_core_port->port_driver_priv = port_driver_priv;
-	devlink_port_attrs_set(devlink_port, port_number,
-			       split, split_port_subnumber);
+	devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PHYSICAL,
+			       port_number, split, split_port_subnumber);
 	devlink_port_type_eth_set(devlink_port, dev);
 }
 EXPORT_SYMBOL(mlxsw_core_port_eth_set);
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c
index 3c0f0560f834..e3a46faaadc6 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c
@@ -175,15 +175,21 @@ static int nfp_devlink_port_attrs_set(struct nfp_port *port)
 		if (ret)
 			return ret;
 
-		devlink_port_attrs_set(&port->dl_port, eth_port.label_port,
+		devlink_port_attrs_set(&port->dl_port,
+				       DEVLINK_PORT_FLAVOUR_PHYSICAL,
+				       eth_port.label_port,
 				       eth_port.is_split,
 				       eth_port.label_subport);
 		break;
 	case NFP_PORT_PF_PORT:
-		devlink_port_attrs_set(&port->dl_port, port->pf_id, false, 0);
+		devlink_port_attrs_set(&port->dl_port,
+				       DEVLINK_PORT_FLAVOUR_PF_REP,
+				       port->pf_id, false, 0);
 		break;
 	case NFP_PORT_VF_PORT:
-		devlink_port_attrs_set(&port->dl_port, port->vf_id, false, 0);
+		devlink_port_attrs_set(&port->dl_port,
+				       DEVLINK_PORT_FLAVOUR_VF_REP,
+				       port->vf_id, false, 0);
 		break;
 	default:
 		break;
diff --git a/include/net/devlink.h b/include/net/devlink.h
index 29c3bc260a3e..900295afc521 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -37,6 +37,7 @@ struct devlink {
 
 struct devlink_port_attrs {
 	bool set;
+	enum devlink_port_flavour flavour;
 	u32 port_number; /* same value as "split group" */
 	bool split;
 	u32 split_subport_number;
@@ -380,6 +381,7 @@ void devlink_port_type_ib_set(struct devlink_port *devlink_port,
 			      struct ib_device *ibdev);
 void devlink_port_type_clear(struct devlink_port *devlink_port);
 void devlink_port_attrs_set(struct devlink_port *devlink_port,
+			    enum devlink_port_flavour flavour,
 			    u32 port_number, bool split,
 			    u32 split_subport_number);
 int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
@@ -476,6 +478,7 @@ static inline void devlink_port_type_clear(struct devlink_port *devlink_port)
 }
 
 static inline void devlink_port_attrs_set(struct devlink_port *devlink_port,
+					  enum devlink_port_flavour flavour,
 					  u32 port_number, bool split,
 					  u32 split_subport_number)
 {
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 15b031a5ee7a..74d0e620059b 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -132,6 +132,24 @@ enum devlink_eswitch_encap_mode {
 	DEVLINK_ESWITCH_ENCAP_MODE_BASIC,
 };
 
+enum devlink_port_flavour {
+	DEVLINK_PORT_FLAVOUR_PHYSICAL, /* Any kind of a port physically
+					* facing the user.
+					*/
+	DEVLINK_PORT_FLAVOUR_PF_REP, /* Port represents a SR-IOV physical
+				      *	function counterpart port of
+				      *	embedded switch.
+				      */
+	DEVLINK_PORT_FLAVOUR_VF_REP, /* Port represents a SR-IOV virtual
+				      *	function counterpart port of
+				      *	embedded switch.
+				      */
+	DEVLINK_PORT_FLAVOUR_CPU, /* CPU port */
+	DEVLINK_PORT_FLAVOUR_DSA, /* Distributed switch architecture
+				   * interconnect port.
+				   */
+};
+
 enum devlink_attr {
 	/* don't change the order or add anything between, this is ABI! */
 	DEVLINK_ATTR_UNSPEC,
@@ -224,6 +242,7 @@ enum devlink_attr {
 	DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID,	/* u64 */
 	DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS,/* u64 */
 
+	DEVLINK_ATTR_PORT_FLAVOUR,		/* u16 */
 	DEVLINK_ATTR_PORT_NUMBER,		/* u32 */
 	DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER,	/* u32 */
 
diff --git a/net/core/devlink.c b/net/core/devlink.c
index b0fca9644722..782476a1ff8f 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -460,6 +460,8 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg,
 
 	if (!attrs->set)
 		return 0;
+	if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour))
+		return -EMSGSIZE;
 	if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER, attrs->port_number))
 		return -EMSGSIZE;
 	if (!attrs->split)
@@ -2992,6 +2994,7 @@ EXPORT_SYMBOL_GPL(devlink_port_type_clear);
  *	devlink_port_attrs_set - Set port attributes
  *
  *	@devlink_port: devlink port
+ *	@flavour: flavour of the port
  *	@port_number: number of the port that is facing user, for example
  *	              the front panel port number
  *	@split: indicates if this is split port
@@ -2999,12 +3002,14 @@ EXPORT_SYMBOL_GPL(devlink_port_type_clear);
  *	                       of subport.
  */
 void devlink_port_attrs_set(struct devlink_port *devlink_port,
+			    enum devlink_port_flavour flavour,
 			    u32 port_number, bool split,
 			    u32 split_subport_number)
 {
 	struct devlink_port_attrs *attrs = &devlink_port->attrs;
 
 	attrs->set = true;
+	attrs->flavour = flavour;
 	attrs->port_number = port_number;
 	attrs->split = split;
 	attrs->split_subport_number = split_subport_number;
-- 
2.14.3

^ permalink raw reply related

* [patch net-next RFC 04/12] dsa: set devlink port attrs for dsa ports
From: Jiri Pirko @ 2018-03-22 10:55 UTC (permalink / raw)
  To: netdev
  Cc: davem, idosch, jakub.kicinski, mlxsw, andrew, vivien.didelot,
	f.fainelli, michael.chan, ganeshgr, saeedm, simon.horman,
	pieter.jansenvanvuuren, john.hurley, dirk.vandermerwe,
	alexander.h.duyck, ogerlitz, dsahern, vijaya.guvva,
	satananda.burla, raghu.vatsavayi, felix.manlunas, gospo,
	sathya.perla, vasundhara-v.volam, tariqt, eranbe,
	jeffrey.t.kirsher
In-Reply-To: <20180322105522.8186-1-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

Set the attrs and allow to expose port flavour to user via devlink.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 net/dsa/dsa2.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index adf50fbc4c13..49453690696d 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -270,7 +270,27 @@ static int dsa_port_setup(struct dsa_port *dp)
 	case DSA_PORT_TYPE_UNUSED:
 		break;
 	case DSA_PORT_TYPE_CPU:
+		/* dp->index is used now as port_number. However
+		 * CPU ports should have separate numbering
+		 * independent from front panel port numbers.
+		 */
+		devlink_port_attrs_set(&dp->devlink_port,
+				       DEVLINK_PORT_FLAVOUR_CPU,
+				       dp->index, false, 0);
+		err = dsa_port_link_register_of(dp);
+		if (err) {
+			dev_err(ds->dev, "failed to setup link for port %d.%d\n",
+				ds->index, dp->index);
+			return err;
+		}
 	case DSA_PORT_TYPE_DSA:
+		/* dp->index is used now as port_number. However
+		 * DSA ports should have separate numbering
+		 * independent from front panel port numbers.
+		 */
+		devlink_port_attrs_set(&dp->devlink_port,
+				       DEVLINK_PORT_FLAVOUR_DSA,
+				       dp->index, false, 0);
 		err = dsa_port_link_register_of(dp);
 		if (err) {
 			dev_err(ds->dev, "failed to setup link for port %d.%d\n",
@@ -279,6 +299,9 @@ static int dsa_port_setup(struct dsa_port *dp)
 		}
 		break;
 	case DSA_PORT_TYPE_USER:
+		devlink_port_attrs_set(&dp->devlink_port,
+				       DEVLINK_PORT_FLAVOUR_PHYSICAL,
+				       dp->index, false, 0);
 		err = dsa_slave_create(dp);
 		if (err)
 			dev_err(ds->dev, "failed to create slave for port %d.%d\n",
-- 
2.14.3

^ permalink raw reply related

* [patch net-next RFC 05/12] dsa: use devlink helper to generate physical port name
From: Jiri Pirko @ 2018-03-22 10:55 UTC (permalink / raw)
  To: netdev
  Cc: davem, idosch, jakub.kicinski, mlxsw, andrew, vivien.didelot,
	f.fainelli, michael.chan, ganeshgr, saeedm, simon.horman,
	pieter.jansenvanvuuren, john.hurley, dirk.vandermerwe,
	alexander.h.duyck, ogerlitz, dsahern, vijaya.guvva,
	satananda.burla, raghu.vatsavayi, felix.manlunas, gospo,
	sathya.perla, vasundhara-v.volam, tariqt, eranbe,
	jeffrey.t.kirsher
In-Reply-To: <20180322105522.8186-1-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

Since devlink knows the info needed to generate the physical port name
in a generic way for all devlink users, use the helper to do the job.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 net/dsa/slave.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 18561af7a8f1..8d71dd672e52 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -19,6 +19,7 @@
 #include <net/rtnetlink.h>
 #include <net/pkt_cls.h>
 #include <net/tc_act/tc_mirred.h>
+#include <net/devlink.h>
 #include <linux/if_bridge.h>
 #include <linux/netpoll.h>
 #include <linux/ptp_classify.h>
@@ -727,10 +728,7 @@ static int dsa_slave_get_phys_port_name(struct net_device *dev,
 {
 	struct dsa_port *dp = dsa_slave_to_port(dev);
 
-	if (snprintf(name, len, "p%d", dp->index) >= len)
-		return -EINVAL;
-
-	return 0;
+	return devlink_port_get_phys_port_name(&dp->devlink_port, name, len);
 }
 
 static struct dsa_mall_tc_entry *
-- 
2.14.3

^ permalink raw reply related

* [patch net-next RFC 06/12] mlxsw: use devlink helper to generate physical port name
From: Jiri Pirko @ 2018-03-22 10:55 UTC (permalink / raw)
  To: netdev
  Cc: davem, idosch, jakub.kicinski, mlxsw, andrew, vivien.didelot,
	f.fainelli, michael.chan, ganeshgr, saeedm, simon.horman,
	pieter.jansenvanvuuren, john.hurley, dirk.vandermerwe,
	alexander.h.duyck, ogerlitz, dsahern, vijaya.guvva,
	satananda.burla, raghu.vatsavayi, felix.manlunas, gospo,
	sathya.perla, vasundhara-v.volam, tariqt, eranbe,
	jeffrey.t.kirsher
In-Reply-To: <20180322105522.8186-1-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

Since devlink knows the info needed to generate the physical port name
in a generic way for all devlink users, use the helper to do the job.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/core.c     | 11 +++++++++++
 drivers/net/ethernet/mellanox/mlxsw/core.h     |  2 ++
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 17 +++--------------
 drivers/net/ethernet/mellanox/mlxsw/switchx2.c |  9 +++------
 4 files changed, 19 insertions(+), 20 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index 0b6e646fed75..7a49eb2c8db4 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -1762,6 +1762,17 @@ enum devlink_port_type mlxsw_core_port_type_get(struct mlxsw_core *mlxsw_core,
 }
 EXPORT_SYMBOL(mlxsw_core_port_type_get);
 
+int mlxsw_core_port_get_phys_port_name(struct mlxsw_core *mlxsw_core,
+				       u8 local_port, char *name, size_t len)
+{
+	struct mlxsw_core_port *mlxsw_core_port =
+					&mlxsw_core->ports[local_port];
+	struct devlink_port *devlink_port = &mlxsw_core_port->devlink_port;
+
+	return devlink_port_get_phys_port_name(devlink_port, name, len);
+}
+EXPORT_SYMBOL(mlxsw_core_port_get_phys_port_name);
+
 static void mlxsw_core_buf_dump_dbg(struct mlxsw_core *mlxsw_core,
 				    const char *buf, size_t size)
 {
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h
index 10589abae67f..09703688ea9a 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.h
@@ -209,6 +209,8 @@ void mlxsw_core_port_clear(struct mlxsw_core *mlxsw_core, u8 local_port,
 			   void *port_driver_priv);
 enum devlink_port_type mlxsw_core_port_type_get(struct mlxsw_core *mlxsw_core,
 						u8 local_port);
+int mlxsw_core_port_get_phys_port_name(struct mlxsw_core *mlxsw_core,
+				       u8 local_port, char *name, size_t len);
 
 int mlxsw_core_schedule_dw(struct delayed_work *dwork, unsigned long delay);
 bool mlxsw_core_schedule_work(struct work_struct *work);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 59be0bf14127..64ea94d4ee14 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -1238,21 +1238,10 @@ static int mlxsw_sp_port_get_phys_port_name(struct net_device *dev, char *name,
 					    size_t len)
 {
 	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
-	u8 module = mlxsw_sp_port->mapping.module;
-	u8 width = mlxsw_sp_port->mapping.width;
-	u8 lane = mlxsw_sp_port->mapping.lane;
-	int err;
-
-	if (!mlxsw_sp_port->split)
-		err = snprintf(name, len, "p%d", module + 1);
-	else
-		err = snprintf(name, len, "p%ds%d", module + 1,
-			       lane / width);
 
-	if (err >= len)
-		return -EINVAL;
-
-	return 0;
+	return mlxsw_core_port_get_phys_port_name(mlxsw_sp_port->mlxsw_sp->core,
+						  mlxsw_sp_port->local_port,
+						  name, len);
 }
 
 static struct mlxsw_sp_port_mall_tc_entry *
diff --git a/drivers/net/ethernet/mellanox/mlxsw/switchx2.c b/drivers/net/ethernet/mellanox/mlxsw/switchx2.c
index eddfcef320f1..96d4c073d9d6 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/switchx2.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/switchx2.c
@@ -417,13 +417,10 @@ static int mlxsw_sx_port_get_phys_port_name(struct net_device *dev, char *name,
 					    size_t len)
 {
 	struct mlxsw_sx_port *mlxsw_sx_port = netdev_priv(dev);
-	int err;
-
-	err = snprintf(name, len, "p%d", mlxsw_sx_port->mapping.module + 1);
-	if (err >= len)
-		return -EINVAL;
 
-	return 0;
+	return mlxsw_core_port_get_phys_port_name(mlxsw_sx_port->mlxsw_sx->core,
+						  mlxsw_sx_port->local_port,
+						  name, len);
 }
 
 static const struct net_device_ops mlxsw_sx_port_netdev_ops = {
-- 
2.14.3

^ permalink raw reply related

* [patch net-next RFC 07/12] nfp: flower: fix error path during representor creation
From: Jiri Pirko @ 2018-03-22 10:55 UTC (permalink / raw)
  To: netdev
  Cc: davem, idosch, jakub.kicinski, mlxsw, andrew, vivien.didelot,
	f.fainelli, michael.chan, ganeshgr, saeedm, simon.horman,
	pieter.jansenvanvuuren, john.hurley, dirk.vandermerwe,
	alexander.h.duyck, ogerlitz, dsahern, vijaya.guvva,
	satananda.burla, raghu.vatsavayi, felix.manlunas, gospo,
	sathya.perla, vasundhara-v.volam, tariqt, eranbe,
	jeffrey.t.kirsher
In-Reply-To: <20180322105522.8186-1-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

Don't store repr pointer to reprs array until the representor is
successfully created. This avoids message about "representor
destruction" even when it was never created. Also it cleans-up the flow.
Also, check return value after port alloc.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/netronome/nfp/flower/main.c  | 13 +++++++++++--
 drivers/net/ethernet/netronome/nfp/nfp_net_repr.c |  9 +++++++--
 drivers/net/ethernet/netronome/nfp/nfp_net_repr.h |  1 +
 3 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c b/drivers/net/ethernet/netronome/nfp/flower/main.c
index 742d6f1575b5..aed8df0e9d41 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.c
@@ -247,12 +247,16 @@ nfp_flower_spawn_vnic_reprs(struct nfp_app *app,
 			err = -ENOMEM;
 			goto err_reprs_clean;
 		}
-		RCU_INIT_POINTER(reprs->reprs[i], repr);
 
 		/* For now we only support 1 PF */
 		WARN_ON(repr_type == NFP_REPR_TYPE_PF && i);
 
 		port = nfp_port_alloc(app, port_type, repr);
+		if (IS_ERR(port)) {
+			err = PTR_ERR(port);
+			nfp_repr_free(repr);
+			goto err_reprs_clean;
+		}
 		if (repr_type == NFP_REPR_TYPE_PF) {
 			port->pf_id = i;
 			port->vnic = priv->nn->dp.ctrl_bar;
@@ -271,9 +275,11 @@ nfp_flower_spawn_vnic_reprs(struct nfp_app *app,
 				    port_id, port, priv->nn->dp.netdev);
 		if (err) {
 			nfp_port_free(port);
+			nfp_repr_free(repr);
 			goto err_reprs_clean;
 		}
 
+		RCU_INIT_POINTER(reprs->reprs[i], repr);
 		nfp_info(app->cpp, "%s%d Representor(%s) created\n",
 			 repr_type == NFP_REPR_TYPE_PF ? "PF" : "VF", i,
 			 repr->name);
@@ -344,16 +350,17 @@ nfp_flower_spawn_phy_reprs(struct nfp_app *app, struct nfp_flower_priv *priv)
 			err = -ENOMEM;
 			goto err_reprs_clean;
 		}
-		RCU_INIT_POINTER(reprs->reprs[phys_port], repr);
 
 		port = nfp_port_alloc(app, NFP_PORT_PHYS_PORT, repr);
 		if (IS_ERR(port)) {
 			err = PTR_ERR(port);
+			nfp_repr_free(repr);
 			goto err_reprs_clean;
 		}
 		err = nfp_port_init_phy_port(app->pf, app, port, i);
 		if (err) {
 			nfp_port_free(port);
+			nfp_repr_free(repr);
 			goto err_reprs_clean;
 		}
 
@@ -365,6 +372,7 @@ nfp_flower_spawn_phy_reprs(struct nfp_app *app, struct nfp_flower_priv *priv)
 				    cmsg_port_id, port, priv->nn->dp.netdev);
 		if (err) {
 			nfp_port_free(port);
+			nfp_repr_free(repr);
 			goto err_reprs_clean;
 		}
 
@@ -373,6 +381,7 @@ nfp_flower_spawn_phy_reprs(struct nfp_app *app, struct nfp_flower_priv *priv)
 					     eth_tbl->ports[i].base,
 					     phys_port);
 
+		RCU_INIT_POINTER(reprs->reprs[phys_port], repr);
 		nfp_info(app->cpp, "Phys Port %d Representor(%s) created\n",
 			 phys_port, repr->name);
 	}
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
index 619570524d2a..d98cbc173dca 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
@@ -337,12 +337,17 @@ int nfp_repr_init(struct nfp_app *app, struct net_device *netdev,
 	return err;
 }
 
-static void nfp_repr_free(struct nfp_repr *repr)
+static void __nfp_repr_free(struct nfp_repr *repr)
 {
 	free_percpu(repr->stats);
 	free_netdev(repr->netdev);
 }
 
+void nfp_repr_free(struct net_device *netdev)
+{
+	__nfp_repr_free(netdev_priv(netdev));
+}
+
 struct net_device *nfp_repr_alloc(struct nfp_app *app)
 {
 	struct net_device *netdev;
@@ -374,7 +379,7 @@ static void nfp_repr_clean_and_free(struct nfp_repr *repr)
 	nfp_info(repr->app->cpp, "Destroying Representor(%s)\n",
 		 repr->netdev->name);
 	nfp_repr_clean(repr);
-	nfp_repr_free(repr);
+	__nfp_repr_free(repr);
 }
 
 void nfp_reprs_clean_and_free(struct nfp_app *app, struct nfp_reprs *reprs)
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.h b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.h
index a621e8ff528e..cd756a15445f 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.h
@@ -123,6 +123,7 @@ void nfp_repr_inc_rx_stats(struct net_device *netdev, unsigned int len);
 int nfp_repr_init(struct nfp_app *app, struct net_device *netdev,
 		  u32 cmsg_port_id, struct nfp_port *port,
 		  struct net_device *pf_netdev);
+void nfp_repr_free(struct net_device *netdev);
 struct net_device *nfp_repr_alloc(struct nfp_app *app);
 void nfp_reprs_clean_and_free(struct nfp_app *app, struct nfp_reprs *reprs);
 void nfp_reprs_clean_and_free_by_type(struct nfp_app *app,
-- 
2.14.3

^ permalink raw reply related

* [patch net-next RFC 08/12] nfp: set eth_id for representors to avoid port index conflict
From: Jiri Pirko @ 2018-03-22 10:55 UTC (permalink / raw)
  To: netdev
  Cc: davem, idosch, jakub.kicinski, mlxsw, andrew, vivien.didelot,
	f.fainelli, michael.chan, ganeshgr, saeedm, simon.horman,
	pieter.jansenvanvuuren, john.hurley, dirk.vandermerwe,
	alexander.h.duyck, ogerlitz, dsahern, vijaya.guvva,
	satananda.burla, raghu.vatsavayi, felix.manlunas, gospo,
	sathya.perla, vasundhara-v.volam, tariqt, eranbe,
	jeffrey.t.kirsher
In-Reply-To: <20180322105522.8186-1-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

Incorrect, need to be done differently

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/netronome/nfp/nfp_net_repr.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
index d98cbc173dca..e6445f6707cb 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
@@ -328,6 +328,8 @@ int nfp_repr_init(struct nfp_app *app, struct net_device *netdev,
 	if (err)
 		goto err_repr_clean;
 
+	/* This is incorrect - the id has to be figured out differently */
+	port->eth_id = cmsg_port_id;
 	return 0;
 
 err_repr_clean:
-- 
2.14.3

^ permalink raw reply related

* [patch net-next RFC 09/12] nfp: register devlink port for VF/PF representors
From: Jiri Pirko @ 2018-03-22 10:55 UTC (permalink / raw)
  To: netdev
  Cc: davem, idosch, jakub.kicinski, mlxsw, andrew, vivien.didelot,
	f.fainelli, michael.chan, ganeshgr, saeedm, simon.horman,
	pieter.jansenvanvuuren, john.hurley, dirk.vandermerwe,
	alexander.h.duyck, ogerlitz, dsahern, vijaya.guvva,
	satananda.burla, raghu.vatsavayi, felix.manlunas, gospo,
	sathya.perla, vasundhara-v.volam, tariqt, eranbe,
	jeffrey.t.kirsher
In-Reply-To: <20180322105522.8186-1-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

Drivers should always register devlink port instance for all their
ports. So fix nfp and register devlink port for VF and PF representors.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/netronome/nfp/nfp_net_repr.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
index e6445f6707cb..eff07e9a175d 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
@@ -270,6 +270,8 @@ const struct net_device_ops nfp_repr_netdev_ops = {
 
 static void nfp_repr_clean(struct nfp_repr *repr)
 {
+	if (repr->port)
+		nfp_devlink_port_unregister(repr->port);
 	unregister_netdev(repr->netdev);
 	nfp_app_repr_clean(repr->app, repr->netdev);
 	dst_release((struct dst_entry *)repr->dst);
@@ -330,8 +332,14 @@ int nfp_repr_init(struct nfp_app *app, struct net_device *netdev,
 
 	/* This is incorrect - the id has to be figured out differently */
 	port->eth_id = cmsg_port_id;
+	err = nfp_devlink_port_register(app, port);
+	if (err)
+		goto err_netdev_clean;
+
 	return 0;
 
+err_netdev_clean:
+	unregister_netdev(netdev);
 err_repr_clean:
 	nfp_app_repr_clean(app, netdev);
 err_clean:
-- 
2.14.3

^ permalink raw reply related

* [patch net-next RFC 10/12] nfp: flower: create port for flower vnic
From: Jiri Pirko @ 2018-03-22 10:55 UTC (permalink / raw)
  To: netdev
  Cc: davem, idosch, jakub.kicinski, mlxsw, andrew, vivien.didelot,
	f.fainelli, michael.chan, ganeshgr, saeedm, simon.horman,
	pieter.jansenvanvuuren, john.hurley, dirk.vandermerwe,
	alexander.h.duyck, ogerlitz, dsahern, vijaya.guvva,
	satananda.burla, raghu.vatsavayi, felix.manlunas, gospo,
	sathya.perla, vasundhara-v.volam, tariqt, eranbe,
	jeffrey.t.kirsher
In-Reply-To: <20180322105522.8186-1-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/netronome/nfp/flower/main.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c b/drivers/net/ethernet/netronome/nfp/flower/main.c
index aed8df0e9d41..1890af7e6196 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.c
@@ -427,10 +427,9 @@ static int nfp_flower_vnic_alloc(struct nfp_app *app, struct nfp_net *nn,
 		goto err_invalid_port;
 	}
 
-	eth_hw_addr_random(nn->dp.netdev);
 	netif_keep_dst(nn->dp.netdev);
 
-	return 0;
+	return nfp_app_nic_vnic_alloc(app, nn, id);
 
 err_invalid_port:
 	nn->port = nfp_port_alloc(app, NFP_PORT_INVALID, nn->dp.netdev);
-- 
2.14.3

^ permalink raw reply related

* [patch net-next RFC 11/12] nfp: use devlink helper to generate physical port name
From: Jiri Pirko @ 2018-03-22 10:55 UTC (permalink / raw)
  To: netdev
  Cc: davem, idosch, jakub.kicinski, mlxsw, andrew, vivien.didelot,
	f.fainelli, michael.chan, ganeshgr, saeedm, simon.horman,
	pieter.jansenvanvuuren, john.hurley, dirk.vandermerwe,
	alexander.h.duyck, ogerlitz, dsahern, vijaya.guvva,
	satananda.burla, raghu.vatsavayi, felix.manlunas, gospo,
	sathya.perla, vasundhara-v.volam, tariqt, eranbe,
	jeffrey.t.kirsher
In-Reply-To: <20180322105522.8186-1-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

Since devlink knows the info needed to generate the physical port name
in a generic way for all devlink users, use the helper to do the job.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/netronome/nfp/nfp_port.c | 30 ++-------------------------
 1 file changed, 2 insertions(+), 28 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_port.c b/drivers/net/ethernet/netronome/nfp/nfp_port.c
index 7bd8be5c833b..01dac8533ef6 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_port.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_port.c
@@ -34,6 +34,7 @@
 #include <linux/lockdep.h>
 #include <linux/netdevice.h>
 #include <net/switchdev.h>
+#include <net/devlink.h>
 
 #include "nfpcore/nfp_cpp.h"
 #include "nfpcore/nfp_nsp.h"
@@ -160,40 +161,13 @@ struct nfp_eth_table_port *nfp_port_get_eth_port(struct nfp_port *port)
 int
 nfp_port_get_phys_port_name(struct net_device *netdev, char *name, size_t len)
 {
-	struct nfp_eth_table_port *eth_port;
 	struct nfp_port *port;
-	int n;
 
 	port = nfp_port_from_netdev(netdev);
 	if (!port)
 		return -EOPNOTSUPP;
 
-	switch (port->type) {
-	case NFP_PORT_PHYS_PORT:
-		eth_port = __nfp_port_get_eth_port(port);
-		if (!eth_port)
-			return -EOPNOTSUPP;
-
-		if (!eth_port->is_split)
-			n = snprintf(name, len, "p%d", eth_port->label_port);
-		else
-			n = snprintf(name, len, "p%ds%d", eth_port->label_port,
-				     eth_port->label_subport);
-		break;
-	case NFP_PORT_PF_PORT:
-		n = snprintf(name, len, "pf%d", port->pf_id);
-		break;
-	case NFP_PORT_VF_PORT:
-		n = snprintf(name, len, "pf%dvf%d", port->pf_id, port->vf_id);
-		break;
-	default:
-		return -EOPNOTSUPP;
-	}
-
-	if (n >= len)
-		return -EINVAL;
-
-	return 0;
+	return devlink_port_get_phys_port_name(&port->dl_port, name, len);
 }
 
 /**
-- 
2.14.3

^ permalink raw reply related

* [patch net-next RFC 12/12] nfp: flower: set sysfs link to device for representors
From: Jiri Pirko @ 2018-03-22 10:55 UTC (permalink / raw)
  To: netdev
  Cc: davem, idosch, jakub.kicinski, mlxsw, andrew, vivien.didelot,
	f.fainelli, michael.chan, ganeshgr, saeedm, simon.horman,
	pieter.jansenvanvuuren, john.hurley, dirk.vandermerwe,
	alexander.h.duyck, ogerlitz, dsahern, vijaya.guvva,
	satananda.burla, raghu.vatsavayi, felix.manlunas, gospo,
	sathya.perla, vasundhara-v.volam, tariqt, eranbe,
	jeffrey.t.kirsher
In-Reply-To: <20180322105522.8186-1-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

Do this so the sysfs has "device" link correctly set.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/netronome/nfp/flower/main.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c b/drivers/net/ethernet/netronome/nfp/flower/main.c
index 1890af7e6196..9751708585e4 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.c
@@ -267,6 +267,7 @@ nfp_flower_spawn_vnic_reprs(struct nfp_app *app,
 				app->pf->vf_cfg_mem + i * NFP_NET_CFG_BAR_SZ;
 		}
 
+		SET_NETDEV_DEV(repr, &priv->nn->pdev->dev);
 		eth_hw_addr_random(repr);
 
 		port_id = nfp_flower_cmsg_pcie_port(nfp_pcie, vnic_type,
-- 
2.14.3

^ permalink raw reply related

* Re: [PATCH nf] netfilter: drop template ct when conntrack is skipped.
From: Florian Westphal @ 2018-03-22 11:16 UTC (permalink / raw)
  To: Paolo Abeni
  Cc: netdev, Pablo Neira Ayuso, Florian Westphal, netfilter-devel,
	coreteam, syzbot+0346441ae0545cfcea3a, syzkaller-bugs
In-Reply-To: <7d4cd8cddda45ba93066e8f977aed5d16d220a67.1521713327.git.pabeni@redhat.com>

Paolo Abeni <pabeni@redhat.com> wrote:
> The ipv4 nf_ct code currently skips the nf_conntrak_in() call
> for fragmented packets. As a results later matches/target can end
> up manipulating template ct entry instead of 'real' ones.
> 
> Exploiting the above, syzbot found a way to trigger the following
> splat:
> 
> WARNING: CPU: 1 PID: 4242 at net/netfilter/xt_cluster.c:55
> xt_cluster_mt+0x6c1/0x840 net/netfilter/xt_cluster.c:127
> Kernel panic - not syncing: panic_on_warn set ...

Right, template has l3 protocol 0.

> Instead of adding checks for template ct on every target/match
> manipulating skb->_nfct, simply drop the template ct when skipping
> nf_conntrack_in().

Fixes: 7b4fdf77a450ec ("netfilter: don't track fragmented packets")
Acked-by: Florian Westphal <fw@strlen.de>

^ permalink raw reply

* Re: [PATCH/RFC 2/3] net/sched: act_tunnel_key: add extended ack support
From: Simon Horman @ 2018-03-22 11:49 UTC (permalink / raw)
  To: Jiri Benc
  Cc: Jiri Pirko, Jamal Hadi Salim, Cong Wang, netdev, oss-drivers,
	Pieter Jansen van Vuuren
In-Reply-To: <20180309122248.1979e00f@redhat.com>

On Fri, Mar 09, 2018 at 12:22:48PM +0100, Jiri Benc wrote:
> On Tue,  6 Mar 2018 18:08:04 +0100, Simon Horman wrote:
> > -	if (!tb[TCA_TUNNEL_KEY_PARMS])
> > +	if (!tb[TCA_TUNNEL_KEY_PARMS]) {
> > +		NL_SET_ERR_MSG(extack, "Missing tunnel key parameter");
> 
> "parameters" (it's not just one parameter)
> 
> > @@ -107,6 +109,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
> >  		break;
> >  	case TCA_TUNNEL_KEY_ACT_SET:
> >  		if (!tb[TCA_TUNNEL_KEY_ENC_KEY_ID]) {
> > +			NL_SET_ERR_MSG(extack, "Missing tunnel key enc id");
> 
> This is not much helpful to the user. What's "enc"? I guess "Missing
> tunnel key id" would be enough and better.
> 
> > @@ -144,6 +147,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
> >  						      0, flags,
> >  						      key_id, 0);
> >  		} else {
> > +			NL_SET_ERR_MSG(extack, "Missing both ipv4 and ipv6 enc src and dst");
> 
> We don't need both but only one of them. And again, "enc" does not have
> a clear meaning.
> 
> "Missing either IPv4 or IPv6 source and destination address"?

Sure, I'll work on making the messages clearer.

> In addition, it makes little sense to me to add extacks to just some of
> the errors in the tunnel_key_init function. Please add extacks to all
> of them.

At the time I wrote the patch I tried to cover those errors that could
result from user-input. I can extend the coverage if that is preferred.

^ permalink raw reply

* Re: [PATCH net] virtio-net: Fix operstate for virtio when no VIRTIO_NET_F_STATUS
From: Michael S. Tsirkin @ 2018-03-22 11:52 UTC (permalink / raw)
  To: Jay Vosburgh; +Cc: netdev, Jason Wang, David Miller, Ben Hutchings
In-Reply-To: <12441.1521709552@nyx>

On Thu, Mar 22, 2018 at 09:05:52AM +0000, Jay Vosburgh wrote:
> 	The operstate update logic will leave an interface in the
> default UNKNOWN operstate if the interface carrier state never changes
> from the default carrier up state set at creation.  This includes the
> case of an explicit call to netif_carrier_on, as the carrier on to on
> transition has no effect on operstate.
> 
> 	This affects virtio-net for the case that the virtio peer does
> not support VIRTIO_NET_F_STATUS (the feature that provides carrier state
> updates).  Without this feature, the virtio specification states that
> "the link should be assumed active," so, logically, the operstate should
> be UP instead of UNKNOWN.  This has impact on user space applications
> that use the operstate to make availability decisions for the interface.
> 
> 	Resolve this by changing the virtio probe logic slightly to call
> netif_carrier_off for both the "with" and "without" VIRTIO_NET_F_STATUS
> cases, and then the existing call to netif_carrier_on for the "without"
> case will cause an operstate transition.
> 
> Cc: "Michael S. Tsirkin" <mst@redhat.com>
> Cc: Jason Wang <jasowang@redhat.com>
> Cc: Ben Hutchings <ben@decadent.org.uk>
> Fixes: 167c25e4c550 ("virtio-net: init link state correctly")

I'd say that's an abuse of this notation. openstate was UNKNOWN
even before that fix.

> Signed-off-by: Jay Vosburgh <jay.vosburgh@canonical.com>

Acked-by: Michael S. Tsirkin <mst@redhat.com>


> ---
> 
> 	I considered resolving this by changing linkwatch_init_dev to
> unconditionally call rfc2863_policy, as that would always set operstate
> for all interfaces.
> 
> 	This would not have any impact on most cases (as most drivers
> call netif_carrier_off during probe), except for the loopback device,
> which currently has an operstate of UNKNOWN (because it never does any
> carrier state transitions).  This change would add a round trip on the
> dev_base_lock for every loopback device creation, which could have a
> negative impact when creating many loopback devices, e.g., when
> concurrently creating large numbers of containers.
> 
> 
>  drivers/net/virtio_net.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 23374603e4d9..7b187ec7411e 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -2857,8 +2857,8 @@ static int virtnet_probe(struct virtio_device *vdev)
>  
>  	/* Assume link up if device can't report link status,
>  	   otherwise get link status from config. */
> +	netif_carrier_off(dev);
>  	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
> -		netif_carrier_off(dev);
>  		schedule_work(&vi->config_work);
>  	} else {
>  		vi->status = VIRTIO_NET_S_LINK_UP;
> -- 
> 2.14.1

^ permalink raw reply

* Re: [PATCH/RFC 3/3] net/sched: add tunnel option support to act_tunnel_key
From: Simon Horman @ 2018-03-22 11:53 UTC (permalink / raw)
  To: Jiri Benc
  Cc: Jiri Pirko, Jamal Hadi Salim, Cong Wang, netdev, oss-drivers,
	Pieter Jansen van Vuuren
In-Reply-To: <20180309125317.076ddb5d@redhat.com>

On Fri, Mar 09, 2018 at 12:53:17PM +0100, Jiri Benc wrote:
> On Tue,  6 Mar 2018 18:08:05 +0100, Simon Horman wrote:
> > +static int
> > +tunnel_key_copy_geneve_opt(const struct nlattr *nla, int dst_len, void *dst,
> > +			   struct netlink_ext_ack *extack)
> > +{
> > +	struct nlattr *tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX + 1];
> > +	int err, data_len, opt_len;
> > +	u8 *data;
> > +
> > +	err = nla_parse_nested(tb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX,
> > +			       nla, geneve_opt_policy, extack);
> > +	if (err < 0)
> > +		return err;
> > +
> > +	if (!tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS] ||
> > +	    !tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE] ||
> > +	    !tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA]) {
> > +		NL_SET_ERR_MSG(extack, "Missing tunnel key enc geneve option class, type or data");
> 
> I think it's obvious by now that I don't like the "enc" :-)
> 
> > +		return -EINVAL;
> > +	}
> > +
> > +	data = nla_data(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA]);
> > +	data_len = nla_len(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA]);
> > +	if (data_len < 4) {
> > +		NL_SET_ERR_MSG(extack, "Tunnel key enc geneve option data is less than 4 bytes long");
> > +		return -ERANGE;
> > +	}
> > +	if (data_len % 4) {
> > +		NL_SET_ERR_MSG(extack, "Tunnel key enc geneve option data is not a multiple of 4 bytes long");
> > +		return -ERANGE;
> > +	}
> > +
> > +	opt_len = sizeof(struct geneve_opt) + data_len;
> > +	if (dst) {
> > +		struct geneve_opt *opt = dst;
> > +		u16 class;
> > +
> > +		if (dst_len < opt_len) {
> > +			NL_SET_ERR_MSG(extack, "Tunnel key enc geneve option data length is longer than the supplied data");
> 
> I don't understand this error. What can the user do about it?
> Furthermore, the error is not propagated to the user space (see also
> below).
> 
> Shouldn't this be WARN_ON or something?

Sure, that is fine by me.

> 
> > +			return -EINVAL;
> > +		}
> > +
> > +		class = nla_get_u16(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS]);
> > +		put_unaligned_be16(class, &opt->opt_class);
> 
> How this can be unaligned, given we check that the option length is a
> multiple of 4 bytes and the option header is 4 bytes?

True.

> > +
> > +		opt->type = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE]);
> > +		opt->length = data_len / 4; /* length is in units of 4 bytes */
> > +		opt->r1 = 0;
> > +		opt->r2 = 0;
> > +		opt->r3 = 0;
> > +
> > +		memcpy(opt + 1, data, data_len);
> > +	}
> > +
> > +	return opt_len;
> > +}
> > +
> > +static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst,
> > +				int dst_len, struct netlink_ext_ack *extack)
> 
> Please be consistent with parameter ordering, dst and dst_len are in a
> different order here and in tunnel_key_copy_geneve_opt.

Sure, will do.

> [...]
> > @@ -157,6 +292,11 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
> >  			goto err_out;
> >  		}
> >  
> > +		if (opts_len)
> > +			tunnel_key_opts_set(tb[TCA_TUNNEL_KEY_ENC_OPTS],
> > +					    &metadata->u.tun_info, opts_len,
> > +					    extack);
> 
> You need to propagate the error here. The previous validation is not
> enough as errors while copying to tun_info were not covered.

Thanks, sorry for missing that.

> > +
> >  		metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX;
> >  		break;
> >  	default:
> > @@ -221,6 +361,53 @@ static void tunnel_key_release(struct tc_action *a)
> >  	kfree_rcu(params, rcu);
> >  }
> >  
> > +static int tunnel_key_geneve_opts_dump(struct sk_buff *skb,
> > +				       const struct ip_tunnel_info *info)
> > +{
> > +	int len = info->options_len;
> > +	u8 *src = (u8 *)(info + 1);
> > +
> > +	while (len > 0) {
> > +		struct geneve_opt *opt = (struct geneve_opt *)src;
> > +		u16 class;
> > +
> > +		class = get_unaligned_be16(&opt->opt_class);
> 
> I don't think this can be unaligned.

Thanks, I'm not sure why I thought otherwise.

> > +		if (nla_put_u16(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS,
> > +				class) ||
> > +		    nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE,
> > +			       opt->type) ||
> > +		    nla_put(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA,
> > +			    opt->length * 4, opt + 1))
> > +			return -EMSGSIZE;
> > +
> > +		len -= sizeof(struct geneve_opt) + opt->length * 4;
> > +		src += sizeof(struct geneve_opt) + opt->length * 4;
> > +	}
> 
> All of this needs to be nested in TCA_TUNNEL_KEY_ENC_OPTS_GENEVE.

Agreed.

> > +
> > +	return 0;
> > +}
> > +
> > +static int tunnel_key_opts_dump(struct sk_buff *skb,
> > +				const struct ip_tunnel_info *info)
> > +{
> > +	struct nlattr *start;
> > +	int err;
> > +
> > +	if (!info->options_len)
> > +		return 0;
> > +
> > +	start = nla_nest_start(skb, TCA_TUNNEL_KEY_ENC_OPTS);
> > +	if (!start)
> > +		return -EMSGSIZE;
> > +
> > +	err = tunnel_key_geneve_opts_dump(skb, info);
> 
> How do you know it is geneve opts and not some other (future) tunnel
> type options?
> 
> This is actually more fundamental problem with this patch. The
> metadata_dst options are blindly set to the provided data, yet nowhere
> we check whether the tunnel type matches. This must be done somehow. We
> probably need to store the options type in metadata_dst.

Thanks for pointing that out. Its a pretty fundamental oversight.

^ permalink raw reply

* Re: [PATCH nf] netfilter: drop template ct when conntrack is skipped.
From: Pablo Neira Ayuso @ 2018-03-22 11:56 UTC (permalink / raw)
  To: Paolo Abeni
  Cc: netdev, Florian Westphal, netfilter-devel, coreteam,
	syzbot+0346441ae0545cfcea3a, syzkaller-bugs
In-Reply-To: <7d4cd8cddda45ba93066e8f977aed5d16d220a67.1521713327.git.pabeni@redhat.com>

On Thu, Mar 22, 2018 at 11:08:50AM +0100, Paolo Abeni wrote:
> The ipv4 nf_ct code currently skips the nf_conntrak_in() call
> for fragmented packets. As a results later matches/target can end
> up manipulating template ct entry instead of 'real' ones.
> 
> Exploiting the above, syzbot found a way to trigger the following
> splat:
> 
> WARNING: CPU: 1 PID: 4242 at net/netfilter/xt_cluster.c:55
> xt_cluster_mt+0x6c1/0x840 net/netfilter/xt_cluster.c:127
> Kernel panic - not syncing: panic_on_warn set ...
> 
> CPU: 1 PID: 4242 Comm: syzkaller027971 Not tainted 4.16.0-rc2+ #243
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
> Google 01/01/2011
> Call Trace:
>   __dump_stack lib/dump_stack.c:17 [inline]
>   dump_stack+0x194/0x24d lib/dump_stack.c:53
>   panic+0x1e4/0x41c kernel/panic.c:183
>   __warn+0x1dc/0x200 kernel/panic.c:547
>   report_bug+0x211/0x2d0 lib/bug.c:184
>   fixup_bug.part.11+0x37/0x80 arch/x86/kernel/traps.c:178
>   fixup_bug arch/x86/kernel/traps.c:247 [inline]
>   do_error_trap+0x2d7/0x3e0 arch/x86/kernel/traps.c:296
>   do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:315
>   invalid_op+0x58/0x80 arch/x86/entry/entry_64.S:957
> RIP: 0010:xt_cluster_hash net/netfilter/xt_cluster.c:55 [inline]
> RIP: 0010:xt_cluster_mt+0x6c1/0x840 net/netfilter/xt_cluster.c:127
> RSP: 0018:ffff8801d2f6f2d0 EFLAGS: 00010293
> RAX: ffff8801af700540 RBX: 0000000000000000 RCX: ffffffff84a2d1e1
> RDX: 0000000000000000 RSI: ffff8801d2f6f478 RDI: ffff8801cafd336a
> RBP: ffff8801d2f6f2e8 R08: 0000000000000000 R09: 0000000000000001
> R10: 0000000000000000 R11: 0000000000000000 R12: ffff8801b03b3d18
> R13: ffff8801cafd3300 R14: dffffc0000000000 R15: ffff8801d2f6f478
>   ipt_do_table+0xa91/0x19b0 net/ipv4/netfilter/ip_tables.c:296
>   iptable_filter_hook+0x65/0x80 net/ipv4/netfilter/iptable_filter.c:41
>   nf_hook_entry_hookfn include/linux/netfilter.h:120 [inline]
>   nf_hook_slow+0xba/0x1a0 net/netfilter/core.c:483
>   nf_hook include/linux/netfilter.h:243 [inline]
>   NF_HOOK include/linux/netfilter.h:286 [inline]
>   raw_send_hdrinc.isra.17+0xf39/0x1880 net/ipv4/raw.c:432
>   raw_sendmsg+0x14cd/0x26b0 net/ipv4/raw.c:669
>   inet_sendmsg+0x11f/0x5e0 net/ipv4/af_inet.c:763
>   sock_sendmsg_nosec net/socket.c:629 [inline]
>   sock_sendmsg+0xca/0x110 net/socket.c:639
>   SYSC_sendto+0x361/0x5c0 net/socket.c:1748
>   SyS_sendto+0x40/0x50 net/socket.c:1716
>   do_syscall_64+0x280/0x940 arch/x86/entry/common.c:287
>   entry_SYSCALL_64_after_hwframe+0x42/0xb7
> RIP: 0033:0x441b49
> RSP: 002b:00007ffff5ca8b18 EFLAGS: 00000216 ORIG_RAX: 000000000000002c
> RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 0000000000441b49
> RDX: 0000000000000030 RSI: 0000000020ff7000 RDI: 0000000000000003
> RBP: 00000000006cc018 R08: 000000002066354c R09: 0000000000000010
> R10: 0000000000000000 R11: 0000000000000216 R12: 0000000000403470
> R13: 0000000000403500 R14: 0000000000000000 R15: 0000000000000000
> Dumping ftrace buffer:
>     (ftrace buffer empty)
> Kernel Offset: disabled
> Rebooting in 86400 seconds..
> 
> Instead of adding checks for template ct on every target/match
> manipulating skb->_nfct, simply drop the template ct when skipping
> nf_conntrack_in().

Applied, thanks.

^ permalink raw reply

* Re: [PATCH v2 2/2] i40e: add support for XDP_REDIRECT
From: Jesper Dangaard Brouer @ 2018-03-22 11:58 UTC (permalink / raw)
  To: Björn Töpel
  Cc: brouer, jeffrey.t.kirsher, intel-wired-lan, Björn Töpel,
	magnus.karlsson, netdev, alexander.h.duyck, alexander.duyck
In-Reply-To: <20180322090307.14409-2-bjorn.topel@gmail.com>


On Thu, 22 Mar 2018 10:03:07 +0100 Björn Töpel <bjorn.topel@gmail.com> wrote:

> +/**
> + * i40e_xdp_xmit - Implements ndo_xdp_xmit
> + * @dev: netdev
> + * @xdp: XDP buffer
> + *
> + * Returns Zero if sent, else an error code
> + **/
> +int i40e_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
> +{

The return code is used by the XDP redirect tracepoint... this is the
only way we have to debug/troubleshoot runtime issues with XDP. Thus,
these need to be consistent across drives and distinguishable.

> +	struct i40e_netdev_priv *np = netdev_priv(dev);
> +	unsigned int queue_index = smp_processor_id();
> +	struct i40e_vsi *vsi = np->vsi;
> +	int err;
> +
> +	if (test_bit(__I40E_VSI_DOWN, vsi->state))
> +		return -EINVAL;

Should be: -ENETDOWN

> +
> +	if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs)
> +		return -EINVAL;

Should be: -ENXIO

> +	err = i40e_xmit_xdp_ring(xdp, vsi->xdp_rings[queue_index]);
> +	if (err != I40E_XDP_TX)
> +		return -ENOMEM;

Should be: -ENOSPC

The ENOSPC return code is important, as this can be used as a feedback
to a XDP_REDIRECT load-balancer facility.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [PATCH net] virtio-net: Fix operstate for virtio when no VIRTIO_NET_F_STATUS
From: Jay Vosburgh @ 2018-03-22 12:02 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, Jason Wang, David Miller, Ben Hutchings
In-Reply-To: <20180322134901-mutt-send-email-mst@kernel.org>

Michael S. Tsirkin <mst@redhat.com> wrote:

>On Thu, Mar 22, 2018 at 09:05:52AM +0000, Jay Vosburgh wrote:
>> 	The operstate update logic will leave an interface in the
>> default UNKNOWN operstate if the interface carrier state never changes
>> from the default carrier up state set at creation.  This includes the
>> case of an explicit call to netif_carrier_on, as the carrier on to on
>> transition has no effect on operstate.
>> 
>> 	This affects virtio-net for the case that the virtio peer does
>> not support VIRTIO_NET_F_STATUS (the feature that provides carrier state
>> updates).  Without this feature, the virtio specification states that
>> "the link should be assumed active," so, logically, the operstate should
>> be UP instead of UNKNOWN.  This has impact on user space applications
>> that use the operstate to make availability decisions for the interface.
>> 
>> 	Resolve this by changing the virtio probe logic slightly to call
>> netif_carrier_off for both the "with" and "without" VIRTIO_NET_F_STATUS
>> cases, and then the existing call to netif_carrier_on for the "without"
>> case will cause an operstate transition.
>> 
>> Cc: "Michael S. Tsirkin" <mst@redhat.com>
>> Cc: Jason Wang <jasowang@redhat.com>
>> Cc: Ben Hutchings <ben@decadent.org.uk>
>> Fixes: 167c25e4c550 ("virtio-net: init link state correctly")
>
>I'd say that's an abuse of this notation. openstate was UNKNOWN
>even before that fix.

	I went back to the commit that added the dependency on
VIRTIO_NET_F_STATUS (and that this patch would thus apply on top of).
If that's an issue, I can resubmit without it.

	-J

>> Signed-off-by: Jay Vosburgh <jay.vosburgh@canonical.com>
>
>Acked-by: Michael S. Tsirkin <mst@redhat.com>
>
>
>> ---
>> 
>> 	I considered resolving this by changing linkwatch_init_dev to
>> unconditionally call rfc2863_policy, as that would always set operstate
>> for all interfaces.
>> 
>> 	This would not have any impact on most cases (as most drivers
>> call netif_carrier_off during probe), except for the loopback device,
>> which currently has an operstate of UNKNOWN (because it never does any
>> carrier state transitions).  This change would add a round trip on the
>> dev_base_lock for every loopback device creation, which could have a
>> negative impact when creating many loopback devices, e.g., when
>> concurrently creating large numbers of containers.
>> 
>> 
>>  drivers/net/virtio_net.c | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>> 
>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>> index 23374603e4d9..7b187ec7411e 100644
>> --- a/drivers/net/virtio_net.c
>> +++ b/drivers/net/virtio_net.c
>> @@ -2857,8 +2857,8 @@ static int virtnet_probe(struct virtio_device *vdev)
>>  
>>  	/* Assume link up if device can't report link status,
>>  	   otherwise get link status from config. */
>> +	netif_carrier_off(dev);
>>  	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
>> -		netif_carrier_off(dev);
>>  		schedule_work(&vi->config_work);
>>  	} else {
>>  		vi->status = VIRTIO_NET_S_LINK_UP;
>> -- 
>> 2.14.1

^ permalink raw reply

* Re: [PATCH v2 2/2] i40e: add support for XDP_REDIRECT
From: Björn Töpel @ 2018-03-22 12:20 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: Jeff Kirsher, intel-wired-lan, Björn Töpel,
	Karlsson, Magnus, Netdev, Duyck, Alexander H, Alexander Duyck
In-Reply-To: <20180322125811.0dcc1b28@redhat.com>

2018-03-22 12:58 GMT+01:00 Jesper Dangaard Brouer <brouer@redhat.com>:
>
> On Thu, 22 Mar 2018 10:03:07 +0100 Björn Töpel <bjorn.topel@gmail.com> wrote:
>
>> +/**
>> + * i40e_xdp_xmit - Implements ndo_xdp_xmit
>> + * @dev: netdev
>> + * @xdp: XDP buffer
>> + *
>> + * Returns Zero if sent, else an error code
>> + **/
>> +int i40e_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
>> +{
>
> The return code is used by the XDP redirect tracepoint... this is the
> only way we have to debug/troubleshoot runtime issues with XDP. Thus,
> these need to be consistent across drives and distinguishable.
>

Thanks for pointing this out! I'll address all your comments and do a
respin (but I'll wait for Alex' comments, if any).


Björn

>> +     struct i40e_netdev_priv *np = netdev_priv(dev);
>> +     unsigned int queue_index = smp_processor_id();
>> +     struct i40e_vsi *vsi = np->vsi;
>> +     int err;
>> +
>> +     if (test_bit(__I40E_VSI_DOWN, vsi->state))
>> +             return -EINVAL;
>
> Should be: -ENETDOWN
>
>> +
>> +     if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs)
>> +             return -EINVAL;
>
> Should be: -ENXIO
>
>> +     err = i40e_xmit_xdp_ring(xdp, vsi->xdp_rings[queue_index]);
>> +     if (err != I40E_XDP_TX)
>> +             return -ENOMEM;
>
> Should be: -ENOSPC
>
> The ENOSPC return code is important, as this can be used as a feedback
> to a XDP_REDIRECT load-balancer facility.
>
> --
> Best regards,
>   Jesper Dangaard Brouer
>   MSc.CS, Principal Kernel Engineer at Red Hat
>   LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* [RESEND PATCH net-next 1/1] tc-testing: updated police, mirred, skbedit and skbmod with more tests
From: Roman Mashak @ 2018-03-22 12:23 UTC (permalink / raw)
  To: davem; +Cc: netdev, jhs, xiyou.wangcong, jiri, Roman Mashak

Added extra test cases for control actions (reclassify, pipe etc.),
cookies, max index value and police args sanity check.

Signed-off-by: Roman Mashak <mrv@mojatatu.com>
---
 .../tc-testing/tc-tests/actions/mirred.json        | 192 +++++++++++++++++++++
 .../tc-testing/tc-tests/actions/police.json        | 144 ++++++++++++++++
 .../tc-testing/tc-tests/actions/skbedit.json       | 168 ++++++++++++++++++
 .../tc-testing/tc-tests/actions/skbmod.json        |  26 ++-
 4 files changed, 529 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/mirred.json b/tools/testing/selftests/tc-testing/tc-tests/actions/mirred.json
index 0fcccf18399b..443c9b3c8664 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/actions/mirred.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/mirred.json
@@ -171,6 +171,198 @@
         ]
     },
     {
+        "id": "8917",
+        "name": "Add mirred mirror action with control pass",
+        "category": [
+            "actions",
+            "mirred"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action mirred",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action mirred ingress mirror dev lo pass index 1",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action mirred index 1",
+        "matchPattern": "action order [0-9]*: mirred \\(Ingress Mirror to device lo\\) pass.*index 1 ref",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action mirred"
+        ]
+    },
+    {
+        "id": "1054",
+        "name": "Add mirred mirror action with control pipe",
+        "category": [
+            "actions",
+            "mirred"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action mirred",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action mirred ingress mirror dev lo pipe index 15",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action mirred index 15",
+        "matchPattern": "action order [0-9]*: mirred \\(Ingress Mirror to device lo\\) pipe.*index 15 ref",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action mirred"
+        ]
+    },
+    {
+        "id": "9887",
+        "name": "Add mirred mirror action with control continue",
+        "category": [
+            "actions",
+            "mirred"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action mirred",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action mirred ingress mirror dev lo continue index 15",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action mirred index 15",
+        "matchPattern": "action order [0-9]*: mirred \\(Ingress Mirror to device lo\\) continue.*index 15 ref",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action mirred"
+        ]
+    },
+    {
+        "id": "e4aa",
+        "name": "Add mirred mirror action with control reclassify",
+        "category": [
+            "actions",
+            "mirred"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action mirred",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action mirred ingress mirror dev lo reclassify index 150",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action mirred index 150",
+        "matchPattern": "action order [0-9]*: mirred \\(Ingress Mirror to device lo\\) reclassify.*index 150 ref",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action mirred"
+        ]
+    },
+    {
+        "id": "ece9",
+        "name": "Add mirred mirror action with control drop",
+        "category": [
+            "actions",
+            "mirred"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action mirred",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action mirred ingress mirror dev lo drop index 99",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action mirred index 99",
+        "matchPattern": "action order [0-9]*: mirred \\(Ingress Mirror to device lo\\) drop.*index 99 ref",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action mirred"
+        ]
+    },
+    {
+        "id": "0031",
+        "name": "Add mirred mirror action with control jump",
+        "category": [
+            "actions",
+            "mirred"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action mirred",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action mirred ingress mirror dev lo jump 10 index 99",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action mirred index 99",
+        "matchPattern": "action order [0-9]*: mirred \\(Ingress Mirror to device lo\\) jump 10.*index 99 ref",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action mirred"
+        ]
+    },
+    {
+        "id": "407c",
+        "name": "Add mirred mirror action with cookie",
+        "category": [
+            "actions",
+            "mirred"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action mirred",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action mirred ingress mirror dev lo reclassify cookie aa11bb22cc33dd44ee55",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions ls action mirred",
+        "matchPattern": "action order [0-9]*: mirred \\(Ingress Mirror to device lo\\) reclassify.*cookie aa11bb22cc33dd44ee55",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action mirred"
+        ]
+    },
+    {
+        "id": "8b69",
+        "name": "Add mirred mirror action with maximum index",
+        "category": [
+            "actions",
+            "mirred"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action mirred",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action mirred ingress mirror dev lo pipe index 4294967295",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action mirred index 4294967295",
+        "matchPattern": "action order [0-9]*: mirred \\(Ingress Mirror to device lo\\) pipe.*index 4294967295",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action mirred"
+        ]
+    },
+    {
         "id": "a70e",
         "name": "Delete mirred mirror action",
         "category": [
diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/police.json b/tools/testing/selftests/tc-testing/tc-tests/actions/police.json
index 0e602a3f9393..38d85a1d7492 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/actions/police.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/police.json
@@ -265,6 +265,150 @@
         ]
     },
     {
+        "id": "ddd6",
+        "name": "Add police action with invalid rate value",
+        "category": [
+            "actions",
+            "police"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action police",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action police rate 3tb burst 250k conform-exceed pass/pipe index 5",
+        "expExitCode": "255",
+        "verifyCmd": "$TC actions ls action police",
+        "matchPattern": "action order [0-9]*:  police 0x5 rate 3Tb burst 250Kb mtu 2Kb action pass/pipe",
+        "matchCount": "0",
+        "teardown": [
+            "$TC actions flush action police"
+        ]
+    },
+    {
+        "id": "f61c",
+        "name": "Add police action with invalid burst value",
+        "category": [
+            "actions",
+            "police"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action police",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action police rate 3kbit burst 250P conform-exceed pass/pipe index 5",
+        "expExitCode": "255",
+        "verifyCmd": "$TC actions ls action police",
+        "matchPattern": "action order [0-9]*:  police 0x5 rate 3Kbit burst 250Pb mtu 2Kb action pass/pipe",
+        "matchCount": "0",
+        "teardown": [
+            "$TC actions flush action police"
+        ]
+    },
+    {
+        "id": "c26f",
+        "name": "Add police action with invalid peakrate value",
+        "category": [
+            "actions",
+            "police"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action police",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action police rate 90kbit burst 10k mtu 2kb peakrate 100T index 1",
+        "expExitCode": "255",
+        "verifyCmd": "$TC actions ls action police",
+        "matchPattern": "action order [0-9]*:  police 0x1 rate 90Kbit burst 10Kb mtu 2Kb peakrate 100Tbit",
+        "matchCount": "0",
+        "teardown": [
+            "$TC actions flush action police"
+        ]
+    },
+    {
+        "id": "db04",
+        "name": "Add police action with invalid mtu value",
+        "category": [
+            "actions",
+            "police"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action police",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action police rate 10kbit burst 10k mtu 2Pbit index 1",
+        "expExitCode": "255",
+        "verifyCmd": "$TC actions ls action police",
+        "matchPattern": "action order [0-9]*:  police 0x1 rate 10Kbit burst 1Kb mtu 2Pb",
+        "matchCount": "0",
+        "teardown": [
+            "$TC actions flush action police"
+        ]
+    },
+    {
+        "id": "f3c9",
+        "name": "Add police action with cookie",
+        "category": [
+            "actions",
+            "police"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action police",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action police rate 10mbit burst 10k index 1 cookie a1b1c1d1e1f12233bb",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action police index 1",
+        "matchPattern": "action order [0-9]*:  police 0x1 rate 10Mbit burst 10Kb mtu 2Kb.*cookie a1b1c1d1e1f12233bb",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action police"
+        ]
+    },
+    {
+        "id": "d190",
+        "name": "Add police action with maximum index",
+        "category": [
+            "actions",
+            "police"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action police",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action police rate 10mbit burst 10k index 4294967295",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action mirred index 4294967295",
+        "matchPattern": "action order [0-9]*:  police 0xffffffff rate 10Mbit burst 10Kb mtu 2Kb",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action mirred"
+        ]
+    },
+    {
         "id": "336e",
         "name": "Delete police action",
         "category": [
diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/skbedit.json b/tools/testing/selftests/tc-testing/tc-tests/actions/skbedit.json
index 99635ea4722e..37ecc2716fee 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/actions/skbedit.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/skbedit.json
@@ -216,6 +216,174 @@
         ]
     },
     {
+        "id": "464a",
+        "name": "Add skbedit action with control pipe",
+        "category": [
+            "actions",
+            "skbedit"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action skbedit",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action skbedit ptype host pipe index 11",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action skbedit index 11",
+        "matchPattern": "action order [0-9]*:  skbedit ptype host pipe.*index 11 ref",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action skbedit"
+        ]
+    },
+    {
+        "id": "212f",
+        "name": "Add skbedit action with control reclassify",
+        "category": [
+            "actions",
+            "skbedit"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action skbedit",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action skbedit mark 56789 reclassify index 90",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action skbedit index 90",
+        "matchPattern": "action order [0-9]*:  skbedit mark 56789 reclassify.*index 90 ref",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action skbedit"
+        ]
+    },
+    {
+        "id": "0651",
+        "name": "Add skbedit action with control pass",
+        "category": [
+            "actions",
+            "skbedit"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action skbedit",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action skbedit queue_mapping 3 pass index 271",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action skbedit index 271",
+        "matchPattern": "action order [0-9]*:  skbedit queue_mapping 3 pass.*index 271 ref",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action skbedit"
+        ]
+    },
+    {
+        "id": "cc53",
+        "name": "Add skbedit action with control drop",
+        "category": [
+            "actions",
+            "skbedit"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action skbedit",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action skbedit queue_mapping 3 drop index 271",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action skbedit index 271",
+        "matchPattern": "action order [0-9]*:  skbedit queue_mapping 3 drop.*index 271 ref",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action skbedit"
+        ]
+    },
+    {
+        "id": "ec16",
+        "name": "Add skbedit action with control jump",
+        "category": [
+            "actions",
+            "skbedit"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action skbedit",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action skbedit priority 8 jump 9 index 2",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action skbedit index 2",
+        "matchPattern": "action order [0-9]*:  skbedit priority :8 jump 9.*index 2 ref",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action skbedit"
+        ]
+    },
+    {
+        "id": "db54",
+        "name": "Add skbedit action with control continue",
+        "category": [
+            "actions",
+            "skbedit"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action skbedit",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action skbedit priority 16 continue index 32",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action skbedit index 32",
+        "matchPattern": "action order [0-9]*:  skbedit priority :16 continue.*index 32 ref",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action skbedit"
+        ]
+    },
+    {
+        "id": "1055",
+        "name": "Add skbedit action with cookie",
+        "category": [
+            "actions",
+            "skbedit"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action skbedit",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action skbedit priority 16 continue index 32 cookie deadbeef",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action skbedit index 32",
+        "matchPattern": "action order [0-9]*:  skbedit priority :16 continue.*index 32 ref.*cookie deadbeef",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action skbedit"
+        ]
+    },
+    {
         "id": "5172",
         "name": "List skbedit actions",
         "category": [
diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/skbmod.json b/tools/testing/selftests/tc-testing/tc-tests/actions/skbmod.json
index 90bba48c3f07..8aa5a88ccb19 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/actions/skbmod.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/skbmod.json
@@ -264,6 +264,30 @@
         ]
     },
     {
+        "id": "6046",
+        "name": "Add skbmod action with control reclassify and cookie",
+        "category": [
+            "actions",
+            "skbmod"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action skbmod",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action skbmod set smac 00:01:02:03:04:01 reclassify index 1 cookie ddeeffaabb11cc22",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action skbmod index 1",
+        "matchPattern": "action order [0-9]*: skbmod reclassify set smac 00:01:02:03:04:01.*index 1 ref.*cookie ddeeffaabb11cc22",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action skbmod"
+        ]
+    },
+    {
         "id": "58cb",
         "name": "List skbmod actions",
         "category": [
@@ -369,4 +393,4 @@
             "$TC actions flush action skbmod"
         ]
     }
-]
+]
\ No newline at end of file
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH net-next 06/14] net/tls: Add generic NIC offload infrastructure
From: Boris Pismenny @ 2018-03-22 12:38 UTC (permalink / raw)
  To: Kirill Tkhai, Saeed Mahameed, David S. Miller
  Cc: netdev, Dave Watson, Ilya Lesokhin, Aviad Yehezkel
In-Reply-To: <372a4ca2-9ecf-95fe-7233-4937da82003a@virtuozzo.com>

...
>>>
>>> Can't we move this check in tls_dev_event() and use it for all types of events?
>>> Then we avoid duplicate code.
>>>
>>
>> No. Not all events require this check. Also, the result is different for different events.
> 
> No. You always return NOTIFY_DONE, in case of !(netdev->features & NETIF_F_HW_TLS_TX).
> See below:
> 
> static int tls_check_dev_ops(struct net_device *dev)
> {
> 	if (!dev->tlsdev_ops)
> 		return NOTIFY_BAD;
> 
> 	return NOTIFY_DONE;
> }
> 
> static int tls_device_down(struct net_device *netdev)
> {
> 	struct tls_context *ctx, *tmp;
> 	struct list_head list;
> 	unsigned long flags;
> 
> 	...
> 	return NOTIFY_DONE;
> }
> 
> static int tls_dev_event(struct notifier_block *this, unsigned long event,
>          		 void *ptr)
> {
>          struct net_device *dev = netdev_notifier_info_to_dev(ptr);
> 
> 	if (!(netdev->features & NETIF_F_HW_TLS_TX))
> 		return NOTIFY_DONE;
>   
>          switch (event) {
>          case NETDEV_REGISTER:
>          case NETDEV_FEAT_CHANGE:
>          	return tls_check_dev_ops(dev);
>   
>          case NETDEV_DOWN:
>          	return tls_device_down(dev);
>          }
>          return NOTIFY_DONE;
> }
>  

Sure, will fix in V3.

>>>> +
>>>> +    /* Request a write lock to block new offload attempts
>>>> +     */
>>>> +    percpu_down_write(&device_offload_lock);
>>>
>>> What is the reason percpu_rwsem is chosen here? It looks like this primitive
>>> gives more advantages readers, then plain rwsem does. But it also gives
>>> disadvantages to writers. It would be good, unless tls_device_down() is called
>>> with rtnl_lock() held from netdevice notifier. But since netdevice notifier
>>> are called with rtnl_lock() held, percpu_rwsem will increase the time rtnl_lock()
>>> is locked.
>> We use the a rwsem to allow multiple (readers) invocations of tls_set_device_offload, which is triggered by the user (persumably) during the TLS handshake. This might be considered a fast-path.
>>
>> However, we must block all calls to tls_set_device_offload while we are processing NETDEV_DOWN events (writer).
>>
>> As you've mentioned, the percpu rwsem is more efficient for readers, especially on NUMA systems, where cache-line bouncing occurs during reader acquire and reduces performance.
> 
> Hm, and who are the readers? It's used from do_tls_setsockopt_tx(), but it doesn't
> seem to be performance critical. Who else?
> 

It depends on whether you consider the TLS handshake code as critical.
The readers are TCP connections processing the CCS message of the TLS 
handshake. They are providing key material to the kernel to start using 
Kernel TLS.


>>>
>>> Can't we use plain rwsem here instead?
>>>
>>
>> Its a performance tradeoff. I'm not certain that the percpu rwsem write side acquire is significantly worse than using the global rwsem.
>>
>> For now, while all of this is experimental, can we agree to focus on the performance of readers? We can change it later if it becomes a problem.
> 
> Same as above.
>   

Replaced with rwsem from V2.

^ permalink raw reply

* Re: [PATCH V2 net-next 06/14] net/tls: Add generic NIC offload infrastructure
From: Boris Pismenny @ 2018-03-22 12:45 UTC (permalink / raw)
  To: Eric Dumazet, Saeed Mahameed, David S. Miller
  Cc: netdev, Dave Watson, Ilya Lesokhin, Aviad Yehezkel
In-Reply-To: <4cfad193-27f5-6982-66cc-4e327c8b10a0@gmail.com>



On 3/21/2018 11:10 PM, Eric Dumazet wrote:
> 
> 
> On 03/21/2018 02:01 PM, Saeed Mahameed wrote:
>> From: Ilya Lesokhin <ilyal@mellanox.com>
>>
>> This patch adds a generic infrastructure to offload TLS crypto to a
> 
> ...
> 
>> +
>> +static inline int tls_push_record(struct sock *sk,
>> +				  struct tls_context *ctx,
>> +				  struct tls_offload_context *offload_ctx,
>> +				  struct tls_record_info *record,
>> +				  struct page_frag *pfrag,
>> +				  int flags,
>> +				  unsigned char record_type)
>> +{
>> +	skb_frag_t *frag;
>> +	struct tcp_sock *tp = tcp_sk(sk);
>> +	struct page_frag fallback_frag;
>> +	struct page_frag  *tag_pfrag = pfrag;
>> +	int i;
>> +
>> +	/* fill prepand */
>> +	frag = &record->frags[0];
>> +	tls_fill_prepend(ctx,
>> +			 skb_frag_address(frag),
>> +			 record->len - ctx->prepend_size,
>> +			 record_type);
>> +
>> +	if (unlikely(!skb_page_frag_refill(ctx->tag_size, pfrag, GFP_KERNEL))) {
>> +		/* HW doesn't care about the data in the tag
>> +		 * so in case pfrag has no room
>> +		 * for a tag and we can't allocate a new pfrag
>> +		 * just use the page in the first frag
>> +		 * rather then write a complicated fall back code.
>> +		 */
>> +		tag_pfrag = &fallback_frag;
>> +		tag_pfrag->page = skb_frag_page(frag);
>> +		tag_pfrag->offset = 0;
>> +	}
>> +
> 
> If HW does not care, why even trying to call skb_page_frag_refill() ?
> 

There's no particular reason for allocating memory here. I'll remove it 
for V3.

> If you remove it, then we remove one seldom used path and might uncover bugs
> 
> This part looks very suspect to me, to be honest.
> 

HW doesn't care, because it generates the tag, and nothing along the 
network stack touches the data here.

Would you prefer that we allocate it anyway and wait for memory if it is 
not available?

^ permalink raw reply

* RE: [RFC PATCH 0/3] kernel: add support for 256-bit IO access
From: David Laight @ 2018-03-22 12:48 UTC (permalink / raw)
  To: 'Linus Torvalds', 'Ingo Molnar'
  Cc: 'Thomas Gleixner', 'Rahul Lakkireddy',
	'x86@kernel.org', 'linux-kernel@vger.kernel.org',
	'netdev@vger.kernel.org', 'mingo@redhat.com',
	'hpa@zytor.com', 'davem@davemloft.net',
	'akpm@linux-foundation.org',
	'ganeshgr@chelsio.com', 'nirranjan@chelsio.com',
	'indranil@chelsio.com', 'Andy Lutomirski',
	'Peter Zijlstra', 'Fenghua Yu',
	'Eric Biggers'
In-Reply-To: <2f5cb27711e741fba77846574e72eb62@AcuMS.aculab.com>

From: David Laight
> Sent: 22 March 2018 10:36
...
> Any code would need to be in memcpy_fromio(), not in every driver that
> might benefit.
> Then fallback code can be used if the registers aren't available.
> 
> >  (b) we can't guarantee that %ymm register write will show up on any
> > bus as a single write transaction anyway
> 
> Misaligned 8 byte accesses generate a single PCIe TLP.
> I can look at what happens for AVX2 transfers later.
> I've got code that mmap()s PCIe addresses into user space, and can
> look at the TLP (indirectly through tracing on an fpga target).
> Just need to set something up that uses AVX copies.

On my i7-7700 reads into xmm registers generate 16 byte TLP and
reads into ymm registers 32 byte TLP.
I don't think I've a system that supports avx-512

With my lethargic fpga slave 32 bytes reads happen every 144 clocks
and 16 byte ones every 126 (+/- the odd clock).
Single bytes ones happen every 108, 8 byte 117.
So I have (about) 110 clock overhead on every read cycle.
These clocks are the 62.5MHz clock on the fpga.

So if we needed to do PIO reads using the AVX2 (or better AVX-512)
registers would make a significant difference.
Fortunately we can 'dma' most of the data we need to transfer.

I've traced writes before, they are a lot faster and are limited
by things in the fpga fabric (they appear back to back).

	David


^ permalink raw reply

* Re: [PATCH net-next 06/14] net/tls: Add generic NIC offload infrastructure
From: Kirill Tkhai @ 2018-03-22 13:03 UTC (permalink / raw)
  To: Boris Pismenny, Saeed Mahameed, David S. Miller
  Cc: netdev, Dave Watson, Ilya Lesokhin, Aviad Yehezkel
In-Reply-To: <45bbe8bc-1de0-3f69-51f9-8db02203f989@mellanox.com>

On 22.03.2018 15:38, Boris Pismenny wrote:
> ...
>>>>
>>>> Can't we move this check in tls_dev_event() and use it for all types of events?
>>>> Then we avoid duplicate code.
>>>>
>>>
>>> No. Not all events require this check. Also, the result is different for different events.
>>
>> No. You always return NOTIFY_DONE, in case of !(netdev->features & NETIF_F_HW_TLS_TX).
>> See below:
>>
>> static int tls_check_dev_ops(struct net_device *dev)
>> {
>>     if (!dev->tlsdev_ops)
>>         return NOTIFY_BAD;
>>
>>     return NOTIFY_DONE;
>> }
>>
>> static int tls_device_down(struct net_device *netdev)
>> {
>>     struct tls_context *ctx, *tmp;
>>     struct list_head list;
>>     unsigned long flags;
>>
>>     ...
>>     return NOTIFY_DONE;
>> }
>>
>> static int tls_dev_event(struct notifier_block *this, unsigned long event,
>>                   void *ptr)
>> {
>>          struct net_device *dev = netdev_notifier_info_to_dev(ptr);
>>
>>     if (!(netdev->features & NETIF_F_HW_TLS_TX))
>>         return NOTIFY_DONE;
>>            switch (event) {
>>          case NETDEV_REGISTER:
>>          case NETDEV_FEAT_CHANGE:
>>              return tls_check_dev_ops(dev);
>>            case NETDEV_DOWN:
>>              return tls_device_down(dev);
>>          }
>>          return NOTIFY_DONE;
>> }
>>  
> 
> Sure, will fix in V3.
> 
>>>>> +
>>>>> +    /* Request a write lock to block new offload attempts
>>>>> +     */
>>>>> +    percpu_down_write(&device_offload_lock);
>>>>
>>>> What is the reason percpu_rwsem is chosen here? It looks like this primitive
>>>> gives more advantages readers, then plain rwsem does. But it also gives
>>>> disadvantages to writers. It would be good, unless tls_device_down() is called
>>>> with rtnl_lock() held from netdevice notifier. But since netdevice notifier
>>>> are called with rtnl_lock() held, percpu_rwsem will increase the time rtnl_lock()
>>>> is locked.
>>> We use the a rwsem to allow multiple (readers) invocations of tls_set_device_offload, which is triggered by the user (persumably) during the TLS handshake. This might be considered a fast-path.
>>>
>>> However, we must block all calls to tls_set_device_offload while we are processing NETDEV_DOWN events (writer).
>>>
>>> As you've mentioned, the percpu rwsem is more efficient for readers, especially on NUMA systems, where cache-line bouncing occurs during reader acquire and reduces performance.
>>
>> Hm, and who are the readers? It's used from do_tls_setsockopt_tx(), but it doesn't
>> seem to be performance critical. Who else?
>>
> 
> It depends on whether you consider the TLS handshake code as critical.
> The readers are TCP connections processing the CCS message of the TLS handshake. They are providing key material to the kernel to start using Kernel TLS.

The thing is rtnl_lock() is critical for the rest of the system,
while TLS handshake is small subset of actions the system makes.

rtnl_lock() is used just almost everywhere, from netlink messages
to netdev ioctls.

Currently, you even just can't close raw socket without rtnl lock.
So, all of this is big reason to avoid doing rcu waitings under it.

Kirill

>>>>
>>>> Can't we use plain rwsem here instead?
>>>>
>>>
>>> Its a performance tradeoff. I'm not certain that the percpu rwsem write side acquire is significantly worse than using the global rwsem.
>>>
>>> For now, while all of this is experimental, can we agree to focus on the performance of readers? We can change it later if it becomes a problem.
>>
>> Same as above.
>>   
> 
> Replaced with rwsem from V2.

^ permalink raw reply

* Re: [PATCH net-next 1/1] net/ipv4: disable SMC TCP option with SYN Cookies
From: Ursula Braun @ 2018-03-22 13:23 UTC (permalink / raw)
  To: Eric Dumazet, davem
  Cc: netdev, linux-s390, schwidefsky, heiko.carstens, raspl
In-Reply-To: <27701c4c-3759-6226-3307-fa03a9cc49b8@gmail.com>



On 03/20/2018 05:43 PM, Eric Dumazet wrote:
> 
> 
> On 03/20/2018 09:21 AM, Eric Dumazet wrote:
>>
>>
>> On 03/20/2018 08:53 AM, Ursula Braun wrote:
>>> From: Hans Wippel <hwippel@linux.vnet.ibm.com>
>>>
>>> Currently, the SMC experimental TCP option in a SYN packet is lost on
>>> the server side when SYN Cookies are active. However, the corresponding
>>> SYNACK sent back to the client contains the SMC option. This causes an
>>> inconsistent view of the SMC capabilities on the client and server.
>>>
>>> This patch disables the SMC option in the SYNACK when SYN Cookies are
>>> active to avoid this issue.
>>>
>>> Signed-off-by: Hans Wippel <hwippel@linux.vnet.ibm.com>
>>> Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
>>> ---
>>>  net/ipv4/tcp_output.c | 2 ++
>>>  1 file changed, 2 insertions(+)
>>>
>>> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
>>> index 383cac0ff0ec..22894514feae 100644
>>> --- a/net/ipv4/tcp_output.c
>>> +++ b/net/ipv4/tcp_output.c
>>> @@ -3199,6 +3199,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
>>>  		/* Under synflood, we do not attach skb to a socket,
>>>  		 * to avoid false sharing.
>>>  		 */
>>> +		if (IS_ENABLED(CONFIG_SMC))
>>> +			ireq->smc_ok = 0;
>>>  		break;
>>>  	case TCP_SYNACK_FASTOPEN:
>>>  		/* sk is a const pointer, because we want to express multiple
>>>
>>
>> I disagree with net-next qualification.
>>
>> This fixes a bug, so please send it for net tree, and including an appropriate Fixes: tag.
>>

Okay, I will send it for the net tree.

> 
> Also, please do not add the fix in tcp_make_synack()
> 
> tcp_make_synack() builds an skb, and really should not modify ireq, ideally.
> The only reason ireq is not const is because of the skb_set_owner_w().
> 
> I would clear it in cookie_v4_check()/cookie_v6_check()
> 
> (We could have a common helper to allocate a TCP ireq btw, but this will wait a future patch for net-next)
>

We moved the clear to cookie_v4_check()/cookie_v6_check. However, this does not seem to
be sufficient to prevent the SYNACK from containing the SMC experimental option.
We found that an additional check in tcp_conn_request() helps:

--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6248,6 +6248,9 @@ int tcp_conn_request(struct request_sock
 	if (want_cookie && !tmp_opt.saw_tstamp)
 		tcp_clear_options(&tmp_opt);
 
+	if (IS_ENABLED(CONFIG_SMC) && want_cookie && tmp_opt.smc_ok)
+		tmp_opt.smc_ok = 0;
+
 	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
 	tcp_openreq_init(req, &tmp_opt, skb, sk);
 	inet_rsk(req)->no_srccheck = inet_sk(sk)->transparent;

Do you think this could be the right place for clearing the smc_ok bit?

  

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox