Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next 6/6] net: dsa: clear VLAN flags for CPU port
From: Vivien Didelot @ 2019-08-22 20:13 UTC (permalink / raw)
  To: netdev; +Cc: davem, f.fainelli, andrew, olteanv, Vivien Didelot
In-Reply-To: <20190822201323.1292-1-vivien.didelot@gmail.com>

When the bridge offloads a VLAN on a slave port, we also need to
program its dedicated CPU port as a member of the VLAN.

Drivers may handle the CPU port's membership as they want. For example,
Marvell as a special "Unmodified" mode to pass frames as is through
such ports.

Even though DSA expects the drivers to handle the CPU port membership,
they are unlikely to program such VLANs untagged, and certainly not as
PVID. This patch clears the VLAN flags before programming the CPU port.

Signed-off-by: Vivien Didelot <vivien.didelot@gmail.com>
Suggested-by: Vladimir Oltean <olteanv@gmail.com>
---
 net/dsa/slave.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 8267c156a51a..48df48f76c67 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -332,6 +332,12 @@ static int dsa_slave_vlan_add(struct net_device *dev,
 	if (err)
 		return err;
 
+	/* We need the dedicated CPU port to be a member of the VLAN as well.
+	 * Even though drivers often handle CPU membership in special ways,
+	 * CPU ports are likely to be tagged, so clear the VLAN flags.
+	 */
+	vlan.flags = 0;
+
 	err = dsa_port_vlan_add(dp->cpu_dp, &vlan, trans);
 	if (err)
 		return err;
-- 
2.23.0


^ permalink raw reply related

* [PATCH net-next 4/6] net: dsa: check bridge VLAN in slave operations
From: Vivien Didelot @ 2019-08-22 20:13 UTC (permalink / raw)
  To: netdev; +Cc: davem, f.fainelli, andrew, olteanv, Vivien Didelot
In-Reply-To: <20190822201323.1292-1-vivien.didelot@gmail.com>

The bridge VLANs are not offloaded by dsa_port_vlan_* if the port is
not bridged or if its bridge is not VLAN aware.

This is a good thing but other corners of DSA, such as the tag_8021q
driver, may need to program VLANs regardless the bridge state.

And also because bridge_dev is specific to user ports anyway, move
these checks were it belongs, one layer up in the slave code.

Signed-off-by: Vivien Didelot <vivien.didelot@gmail.com>
Suggested-by: Vladimir Oltean <olteanv@gmail.com>
---
 net/dsa/port.c  | 10 ++--------
 net/dsa/slave.c | 12 ++++++++++++
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/net/dsa/port.c b/net/dsa/port.c
index ef28df7ecbde..9b54e5a76297 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -348,10 +348,7 @@ int dsa_port_vlan_add(struct dsa_port *dp,
 		.vlan = vlan,
 	};
 
-	if (!dp->bridge_dev || br_vlan_enabled(dp->bridge_dev))
-		return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_ADD, &info);
-
-	return 0;
+	return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_ADD, &info);
 }
 
 int dsa_port_vlan_del(struct dsa_port *dp,
@@ -363,10 +360,7 @@ int dsa_port_vlan_del(struct dsa_port *dp,
 		.vlan = vlan,
 	};
 
-	if (!dp->bridge_dev || br_vlan_enabled(dp->bridge_dev))
-		return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info);
-
-	return 0;
+	return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info);
 }
 
 int dsa_port_vid_add(struct dsa_port *dp, u16 vid, u16 flags)
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 8f5126c41282..82e48d247b81 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -323,6 +323,9 @@ static int dsa_slave_vlan_add(struct net_device *dev,
 	if (obj->orig_dev != dev)
 		return -EOPNOTSUPP;
 
+	if (dp->bridge_dev && !br_vlan_enabled(dp->bridge_dev))
+		return 0;
+
 	vlan = *SWITCHDEV_OBJ_PORT_VLAN(obj);
 
 	err = dsa_port_vlan_add(dp, &vlan, trans);
@@ -377,6 +380,9 @@ static int dsa_slave_vlan_del(struct net_device *dev,
 	if (obj->orig_dev != dev)
 		return -EOPNOTSUPP;
 
+	if (dp->bridge_dev && !br_vlan_enabled(dp->bridge_dev))
+		return 0;
+
 	return dsa_port_vlan_del(dp, SWITCHDEV_OBJ_PORT_VLAN(obj));
 }
 
@@ -1099,6 +1105,9 @@ static int dsa_slave_vlan_rx_add_vid(struct net_device *dev, __be16 proto,
 	 * need to emulate the switchdev prepare + commit phase.
 	 */
 	if (dp->bridge_dev) {
+		if (!br_vlan_enabled(dp->bridge_dev))
+			return 0;
+
 		/* br_vlan_get_info() returns -EINVAL or -ENOENT if the
 		 * device, respectively the VID is not found, returning
 		 * 0 means success, which is a failure for us here.
@@ -1126,6 +1135,9 @@ static int dsa_slave_vlan_rx_kill_vid(struct net_device *dev, __be16 proto,
 	 * need to emulate the switchdev prepare + commit phase.
 	 */
 	if (dp->bridge_dev) {
+		if (!br_vlan_enabled(dp->bridge_dev))
+			return 0;
+
 		/* br_vlan_get_info() returns -EINVAL or -ENOENT if the
 		 * device, respectively the VID is not found, returning
 		 * 0 means success, which is a failure for us here.
-- 
2.23.0


^ permalink raw reply related

* [PATCH net-next 3/6] net: dsa: add slave VLAN helpers
From: Vivien Didelot @ 2019-08-22 20:13 UTC (permalink / raw)
  To: netdev; +Cc: davem, f.fainelli, andrew, olteanv, Vivien Didelot
In-Reply-To: <20190822201323.1292-1-vivien.didelot@gmail.com>

Add dsa_slave_vlan_add and dsa_slave_vlan_del helpers to handle
SWITCHDEV_OBJ_ID_PORT_VLAN switchdev objects. Also copy the
switchdev_obj_port_vlan structure on add since we will modify it in
future patches.

Signed-off-by: Vivien Didelot <vivien.didelot@gmail.com>
---
 net/dsa/slave.c | 40 +++++++++++++++++++++++++++++++++-------
 1 file changed, 33 insertions(+), 7 deletions(-)

diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 9d61d9dbf001..8f5126c41282 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -312,6 +312,26 @@ static int dsa_slave_port_attr_set(struct net_device *dev,
 	return ret;
 }
 
+static int dsa_slave_vlan_add(struct net_device *dev,
+			      const struct switchdev_obj *obj,
+			      struct switchdev_trans *trans)
+{
+	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct switchdev_obj_port_vlan vlan;
+	int err;
+
+	if (obj->orig_dev != dev)
+		return -EOPNOTSUPP;
+
+	vlan = *SWITCHDEV_OBJ_PORT_VLAN(obj);
+
+	err = dsa_port_vlan_add(dp, &vlan, trans);
+	if (err)
+		return err;
+
+	return 0;
+}
+
 static int dsa_slave_port_obj_add(struct net_device *dev,
 				  const struct switchdev_obj *obj,
 				  struct switchdev_trans *trans,
@@ -339,10 +359,7 @@ static int dsa_slave_port_obj_add(struct net_device *dev,
 				       trans);
 		break;
 	case SWITCHDEV_OBJ_ID_PORT_VLAN:
-		if (obj->orig_dev != dev)
-			return -EOPNOTSUPP;
-		err = dsa_port_vlan_add(dp, SWITCHDEV_OBJ_PORT_VLAN(obj),
-					trans);
+		err = dsa_slave_vlan_add(dev, obj, trans);
 		break;
 	default:
 		err = -EOPNOTSUPP;
@@ -352,6 +369,17 @@ static int dsa_slave_port_obj_add(struct net_device *dev,
 	return err;
 }
 
+static int dsa_slave_vlan_del(struct net_device *dev,
+			      const struct switchdev_obj *obj)
+{
+	struct dsa_port *dp = dsa_slave_to_port(dev);
+
+	if (obj->orig_dev != dev)
+		return -EOPNOTSUPP;
+
+	return dsa_port_vlan_del(dp, SWITCHDEV_OBJ_PORT_VLAN(obj));
+}
+
 static int dsa_slave_port_obj_del(struct net_device *dev,
 				  const struct switchdev_obj *obj)
 {
@@ -371,9 +399,7 @@ static int dsa_slave_port_obj_del(struct net_device *dev,
 		err = dsa_port_mdb_del(dp->cpu_dp, SWITCHDEV_OBJ_PORT_MDB(obj));
 		break;
 	case SWITCHDEV_OBJ_ID_PORT_VLAN:
-		if (obj->orig_dev != dev)
-			return -EOPNOTSUPP;
-		err = dsa_port_vlan_del(dp, SWITCHDEV_OBJ_PORT_VLAN(obj));
+		err = dsa_slave_vlan_del(dev, obj);
 		break;
 	default:
 		err = -EOPNOTSUPP;
-- 
2.23.0


^ permalink raw reply related

* [PATCH net-next 2/6] net: dsa: do not skip -EOPNOTSUPP in dsa_port_vid_add
From: Vivien Didelot @ 2019-08-22 20:13 UTC (permalink / raw)
  To: netdev; +Cc: davem, f.fainelli, andrew, olteanv, Vivien Didelot
In-Reply-To: <20190822201323.1292-1-vivien.didelot@gmail.com>

Currently dsa_port_vid_add returns 0 if the switch returns -EOPNOTSUPP.

This function is used in the tag_8021q.c code to offload the PVID of
ports, which would simply not work if .port_vlan_add is not supported
by the underlying switch.

Do not skip -EOPNOTSUPP in dsa_port_vid_add but only when necessary,
that is to say in dsa_slave_vlan_rx_add_vid.

Signed-off-by: Vivien Didelot <vivien.didelot@gmail.com>
---
 net/dsa/port.c  | 4 ++--
 net/dsa/slave.c | 7 +++++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/net/dsa/port.c b/net/dsa/port.c
index f75301456430..ef28df7ecbde 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -382,8 +382,8 @@ int dsa_port_vid_add(struct dsa_port *dp, u16 vid, u16 flags)
 
 	trans.ph_prepare = true;
 	err = dsa_port_vlan_add(dp, &vlan, &trans);
-	if (err == -EOPNOTSUPP)
-		return 0;
+	if (err)
+		return err;
 
 	trans.ph_prepare = false;
 	return dsa_port_vlan_add(dp, &vlan, &trans);
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 33f41178afcc..9d61d9dbf001 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -1082,8 +1082,11 @@ static int dsa_slave_vlan_rx_add_vid(struct net_device *dev, __be16 proto,
 			return -EBUSY;
 	}
 
-	/* This API only allows programming tagged, non-PVID VIDs */
-	return dsa_port_vid_add(dp, vid, 0);
+	ret = dsa_port_vid_add(dp, vid, 0);
+	if (ret && ret != -EOPNOTSUPP)
+		return ret;
+
+	return 0;
 }
 
 static int dsa_slave_vlan_rx_kill_vid(struct net_device *dev, __be16 proto,
-- 
2.23.0


^ permalink raw reply related

* [PATCH net-next 1/6] net: dsa: remove bitmap operations
From: Vivien Didelot @ 2019-08-22 20:13 UTC (permalink / raw)
  To: netdev; +Cc: davem, f.fainelli, andrew, olteanv, Vivien Didelot
In-Reply-To: <20190822201323.1292-1-vivien.didelot@gmail.com>

The bitmap operations were introduced to simplify the switch drivers
in the future, since most of them could implement the common VLAN and
MDB operations (add, del, dump) with simple functions taking all target
ports at once, and thus limiting the number of hardware accesses.

Programming an MDB or VLAN this way in a single operation would clearly
simplify the drivers a lot but would require a new get-set interface
in DSA. The usage of such bitmap from the stack also raised concerned
in the past, leading to the dynamic allocation of a new ds->_bitmap
member in the dsa_switch structure. So let's get rid of them for now.

This commit nicely wraps the ds->ops->port_{mdb,vlan}_{prepare,add}
switch operations into new dsa_switch_{mdb,vlan}_{prepare,add}
variants not using any bitmap argument anymore.

New dsa_switch_{mdb,vlan}_match helpers have been introduced to make
clear which local port of a switch must be programmed with the target
object. While the targeted user port is an obvious candidate, the
DSA links must also be programmed, as well as the CPU port for VLANs.

While at it, also remove local variables that are only used once.

Signed-off-by: Vivien Didelot <vivien.didelot@gmail.com>
---
 include/net/dsa.h |   3 --
 net/dsa/dsa2.c    |  14 -----
 net/dsa/switch.c  | 132 +++++++++++++++++++++-------------------------
 3 files changed, 59 insertions(+), 90 deletions(-)

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 147b757ef8ea..96acb14ec1a8 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -275,9 +275,6 @@ struct dsa_switch {
 	 */
 	bool			vlan_filtering;
 
-	unsigned long		*bitmap;
-	unsigned long		_bitmap;
-
 	/* Dynamically allocated ports, keep last */
 	size_t num_ports;
 	struct dsa_port ports[];
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 8c4eccb0cfe6..f8445fa73448 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -834,20 +834,6 @@ struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n)
 	if (!ds)
 		return NULL;
 
-	/* We avoid allocating memory outside dsa_switch
-	 * if it is not needed.
-	 */
-	if (n <= sizeof(ds->_bitmap) * 8) {
-		ds->bitmap = &ds->_bitmap;
-	} else {
-		ds->bitmap = devm_kcalloc(dev,
-					  BITS_TO_LONGS(n),
-					  sizeof(unsigned long),
-					  GFP_KERNEL);
-		if (unlikely(!ds->bitmap))
-			return NULL;
-	}
-
 	ds->dev = dev;
 	ds->num_ports = n;
 
diff --git a/net/dsa/switch.c b/net/dsa/switch.c
index 09d9286b27cc..489eb7b430a4 100644
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -128,57 +128,51 @@ static int dsa_switch_fdb_del(struct dsa_switch *ds,
 	return ds->ops->port_fdb_del(ds, port, info->addr, info->vid);
 }
 
-static int
-dsa_switch_mdb_prepare_bitmap(struct dsa_switch *ds,
-			      const struct switchdev_obj_port_mdb *mdb,
-			      const unsigned long *bitmap)
+static bool dsa_switch_mdb_match(struct dsa_switch *ds, int port,
+				 struct dsa_notifier_mdb_info *info)
+{
+	if (ds->index == info->sw_index && port == info->port)
+		return true;
+
+	if (dsa_is_dsa_port(ds, port))
+		return true;
+
+	return false;
+}
+
+static int dsa_switch_mdb_prepare(struct dsa_switch *ds,
+				  struct dsa_notifier_mdb_info *info)
 {
 	int port, err;
 
 	if (!ds->ops->port_mdb_prepare || !ds->ops->port_mdb_add)
 		return -EOPNOTSUPP;
 
-	for_each_set_bit(port, bitmap, ds->num_ports) {
-		err = ds->ops->port_mdb_prepare(ds, port, mdb);
-		if (err)
-			return err;
+	for (port = 0; port < ds->num_ports; port++) {
+		if (dsa_switch_mdb_match(ds, port, info)) {
+			err = ds->ops->port_mdb_prepare(ds, port, info->mdb);
+			if (err)
+				return err;
+		}
 	}
 
 	return 0;
 }
 
-static void dsa_switch_mdb_add_bitmap(struct dsa_switch *ds,
-				      const struct switchdev_obj_port_mdb *mdb,
-				      const unsigned long *bitmap)
-{
-	int port;
-
-	if (!ds->ops->port_mdb_add)
-		return;
-
-	for_each_set_bit(port, bitmap, ds->num_ports)
-		ds->ops->port_mdb_add(ds, port, mdb);
-}
-
 static int dsa_switch_mdb_add(struct dsa_switch *ds,
 			      struct dsa_notifier_mdb_info *info)
 {
-	const struct switchdev_obj_port_mdb *mdb = info->mdb;
-	struct switchdev_trans *trans = info->trans;
 	int port;
 
-	/* Build a mask of Multicast group members */
-	bitmap_zero(ds->bitmap, ds->num_ports);
-	if (ds->index == info->sw_index)
-		set_bit(info->port, ds->bitmap);
-	for (port = 0; port < ds->num_ports; port++)
-		if (dsa_is_dsa_port(ds, port))
-			set_bit(port, ds->bitmap);
+	if (switchdev_trans_ph_prepare(info->trans))
+		return dsa_switch_mdb_prepare(ds, info);
 
-	if (switchdev_trans_ph_prepare(trans))
-		return dsa_switch_mdb_prepare_bitmap(ds, mdb, ds->bitmap);
+	if (!ds->ops->port_mdb_add)
+		return 0;
 
-	dsa_switch_mdb_add_bitmap(ds, mdb, ds->bitmap);
+	for (port = 0; port < ds->num_ports; port++)
+		if (dsa_switch_mdb_match(ds, port, info))
+			ds->ops->port_mdb_add(ds, port, info->mdb);
 
 	return 0;
 }
@@ -186,13 +180,11 @@ static int dsa_switch_mdb_add(struct dsa_switch *ds,
 static int dsa_switch_mdb_del(struct dsa_switch *ds,
 			      struct dsa_notifier_mdb_info *info)
 {
-	const struct switchdev_obj_port_mdb *mdb = info->mdb;
-
 	if (!ds->ops->port_mdb_del)
 		return -EOPNOTSUPP;
 
 	if (ds->index == info->sw_index)
-		return ds->ops->port_mdb_del(ds, info->port, mdb);
+		return ds->ops->port_mdb_del(ds, info->port, info->mdb);
 
 	return 0;
 }
@@ -234,59 +226,55 @@ static int dsa_port_vlan_check(struct dsa_switch *ds, int port,
 			     (void *)vlan);
 }
 
-static int
-dsa_switch_vlan_prepare_bitmap(struct dsa_switch *ds,
-			       const struct switchdev_obj_port_vlan *vlan,
-			       const unsigned long *bitmap)
+static bool dsa_switch_vlan_match(struct dsa_switch *ds, int port,
+				  struct dsa_notifier_vlan_info *info)
+{
+	if (ds->index == info->sw_index && port == info->port)
+		return true;
+
+	if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))
+		return true;
+
+	return false;
+}
+
+static int dsa_switch_vlan_prepare(struct dsa_switch *ds,
+				   struct dsa_notifier_vlan_info *info)
 {
 	int port, err;
 
 	if (!ds->ops->port_vlan_prepare || !ds->ops->port_vlan_add)
 		return -EOPNOTSUPP;
 
-	for_each_set_bit(port, bitmap, ds->num_ports) {
-		err = dsa_port_vlan_check(ds, port, vlan);
-		if (err)
-			return err;
+	for (port = 0; port < ds->num_ports; port++) {
+		if (dsa_switch_vlan_match(ds, port, info)) {
+			err = dsa_port_vlan_check(ds, port, info->vlan);
+			if (err)
+				return err;
 
-		err = ds->ops->port_vlan_prepare(ds, port, vlan);
-		if (err)
-			return err;
+			err = ds->ops->port_vlan_prepare(ds, port, info->vlan);
+			if (err)
+				return err;
+		}
 	}
 
 	return 0;
 }
 
-static void
-dsa_switch_vlan_add_bitmap(struct dsa_switch *ds,
-			   const struct switchdev_obj_port_vlan *vlan,
-			   const unsigned long *bitmap)
-{
-	int port;
-
-	for_each_set_bit(port, bitmap, ds->num_ports)
-		ds->ops->port_vlan_add(ds, port, vlan);
-}
-
 static int dsa_switch_vlan_add(struct dsa_switch *ds,
 			       struct dsa_notifier_vlan_info *info)
 {
-	const struct switchdev_obj_port_vlan *vlan = info->vlan;
-	struct switchdev_trans *trans = info->trans;
 	int port;
 
-	/* Build a mask of VLAN members */
-	bitmap_zero(ds->bitmap, ds->num_ports);
-	if (ds->index == info->sw_index)
-		set_bit(info->port, ds->bitmap);
-	for (port = 0; port < ds->num_ports; port++)
-		if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))
-			set_bit(port, ds->bitmap);
+	if (switchdev_trans_ph_prepare(info->trans))
+		return dsa_switch_vlan_prepare(ds, info);
 
-	if (switchdev_trans_ph_prepare(trans))
-		return dsa_switch_vlan_prepare_bitmap(ds, vlan, ds->bitmap);
+	if (!ds->ops->port_vlan_add)
+		return 0;
 
-	dsa_switch_vlan_add_bitmap(ds, vlan, ds->bitmap);
+	for (port = 0; port < ds->num_ports; port++)
+		if (dsa_switch_vlan_match(ds, port, info))
+			ds->ops->port_vlan_add(ds, port, info->vlan);
 
 	return 0;
 }
@@ -294,13 +282,11 @@ static int dsa_switch_vlan_add(struct dsa_switch *ds,
 static int dsa_switch_vlan_del(struct dsa_switch *ds,
 			       struct dsa_notifier_vlan_info *info)
 {
-	const struct switchdev_obj_port_vlan *vlan = info->vlan;
-
 	if (!ds->ops->port_vlan_del)
 		return -EOPNOTSUPP;
 
 	if (ds->index == info->sw_index)
-		return ds->ops->port_vlan_del(ds, info->port, vlan);
+		return ds->ops->port_vlan_del(ds, info->port, info->vlan);
 
 	return 0;
 }
-- 
2.23.0


^ permalink raw reply related

* [PATCH net-next 0/6] net: dsa: explicit programmation of VLAN on CPU ports
From: Vivien Didelot @ 2019-08-22 20:13 UTC (permalink / raw)
  To: netdev; +Cc: davem, f.fainelli, andrew, olteanv, Vivien Didelot

When a VLAN is programmed on a user port, every switch of the fabric also
program the CPU ports and the DSA links as part of the VLAN. To do that,
DSA makes use of bitmaps to prepare all members of a VLAN.

While this is expected for DSA links which are used as conduit between
interconnected switches, only the dedicated CPU port of the slave must be
programmed, not all CPU ports of the fabric. This may also cause problems in
other corners of DSA such as the tag_8021q.c driver, which needs to program
its ports manually, CPU port included.

We need the dsa_port_vlan_{add,del} functions and its dsa_port_vid_{add,del}
variants to simply trigger the VLAN programmation without any logic in them,
but they may currently skip the operation based on the bridge device state.

This patchset gets rid of the bitmap operations, and moves the bridge device
check as well as the explicit programmation of CPU ports where they belong,
in the slave code.

While at it, clear the VLAN flags before programming a CPU port, as it
doesn't make sense to forward the PVID flag for example for such ports.

Vivien Didelot (6):
  net: dsa: remove bitmap operations
  net: dsa: do not skip -EOPNOTSUPP in dsa_port_vid_add
  net: dsa: add slave VLAN helpers
  net: dsa: check bridge VLAN in slave operations
  net: dsa: program VLAN on CPU port from slave
  net: dsa: clear VLAN flags for CPU port

 include/net/dsa.h |   3 --
 net/dsa/dsa2.c    |  14 -----
 net/dsa/port.c    |  14 ++---
 net/dsa/slave.c   |  79 +++++++++++++++++++++++----
 net/dsa/switch.c  | 135 +++++++++++++++++++++-------------------------
 5 files changed, 136 insertions(+), 109 deletions(-)

-- 
2.23.0


^ permalink raw reply

* Re: New skb extension for use by LSMs (skb "security blob")?
From: Casey Schaufler @ 2019-08-22 20:10 UTC (permalink / raw)
  To: Paul Moore, Florian Westphal
  Cc: netdev, linux-security-module, selinux, casey
In-Reply-To: <CAHC9VhQ_+3ywPu0QRzP3cSgPH2i9Br994wJttp-yXy2GA4FrNg@mail.gmail.com>

On 8/22/2019 9:32 AM, Paul Moore wrote:
> On Thu, Aug 22, 2019 at 3:03 AM Florian Westphal <fw@strlen.de> wrote:
>> Paul Moore <paul@paul-moore.com> wrote:
>>> Hello netdev,
>>>
>>> I was just made aware of the skb extension work, and it looks very
>>> appealing from a LSM perspective.  As some of you probably remember,
>>> we (the LSM folks) have wanted a proper security blob in the skb for
>>> quite some time, but netdev has been resistant to this idea thus far.
>> Is that "blob" in addition to skb->secmark, or a replacement?
> That's a good question.  While I thought about that, I wasn't sure if
> that was worth bringing up as previous attempts to trade the secmark
> field for a void pointer met with failure.  Last time I played with it
> I was able to take the additional 32-bits from holes in the skb, and
> possibly even improve some of the cacheline groupings (but that is
> always going to be a dependent on use case I think), but that wasn't
> enough.
>
> I think we could consider freeing up the secmark in the main skb, and
> move it to a skb extension, but this would potentially increase the
> chances that we would need to add an extension to a skb.  I don't have
> any hard numbers, but based on discussions and questions I suspect
> Secmark is more widely used than NetLabel and/or labeled IPsec;
> although I'm confident it is still a minor percentage of the overall
> Linux installed base.

Smack uses both extensively. As far as Smack is concerned giving up
the secmark for a blob would be just fine.

I am also working on security module stacking, and a blob in the
skb would dramatically improve the options for making that work
rationally.

> For me the big question is what would it take for us to get a security
> blob associated with the skb?  Would moving the secmark into the skb
> extension be enough?  Something else?  Or is this simply never going
> to happen?  I want to remain optimistic, but I've been trying for this
> off-and-on for over a decade and keep running into a brick wall ;)

Given that the original objection to using a skb extension for a
security blob was that an extension is dynamic, and that the ubiquitous
nature of LSM use makes that unreasonable, it would seem that supporting
the security blob as a basic part if the skb would be the obvious and
correct solution. If the normal case is that there is an LSM that would
befit from the native (unextended) support of a blob, it would seem
that that is the case that should be optimized.



^ permalink raw reply

* Re: [PATCH 1/3] net: Add HW_BRIDGE offload feature
From: Andrew Lunn @ 2019-08-22 20:08 UTC (permalink / raw)
  To: Horatiu Vultur
  Cc: roopa, nikolay, davem, UNGLinuxDriver, alexandre.belloni,
	allan.nielsen, netdev, linux-kernel, bridge
In-Reply-To: <1566500850-6247-2-git-send-email-horatiu.vultur@microchip.com>

> +/* Determin if the SW bridge can be offloaded to HW. Return true if all
> + * the interfaces of the bridge have the feature NETIF_F_HW_SWITCHDEV set
> + * and have the same netdev_ops.
> + */

Hi Horatiu

Why do you need these restrictions. The HW bridge should be able to
learn that a destination MAC address can be reached via the SW
bridge. The software bridge can then forward it out the correct
interface.

Or are you saying your hardware cannot learn from frames which come
from the CPU?

	Andrew

^ permalink raw reply

* net/dst_cache.c: preemption bug in net/dst_cache.c
From: Bharath Vedartham @ 2019-08-22 19:51 UTC (permalink / raw)
  To: davem, gregkh, allison; +Cc: tglx, netdev, linux-kernel

Hi all,

I just want to bring attention to the syzbot bug [1]

Even though syzbot claims the bug to be in net/tipc, I feel it is in
net/dst_cache.c. Please correct me if I am wrong.

This bug is being triggered a lot of times by syzbot since the day it
was reported. Also given that this is core networking code, I felt it
was important to bring this to attention.

It looks like preemption needs to be disabled before using this_cpu_ptr
or maybe we would be better of using a get_cpu_var and put_cpu_var combo
here.

[1] https://syzkaller.appspot.com/bug?id=dc6352b92862eb79373fe03fdf9af5928753e057

Thank you
Bharath

^ permalink raw reply

* Re: [PATCH net-next v2 2/3] net: ethernet: mediatek: Re-add support SGMII
From: René van Dorst @ 2019-08-22 19:50 UTC (permalink / raw)
  To: Russell King - ARM Linux admin
  Cc: John Crispin, Sean Wang, Nelson Chang, David S . Miller,
	Matthias Brugger, Frank Wunderlich, netdev, linux-mips,
	linux-mediatek, Stefan Roese, linux-arm-kernel
In-Reply-To: <20190822144433.GT13294@shell.armlinux.org.uk>

Hi Russell,

Quoting Russell King - ARM Linux admin <linux@armlinux.org.uk>:

> On Wed, Aug 21, 2019 at 04:43:35PM +0200, René van Dorst wrote:
>> +	if (MTK_HAS_CAPS(mac->hw->soc->caps, MTK_SGMII)) {
>> +		if (state->interface != PHY_INTERFACE_MODE_2500BASEX) {
>>  			phylink_set(mask, 1000baseT_Full);
>>  			phylink_set(mask, 1000baseX_Full);
>> +		} else {
>> +			phylink_set(mask, 2500baseT_Full);
>> +			phylink_set(mask, 2500baseX_Full);
>> +		}
>
> If you can dynamically switch between 1000BASE-X and 2500BASE-X, then
> you need to have both set.  See mvneta.c:
>
>         if (pp->comphy || state->interface != PHY_INTERFACE_MODE_2500BASEX) {
>                 phylink_set(mask, 1000baseT_Full);
>                 phylink_set(mask, 1000baseX_Full);
>         }
>         if (pp->comphy || state->interface == PHY_INTERFACE_MODE_2500BASEX) {
>                 phylink_set(mask, 2500baseT_Full);
>                 phylink_set(mask, 2500baseX_Full);
>         }
>
> What this is saying is, if we have a comphy (which is the serdes lane
> facing component, where the data rate is setup) then we can support
> both speeds (and so mask ends up with all four bits set.)  Otherwise,
> we only support a single-speed (1000Gbps for non-2500BASE-X etc.)
>
>> +	} else {
>> +		if (state->interface == PHY_INTERFACE_MODE_TRGMII) {
>> +			phylink_set(mask, 1000baseT_Full);
>> +		} else {
>> +			phylink_set(mask, 10baseT_Half);
>> +			phylink_set(mask, 10baseT_Full);
>> +			phylink_set(mask, 100baseT_Half);
>> +			phylink_set(mask, 100baseT_Full);
>> +
>> +			if (state->interface != PHY_INTERFACE_MODE_MII) {
>> +				phylink_set(mask, 1000baseT_Half);
>> +				phylink_set(mask, 1000baseT_Full);
>> +				phylink_set(mask, 1000baseX_Full);
>> +			}
>
> I'm also wondering about the "MTK_HAS_CAPS(mac->hw->soc->caps,
> MTK_SGMII)" above.

This totally wrong.
MTK_HAS_CAPS(mac->hw->soc->caps, MTK_SGMII) tells me that the SOC has SGMII
lane(s). Having a SGMII block doesn't mean that other functions aren't
supported. I have to redo this!

> (Here comes a reason why using SGMII to cover all single-lane serdes
> modes causes confusion - unfortunately, some folk use SGMII to describe
> all these modes.  So, I'm going to use the terminology "Cisco SGMII"
> to mean exactly the SGMII format published by Cisco, "802.3 1000BASE-X"
> to mean the original IEEE 802.3 format running at 1.25Gbps, and
> "up-clocked 2500BASE-X" to mean the 3.125Gbps version of the 802.3
> 1000BASE-X protocol.)

Thanks for the explanation. In your previous review v1 you also explained it.
I did change the forced modes for x-BaseX modes and auto negotiation for Cisco
SGMII. But I seems to miss the link that I also have to improve this  
validation
part.

>
> Isn't this set for Cisco SGMII as well as for 802.3 1000BASE-X and
> the up-clocked 2500BASE-X modes?
>
> If so, is there a reason why 10Mbps and 100Mbps speeds aren't
> supported on Cisco SGMII links?

I can only tell a bit about the mt7622 SOC, datasheet tells me that:

The SGMII is the interface between 10/100/1000/2500 Mbps PHY and Ethernet MAC,
the spec is raised by Cisco in 1999, which is aims for pin reduction compare
with the GMII. It uses 2 differential data pair for TX and RX with clock
embedded bit stream to convey frame data and port ability information.
The core leverages the 1000Base-X PCS and Auto-Negotiation from IEEE 802.3
specification (clause 36/37). This IP can support up to 3.125G baud  
for 2.5Gbps
(proprietary 2500Base-X) data rate of MAC by overclocking.

Also features tells me: Support 10/100/1000/2500 Mbps in full duplex mode and
10/100 Mbps in half duplex mode.

I going make a new version.

Greats,

René

> --
> RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
> FTTC broadband for 0.8mile line in suburbia: sync at 12.1Mbps down 622kbps up
> According to speedtest.net: 11.9Mbps down 500kbps up




^ permalink raw reply

* Re: [PATCH net-next,v4, 4/6] net/mlx5: Add HV VHCA infrastructure
From: Eran Ben Elisha @ 2019-08-22 19:33 UTC (permalink / raw)
  To: Leon Romanovsky, haiyangz
  Cc: sashal@kernel.org, davem@davemloft.net, Saeed Mahameed,
	lorenzo.pieralisi@arm.com, bhelgaas@google.com,
	linux-pci@vger.kernel.org, linux-hyperv@vger.kernel.org,
	netdev@vger.kernel.org, kys, Stephen Hemminger,
	linux-kernel@vger.kernel.org
In-Reply-To: <20190822185847.GP29433@mtr-leonro.mtl.com>



On 8/22/2019 9:58 PM, Leon Romanovsky wrote:
> On Thu, Aug 22, 2019 at 05:05:51AM +0000, Haiyang Zhang wrote:
>> From: Eran Ben Elisha <eranbe@mellanox.com>
>>
>> HV VHCA is a layer which provides PF to VF communication channel based on
>> HyperV PCI config channel. It implements Mellanox's Inter VHCA control
>> communication protocol. The protocol contains control block in order to
>> pass messages between the PF and VF drivers, and data blocks in order to
>> pass actual data.
>>
>> The infrastructure is agent based. Each agent will be responsible of
>> contiguous buffer blocks in the VHCA config space. This infrastructure will
>> bind agents to their blocks, and those agents can only access read/write
>> the buffer blocks assigned to them. Each agent will provide three
>> callbacks (control, invalidate, cleanup). Control will be invoked when
>> block-0 is invalidated with a command that concerns this agent. Invalidate
>> callback will be invoked if one of the blocks assigned to this agent was
>> invalidated. Cleanup will be invoked before the agent is being freed in
>> order to clean all of its open resources or deferred works.
>>
>> Block-0 serves as the control block. All execution commands from the PF
>> will be written by the PF over this block. VF will ack on those by
>> writing on block-0 as well. Its format is described by struct
>> mlx5_hv_vhca_control_block layout.
>>
>> Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
>> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
>> Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
>> ---
>>   drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   2 +-
>>   .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c  | 253 +++++++++++++++++++++
>>   .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h  | 102 +++++++++
>>   drivers/net/ethernet/mellanox/mlx5/core/main.c     |   7 +
>>   include/linux/mlx5/driver.h                        |   2 +
>>   5 files changed, 365 insertions(+), 1 deletion(-)
>>   create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
>>   create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h
>>
>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
>> index fd32a5b..8d443fc 100644
>> --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
>> @@ -45,7 +45,7 @@ mlx5_core-$(CONFIG_MLX5_ESWITCH)   += eswitch.o eswitch_offloads.o eswitch_offlo
>>   mlx5_core-$(CONFIG_MLX5_MPFS)      += lib/mpfs.o
>>   mlx5_core-$(CONFIG_VXLAN)          += lib/vxlan.o
>>   mlx5_core-$(CONFIG_PTP_1588_CLOCK) += lib/clock.o
>> -mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += lib/hv.o
>> +mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += lib/hv.o lib/hv_vhca.o
>>
>>   #
>>   # Ipoib netdev
>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
>> new file mode 100644
>> index 0000000..84d1d75
>> --- /dev/null
>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
>> @@ -0,0 +1,253 @@
>> +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
>> +// Copyright (c) 2018 Mellanox Technologies
>> +
>> +#include <linux/hyperv.h>
>> +#include "mlx5_core.h"
>> +#include "lib/hv.h"
>> +#include "lib/hv_vhca.h"
>> +
>> +struct mlx5_hv_vhca {
>> +	struct mlx5_core_dev       *dev;
>> +	struct workqueue_struct    *work_queue;
>> +	struct mlx5_hv_vhca_agent  *agents[MLX5_HV_VHCA_AGENT_MAX];
>> +	struct mutex                agents_lock; /* Protect agents array */
>> +};
>> +
>> +struct mlx5_hv_vhca_work {
>> +	struct work_struct     invalidate_work;
>> +	struct mlx5_hv_vhca   *hv_vhca;
>> +	u64                    block_mask;
>> +};
>> +
>> +struct mlx5_hv_vhca_data_block {
>> +	u16     sequence;
>> +	u16     offset;
>> +	u8      reserved[4];
>> +	u64     data[15];
>> +};
>> +
>> +struct mlx5_hv_vhca_agent {
>> +	enum mlx5_hv_vhca_agent_type	 type;
>> +	struct mlx5_hv_vhca		*hv_vhca;
>> +	void				*priv;
>> +	u16                              seq;
>> +	void (*control)(struct mlx5_hv_vhca_agent *agent,
>> +			struct mlx5_hv_vhca_control_block *block);
>> +	void (*invalidate)(struct mlx5_hv_vhca_agent *agent,
>> +			   u64 block_mask);
>> +	void (*cleanup)(struct mlx5_hv_vhca_agent *agent);
>> +};
>> +
>> +struct mlx5_hv_vhca *mlx5_hv_vhca_create(struct mlx5_core_dev *dev)
>> +{
>> +	struct mlx5_hv_vhca *hv_vhca = NULL;
>> +
>> +	hv_vhca = kzalloc(sizeof(*hv_vhca), GFP_KERNEL);
>> +	if (!hv_vhca)
>> +		return ERR_PTR(-ENOMEM);
>> +
>> +	hv_vhca->work_queue = create_singlethread_workqueue("mlx5_hv_vhca");
> 
> I was under impression that usage of create_* interfaces is discouraged,
> It has WQ_MEMORY_LEGACY flag inside and commit b71ab8c2025ca talks about
> this interface as legacy one.

mlx5 driver has dozens of singlethread workqueues. A general effort to 
remove them can be done later.

> 
>> +	if (!hv_vhca->work_queue) {
>> +		kfree(hv_vhca);
>> +		return ERR_PTR(-ENOMEM);
>> +	}
>> +
>> +	hv_vhca->dev = dev;
>> +	mutex_init(&hv_vhca->agents_lock);
>> +
>> +	return hv_vhca;
>> +}
>> +
>> +void mlx5_hv_vhca_destroy(struct mlx5_hv_vhca *hv_vhca)
>> +{
>> +	if (IS_ERR_OR_NULL(hv_vhca))
>> +		return;
>> +
>> +	destroy_workqueue(hv_vhca->work_queue);
>> +	kfree(hv_vhca);
>> +}
>> +
>> +static void mlx5_hv_vhca_invalidate_work(struct work_struct *work)
>> +{
>> +	struct mlx5_hv_vhca_work *hwork;
>> +	struct mlx5_hv_vhca *hv_vhca;
>> +	int i;
>> +
>> +	hwork = container_of(work, struct mlx5_hv_vhca_work, invalidate_work);
>> +	hv_vhca = hwork->hv_vhca;
>> +
>> +	mutex_lock(&hv_vhca->agents_lock);
>> +	for (i = 0; i < MLX5_HV_VHCA_AGENT_MAX; i++) {
>> +		struct mlx5_hv_vhca_agent *agent = hv_vhca->agents[i];
>> +
>> +		if (!agent || !agent->invalidate)
>> +			continue;
>> +
>> +		if (!(BIT(agent->type) & hwork->block_mask))
>> +			continue;
>> +
>> +		agent->invalidate(agent, hwork->block_mask);
>> +	}
>> +	mutex_unlock(&hv_vhca->agents_lock);
>> +
>> +	kfree(hwork);
>> +}
>> +
>> +void mlx5_hv_vhca_invalidate(void *context, u64 block_mask)
>> +{
>> +	struct mlx5_hv_vhca *hv_vhca = (struct mlx5_hv_vhca *)context;
>> +	struct mlx5_hv_vhca_work *work;
>> +
>> +	work = kzalloc(sizeof(*work), GFP_ATOMIC);
>> +	if (!work)
>> +		return;
>> +
>> +	INIT_WORK(&work->invalidate_work, mlx5_hv_vhca_invalidate_work);
>> +	work->hv_vhca    = hv_vhca;
>> +	work->block_mask = block_mask;
>> +
>> +	queue_work(hv_vhca->work_queue, &work->invalidate_work);
>> +}
>> +
>> +int mlx5_hv_vhca_init(struct mlx5_hv_vhca *hv_vhca)
>> +{
>> +	if (IS_ERR_OR_NULL(hv_vhca))
>> +		return IS_ERR_OR_NULL(hv_vhca);
>> +
>> +	return mlx5_hv_register_invalidate(hv_vhca->dev, hv_vhca,
>> +					   mlx5_hv_vhca_invalidate);
>> +}
>> +
>> +void mlx5_hv_vhca_cleanup(struct mlx5_hv_vhca *hv_vhca)
>> +{
>> +	int i;
>> +
>> +	if (IS_ERR_OR_NULL(hv_vhca))
>> +		return;
>> +
>> +	mutex_lock(&hv_vhca->agents_lock);
>> +	for (i = 0; i < MLX5_HV_VHCA_AGENT_MAX; i++)
>> +		WARN_ON(hv_vhca->agents[i]);
>> +
>> +	mutex_unlock(&hv_vhca->agents_lock);
>> +
>> +	mlx5_hv_unregister_invalidate(hv_vhca->dev);
>> +}
>> +
>> +struct mlx5_hv_vhca_agent *
>> +mlx5_hv_vhca_agent_create(struct mlx5_hv_vhca *hv_vhca,
>> +			  enum mlx5_hv_vhca_agent_type type,
>> +			  void (*control)(struct mlx5_hv_vhca_agent*,
>> +					  struct mlx5_hv_vhca_control_block *block),
>> +			  void (*invalidate)(struct mlx5_hv_vhca_agent*,
>> +					     u64 block_mask),
>> +			  void (*cleaup)(struct mlx5_hv_vhca_agent *agent),
>> +			  void *priv)
>> +{
>> +	struct mlx5_hv_vhca_agent *agent;
>> +
>> +	if (IS_ERR_OR_NULL(hv_vhca))
>> +		return ERR_PTR(-ENOMEM);
>> +
>> +	if (type >= MLX5_HV_VHCA_AGENT_MAX)
>> +		return ERR_PTR(-EINVAL);
>> +
>> +	mutex_lock(&hv_vhca->agents_lock);
>> +	if (hv_vhca->agents[type]) {
>> +		mutex_unlock(&hv_vhca->agents_lock);
>> +		return ERR_PTR(-EINVAL);
>> +	}
>> +	mutex_unlock(&hv_vhca->agents_lock);
>> +
>> +	agent = kzalloc(sizeof(*agent), GFP_KERNEL);
>> +	if (!agent)
>> +		return ERR_PTR(-ENOMEM);
>> +
>> +	agent->type      = type;
>> +	agent->hv_vhca   = hv_vhca;
>> +	agent->priv      = priv;
>> +	agent->control   = control;
>> +	agent->invalidate = invalidate;
>> +	agent->cleanup   = cleaup;
>> +
>> +	mutex_lock(&hv_vhca->agents_lock);
>> +	hv_vhca->agents[type] = agent;
>> +	mutex_unlock(&hv_vhca->agents_lock);
>> +
>> +	return agent;
>> +}
>> +
>> +void mlx5_hv_vhca_agent_destroy(struct mlx5_hv_vhca_agent *agent)
>> +{
>> +	struct mlx5_hv_vhca *hv_vhca = agent->hv_vhca;
>> +
>> +	mutex_lock(&hv_vhca->agents_lock);
>> +
>> +	if (WARN_ON(agent != hv_vhca->agents[agent->type])) {
>> +		mutex_unlock(&hv_vhca->agents_lock);
>> +		return;
>> +	}
>> +
>> +	hv_vhca->agents[agent->type] = NULL;
>> +	mutex_unlock(&hv_vhca->agents_lock);
>> +
>> +	if (agent->cleanup)
>> +		agent->cleanup(agent);
>> +
>> +	kfree(agent);
>> +}
>> +
>> +static int mlx5_hv_vhca_data_block_prepare(struct mlx5_hv_vhca_agent *agent,
>> +					   struct mlx5_hv_vhca_data_block *data_block,
>> +					   void *src, int len, int *offset)
>> +{
>> +	int bytes = min_t(int, (int)sizeof(data_block->data), len);
>> +
>> +	data_block->sequence = agent->seq;
>> +	data_block->offset   = (*offset)++;
>> +	memcpy(data_block->data, src, bytes);
>> +
>> +	return bytes;
>> +}
>> +
>> +static void mlx5_hv_vhca_agent_seq_update(struct mlx5_hv_vhca_agent *agent)
>> +{
>> +	agent->seq++;
>> +}
>> +
>> +int mlx5_hv_vhca_agent_write(struct mlx5_hv_vhca_agent *agent,
>> +			     void *buf, int len)
>> +{
>> +	int offset = agent->type * HV_CONFIG_BLOCK_SIZE_MAX;
>> +	int block_offset = 0;
>> +	int total = 0;
>> +	int err;
>> +
>> +	while (len) {
>> +		struct mlx5_hv_vhca_data_block data_block = {0};
>> +		int bytes;
>> +
>> +		bytes = mlx5_hv_vhca_data_block_prepare(agent, &data_block,
>> +							buf + total,
>> +							len, &block_offset);
>> +		if (!bytes)
>> +			return -ENOMEM;
>> +
>> +		err = mlx5_hv_write_config(agent->hv_vhca->dev, &data_block,
>> +					   sizeof(data_block), offset);
>> +		if (err)
>> +			return err;
>> +
>> +		total += bytes;
>> +		len   -= bytes;
>> +	}
>> +
>> +	mlx5_hv_vhca_agent_seq_update(agent);
>> +
>> +	return 0;
>> +}
>> +
>> +void *mlx5_hv_vhca_agent_priv(struct mlx5_hv_vhca_agent *agent)
>> +{
>> +	return agent->priv;
>> +}
>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h
>> new file mode 100644
>> index 0000000..cdf1303
>> --- /dev/null
>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h
>> @@ -0,0 +1,102 @@
>> +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
>> +/* Copyright (c) 2019 Mellanox Technologies. */
>> +
>> +#ifndef __LIB_HV_VHCA_H__
>> +#define __LIB_HV_VHCA_H__
>> +
>> +#include "en.h"
>> +#include "lib/hv.h"
>> +
>> +struct mlx5_hv_vhca_agent;
>> +struct mlx5_hv_vhca;
>> +struct mlx5_hv_vhca_control_block;
>> +
>> +enum mlx5_hv_vhca_agent_type {
>> +	MLX5_HV_VHCA_AGENT_MAX = 32,
>> +};
>> +
>> +#if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE)
>> +
>> +struct mlx5_hv_vhca_control_block {
>> +	u32     capabilities;
>> +	u32     control;
>> +	u16     command;
>> +	u16     command_ack;
>> +	u16     version;
>> +	u16     rings;
>> +	u32     reserved1[28];
>> +};
>> +
>> +struct mlx5_hv_vhca *mlx5_hv_vhca_create(struct mlx5_core_dev *dev);
>> +void mlx5_hv_vhca_destroy(struct mlx5_hv_vhca *hv_vhca);
>> +int mlx5_hv_vhca_init(struct mlx5_hv_vhca *hv_vhca);
>> +void mlx5_hv_vhca_cleanup(struct mlx5_hv_vhca *hv_vhca);
>> +void mlx5_hv_vhca_invalidate(void *context, u64 block_mask);
>> +
>> +struct mlx5_hv_vhca_agent *
>> +mlx5_hv_vhca_agent_create(struct mlx5_hv_vhca *hv_vhca,
>> +			  enum mlx5_hv_vhca_agent_type type,
>> +			  void (*control)(struct mlx5_hv_vhca_agent*,
>> +					  struct mlx5_hv_vhca_control_block *block),
>> +			  void (*invalidate)(struct mlx5_hv_vhca_agent*,
>> +					     u64 block_mask),
>> +			  void (*cleanup)(struct mlx5_hv_vhca_agent *agent),
>> +			  void *context);
>> +
>> +void mlx5_hv_vhca_agent_destroy(struct mlx5_hv_vhca_agent *agent);
>> +int mlx5_hv_vhca_agent_write(struct mlx5_hv_vhca_agent *agent,
>> +			     void *buf, int len);
>> +void *mlx5_hv_vhca_agent_priv(struct mlx5_hv_vhca_agent *agent);
>> +
>> +#else
>> +
>> +static inline struct mlx5_hv_vhca *
>> +mlx5_hv_vhca_create(struct mlx5_core_dev *dev)
>> +{
>> +	return NULL;
>> +}
>> +
>> +static inline void mlx5_hv_vhca_destroy(struct mlx5_hv_vhca *hv_vhca)
>> +{
>> +}
>> +
>> +static inline int mlx5_hv_vhca_init(struct mlx5_hv_vhca *hv_vhca)
>> +{
>> +	return 0;
>> +}
>> +
>> +static inline void mlx5_hv_vhca_cleanup(struct mlx5_hv_vhca *hv_vhca)
>> +{
>> +}
>> +
>> +static inline void mlx5_hv_vhca_invalidate(void *context,
>> +					   u64 block_mask)
>> +{
>> +}
>> +
>> +static inline struct mlx5_hv_vhca_agent *
>> +mlx5_hv_vhca_agent_create(struct mlx5_hv_vhca *hv_vhca,
>> +			  enum mlx5_hv_vhca_agent_type type,
>> +			  void (*control)(struct mlx5_hv_vhca_agent*,
>> +					  struct mlx5_hv_vhca_control_block *block),
>> +			  void (*invalidate)(struct mlx5_hv_vhca_agent*,
>> +					     u64 block_mask),
>> +			  void (*cleanup)(struct mlx5_hv_vhca_agent *agent),
>> +			  void *context)
>> +{
>> +	return NULL;
>> +}
>> +
>> +static inline void mlx5_hv_vhca_agent_destroy(struct mlx5_hv_vhca_agent *agent)
>> +{
>> +}
>> +
>> +static inline int
>> +mlx5_hv_vhca_write_agent(struct mlx5_hv_vhca_agent *agent,
>> +			 void *buf, int len)
>> +{
>> +	return 0;
>> +}
>> +#endif
>> +
>> +#endif /* __LIB_HV_VHCA_H__ */
>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
>> index 0b70b1d..61388ca 100644
>> --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
>> @@ -69,6 +69,7 @@
>>   #include "lib/pci_vsc.h"
>>   #include "diag/fw_tracer.h"
>>   #include "ecpf.h"
>> +#include "lib/hv_vhca.h"
>>
>>   MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
>>   MODULE_DESCRIPTION("Mellanox 5th generation network adapters (ConnectX series) core driver");
>> @@ -870,6 +871,7 @@ static int mlx5_init_once(struct mlx5_core_dev *dev)
>>   	}
>>
>>   	dev->tracer = mlx5_fw_tracer_create(dev);
>> +	dev->hv_vhca = mlx5_hv_vhca_create(dev);
>>
>>   	return 0;
>>
>> @@ -900,6 +902,7 @@ static int mlx5_init_once(struct mlx5_core_dev *dev)
>>
>>   static void mlx5_cleanup_once(struct mlx5_core_dev *dev)
>>   {
>> +	mlx5_hv_vhca_destroy(dev->hv_vhca);
>>   	mlx5_fw_tracer_destroy(dev->tracer);
>>   	mlx5_fpga_cleanup(dev);
>>   	mlx5_eswitch_cleanup(dev->priv.eswitch);
>> @@ -1067,6 +1070,8 @@ static int mlx5_load(struct mlx5_core_dev *dev)
>>   		goto err_fw_tracer;
>>   	}
>>
>> +	mlx5_hv_vhca_init(dev->hv_vhca);
> 
> What is the point to declare this function as "int ..." if you are not
> interested in result?
> 
>> +
>>   	err = mlx5_fpga_device_start(dev);
>>   	if (err) {
>>   		mlx5_core_err(dev, "fpga device start failed %d\n", err);
>> @@ -1122,6 +1127,7 @@ static int mlx5_load(struct mlx5_core_dev *dev)
>>   err_ipsec_start:
>>   	mlx5_fpga_device_stop(dev);
>>   err_fpga_start:
>> +	mlx5_hv_vhca_cleanup(dev->hv_vhca);
>>   	mlx5_fw_tracer_cleanup(dev->tracer);
>>   err_fw_tracer:
>>   	mlx5_eq_table_destroy(dev);
>> @@ -1142,6 +1148,7 @@ static void mlx5_unload(struct mlx5_core_dev *dev)
>>   	mlx5_accel_ipsec_cleanup(dev);
>>   	mlx5_accel_tls_cleanup(dev);
>>   	mlx5_fpga_device_stop(dev);
>> +	mlx5_hv_vhca_cleanup(dev->hv_vhca);
>>   	mlx5_fw_tracer_cleanup(dev->tracer);
>>   	mlx5_eq_table_destroy(dev);
>>   	mlx5_irq_table_destroy(dev);
>> diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
>> index df23f17..13b4cf2 100644
>> --- a/include/linux/mlx5/driver.h
>> +++ b/include/linux/mlx5/driver.h
>> @@ -659,6 +659,7 @@ struct mlx5_clock {
>>   struct mlx5_fw_tracer;
>>   struct mlx5_vxlan;
>>   struct mlx5_geneve;
>> +struct mlx5_hv_vhca;
>>
>>   struct mlx5_core_dev {
>>   	struct device *device;
>> @@ -706,6 +707,7 @@ struct mlx5_core_dev {
>>   	struct mlx5_ib_clock_info  *clock_info;
>>   	struct mlx5_fw_tracer   *tracer;
>>   	u32                      vsc_addr;
>> +	struct mlx5_hv_vhca	*hv_vhca;
>>   };
>>
>>   struct mlx5_db {
>> --
>> 1.8.3.1
>>

^ permalink raw reply

* Re: [PATCH] nexthops: remove redundant assignment to variable err
From: David Miller @ 2019-08-22 19:14 UTC (permalink / raw)
  To: colin.king
  Cc: dsahern, kuznet, yoshfuji, netdev, kernel-janitors, linux-kernel
In-Reply-To: <20190822125340.30783-1-colin.king@canonical.com>

From: Colin King <colin.king@canonical.com>
Date: Thu, 22 Aug 2019 13:53:40 +0100

> From: Colin Ian King <colin.king@canonical.com>
> 
> Variable err is initialized to a value that is never read and it is
> re-assigned later. The initialization is redundant and can be removed.
> 
> Addresses-Coverity: ("Unused Value")
> Signed-off-by: Colin Ian King <colin.king@canonical.com>

Applied to net-next.

^ permalink raw reply

* Re: [PATCH net 0/9] rxrpc: Fix use of skb_cow_data()
From: David Miller @ 2019-08-22 19:12 UTC (permalink / raw)
  To: dhowells; +Cc: netdev, linux-afs, linux-kernel
In-Reply-To: <156647655350.10908.12081183247715153431.stgit@warthog.procyon.org.uk>

From: David Howells <dhowells@redhat.com>
Date: Thu, 22 Aug 2019 13:22:33 +0100

> Here's a series of patches that fixes the use of skb_cow_data() in rxrpc.
> The problem is that skb_cow_data() indirectly requires that the maximum
> usage count on an sk_buff be 1, and it may generate an assertion failure in
> pskb_expand_head() if not.

It sounds like you are effectively doing a late unshare when you have to
do in-place encryption.

Why don't you just do an skb_unshare() at the beginning when you know that
you'll need to do that?

^ permalink raw reply

* Re: [net] devlink: Add method for time-stamp on reporter's dump
From: Arnd Bergmann @ 2019-08-22 19:11 UTC (permalink / raw)
  To: Ido Schimmel
  Cc: Andrew Lunn, Aya Levin, David S. Miller, Jiri Pirko, Networking,
	Linux Kernel Mailing List
In-Reply-To: <20190822174037.GA18030@splinter>

On Thu, Aug 22, 2019 at 7:40 PM Ido Schimmel <idosch@idosch.org> wrote:
> On Thu, Aug 22, 2019 at 04:06:35PM +0200, Andrew Lunn wrote:
> > On Thu, Aug 22, 2019 at 11:17:51AM +0300, Aya Levin wrote:
> > > When setting the dump's time-stamp, use ktime_get_real in addition to
> > > jiffies. This simplifies the user space implementation and bypasses
> > > some inconsistent behavior with translating jiffies to current time.
> >
> > Is this year 2038 safe? I don't know enough about this to answer the
> > question myself.
>
> Good point. 'struct timespec' is not considered year 2038 safe and
> unfortunately I recently made the mistake of using it to communicate
> timestamps to user space over netlink. :/ The code is still in net-next,
> so I will fix it while I can.
>
> Arnd, would it be acceptable to use 'struct __kernel_timespec' instead?

The in-kernel representation should just use 'timespec64' if you need
separate seconds and nanoseconds, you can convert that to
__kernel_timespec while copying to user space.

However, please consider two other points:

- for simplicity, the general recommendation is to use 64-bit nanoseconds
  without separate seconds for timestamps
- instead of CLOCK_REALTIME, you could use CLOCK_MONOTONIC
  timestamps that are not affected by clock_settime() or leap second jumps.

      Arnd

^ permalink raw reply

* [PATCH 0/3] Add NETIF_F_HW_BRIDGE feature
From: Horatiu Vultur @ 2019-08-22 19:07 UTC (permalink / raw)
  To: roopa, nikolay, davem, UNGLinuxDriver, alexandre.belloni,
	allan.nielsen, netdev, linux-kernel, bridge
  Cc: Horatiu Vultur

Current implementation of the SW bridge is setting the interfaces in
promisc mode when they are added to bridge if learning of the frames is
enabled.
In case of Ocelot which has HW capabilities to switch frames, it is not
needed to set the ports in promisc mode because the HW already capable of
doing that. Therefore add NETIF_F_HW_BRIDGE feature to indicate that the
HW has bridge capabilities. Therefore the SW bridge doesn't need to set
the ports in promisc mode to do the switching.
This optimization takes places only if all the interfaces that are part
of the bridge have this flag and have the same network driver.

If the bridge interfaces is added in promisc mode then also the ports part
of the bridge are set in promisc mode.

Horatiu Vultur (3):
  net: Add HW_BRIDGE offload feature
  net: mscc: Use NETIF_F_HW_BRIDGE
  net: mscc: Implement promisc mode.

 drivers/net/ethernet/mscc/ocelot.c | 26 ++++++++++++++++++++++++--
 include/linux/netdev_features.h    |  3 +++
 net/bridge/br_if.c                 | 29 ++++++++++++++++++++++++++++-
 net/core/ethtool.c                 |  1 +
 4 files changed, 56 insertions(+), 3 deletions(-)

-- 
2.7.4


^ permalink raw reply

* [PATCH 1/3] net: Add HW_BRIDGE offload feature
From: Horatiu Vultur @ 2019-08-22 19:07 UTC (permalink / raw)
  To: roopa, nikolay, davem, UNGLinuxDriver, alexandre.belloni,
	allan.nielsen, netdev, linux-kernel, bridge
  Cc: Horatiu Vultur
In-Reply-To: <1566500850-6247-1-git-send-email-horatiu.vultur@microchip.com>

This patch adds a netdev feature to configure the HW as a switch.
The purpose of this flag is to show that the hardware can do switching
of the frames.

Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
---
 include/linux/netdev_features.h |  3 +++
 net/bridge/br_if.c              | 29 ++++++++++++++++++++++++++++-
 net/core/ethtool.c              |  1 +
 3 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index 4b19c54..34274de 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -78,6 +78,8 @@ enum {
 	NETIF_F_HW_TLS_TX_BIT,		/* Hardware TLS TX offload */
 	NETIF_F_HW_TLS_RX_BIT,		/* Hardware TLS RX offload */
 
+	NETIF_F_HW_BRIDGE_BIT,		/* Bridge offload support */
+
 	NETIF_F_GRO_HW_BIT,		/* Hardware Generic receive offload */
 	NETIF_F_HW_TLS_RECORD_BIT,	/* Offload TLS record */
 
@@ -150,6 +152,7 @@ enum {
 #define NETIF_F_GSO_UDP_L4	__NETIF_F(GSO_UDP_L4)
 #define NETIF_F_HW_TLS_TX	__NETIF_F(HW_TLS_TX)
 #define NETIF_F_HW_TLS_RX	__NETIF_F(HW_TLS_RX)
+#define NETIF_F_HW_BRIDGE	__NETIF_F(HW_BRIDGE)
 
 /* Finds the next feature with the highest number of the range of start till 0.
  */
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 4fe30b1..975a34c 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -127,6 +127,31 @@ static void br_port_clear_promisc(struct net_bridge_port *p)
 	p->flags &= ~BR_PROMISC;
 }
 
+/* Determin if the SW bridge can be offloaded to HW. Return true if all
+ * the interfaces of the bridge have the feature NETIF_F_HW_SWITCHDEV set
+ * and have the same netdev_ops.
+ */
+static int br_hw_offload(struct net_bridge *br)
+{
+	const struct net_device_ops *ops = NULL;
+	struct net_bridge_port *p;
+
+	list_for_each_entry(p, &br->port_list, list) {
+		if (!(p->dev->hw_features & NETIF_F_HW_BRIDGE))
+			return 0;
+
+		if (!ops) {
+			ops = p->dev->netdev_ops;
+			continue;
+		}
+
+		if (ops != p->dev->netdev_ops)
+			return 0;
+	}
+
+	return 1;
+}
+
 /* When a port is added or removed or when certain port flags
  * change, this function is called to automatically manage
  * promiscuity setting of all the bridge ports.  We are always called
@@ -134,6 +159,7 @@ static void br_port_clear_promisc(struct net_bridge_port *p)
  */
 void br_manage_promisc(struct net_bridge *br)
 {
+	bool hw_offload = br_hw_offload(br);
 	struct net_bridge_port *p;
 	bool set_all = false;
 
@@ -161,7 +187,8 @@ void br_manage_promisc(struct net_bridge *br)
 			    (br->auto_cnt == 1 && br_auto_port(p)))
 				br_port_clear_promisc(p);
 			else
-				br_port_set_promisc(p);
+				if (!hw_offload)
+					br_port_set_promisc(p);
 		}
 	}
 }
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 6288e69..4e224fe 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -111,6 +111,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
 	[NETIF_F_HW_TLS_RECORD_BIT] =	"tls-hw-record",
 	[NETIF_F_HW_TLS_TX_BIT] =	 "tls-hw-tx-offload",
 	[NETIF_F_HW_TLS_RX_BIT] =	 "tls-hw-rx-offload",
+	[NETIF_F_HW_BRIDGE_BIT] =	 "hw-bridge-offload",
 };
 
 static const char
-- 
2.7.4


^ permalink raw reply related

* [PATCH 2/3] net: mscc: Use NETIF_F_HW_BRIDGE
From: Horatiu Vultur @ 2019-08-22 19:07 UTC (permalink / raw)
  To: roopa, nikolay, davem, UNGLinuxDriver, alexandre.belloni,
	allan.nielsen, netdev, linux-kernel, bridge
  Cc: Horatiu Vultur
In-Reply-To: <1566500850-6247-1-git-send-email-horatiu.vultur@microchip.com>

Enable HW_BRIDGE feature for ocelot. In this way the HW will do all the
switching of the frames so it is not needed for the ports to be in promisc
mode.

Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
---
 drivers/net/ethernet/mscc/ocelot.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index 4d1bce4..c9cf2bee 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -2017,8 +2017,10 @@ int ocelot_probe_port(struct ocelot *ocelot, u8 port,
 	dev->ethtool_ops = &ocelot_ethtool_ops;
 
 	dev->hw_features |= NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_RXFCS |
-		NETIF_F_HW_TC;
-	dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_TC;
+		NETIF_F_HW_TC | NETIF_F_HW_BRIDGE;
+	dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_TC |
+		NETIF_F_HW_BRIDGE;
+	dev->priv_flags |= IFF_UNICAST_FLT;
 
 	memcpy(dev->dev_addr, ocelot->base_mac, ETH_ALEN);
 	dev->dev_addr[ETH_ALEN - 1] += port;
-- 
2.7.4


^ permalink raw reply related

* [PATCH 3/3] net: mscc: Implement promisc mode.
From: Horatiu Vultur @ 2019-08-22 19:07 UTC (permalink / raw)
  To: roopa, nikolay, davem, UNGLinuxDriver, alexandre.belloni,
	allan.nielsen, netdev, linux-kernel, bridge
  Cc: Horatiu Vultur
In-Reply-To: <1566500850-6247-1-git-send-email-horatiu.vultur@microchip.com>

Before when a port was added to a bridge then the port was added in
promisc mode. But because of the patches:
commit 6657c3d812dc5d ("net: Add HW_BRIDGE offload feature")
commit e2e3678c292f9c (net: mscc: Use NETIF_F_HW_BRIDGE")

the port is not needed to be in promisc mode to be part of the bridge.
So it is possible to togle the promisc mode of the port even if it is or
not part of the bridge.

Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
---
 drivers/net/ethernet/mscc/ocelot.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index c9cf2bee..9fa97fe 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -691,6 +691,25 @@ static void ocelot_set_rx_mode(struct net_device *dev)
 	__dev_mc_sync(dev, ocelot_mc_sync, ocelot_mc_unsync);
 }
 
+static void ocelot_change_rx_flags(struct net_device *dev, int flags)
+{
+	struct ocelot_port *port = netdev_priv(dev);
+	struct ocelot *ocelot = port->ocelot;
+	u32 val;
+
+	if (!(flags & IFF_PROMISC))
+		return;
+
+	val = ocelot_read_gix(ocelot, ANA_PORT_CPU_FWD_CFG,
+			      port->chip_port);
+	if (dev->flags & IFF_PROMISC)
+		val |= ANA_PORT_CPU_FWD_CFG_CPU_SRC_COPY_ENA;
+	else
+		val &= ~(ANA_PORT_CPU_FWD_CFG_CPU_SRC_COPY_ENA);
+
+	ocelot_write_gix(ocelot, val, ANA_PORT_CPU_FWD_CFG, port->chip_port);
+}
+
 static int ocelot_port_get_phys_port_name(struct net_device *dev,
 					  char *buf, size_t len)
 {
@@ -1070,6 +1089,7 @@ static const struct net_device_ops ocelot_port_netdev_ops = {
 	.ndo_stop			= ocelot_port_stop,
 	.ndo_start_xmit			= ocelot_port_xmit,
 	.ndo_set_rx_mode		= ocelot_set_rx_mode,
+	.ndo_change_rx_flags		= ocelot_change_rx_flags,
 	.ndo_get_phys_port_name		= ocelot_port_get_phys_port_name,
 	.ndo_set_mac_address		= ocelot_port_set_mac_address,
 	.ndo_get_stats64		= ocelot_get_stats64,
-- 
2.7.4


^ permalink raw reply related

* Re: [PATCH 0/3] rework netlink skb allocation
From: David Miller @ 2019-08-22 19:04 UTC (permalink / raw)
  To: jan.dakinevich
  Cc: linux-kernel, den, khorenko, jan.dakinevich, kuznet, yoshfuji,
	pablo, kadlec, fw, johannes.berg, dsahern, christian, stephen,
	Jason, jakub.kicinski, willemb, xiyou.wangcong, simon.horman,
	john.hurley, pabeni, brouer, bigeasy, edumazet, lirongqing,
	ap420073, ptalbert, herbert, tglx, dima, netdev, netfilter-devel,
	coreteam
In-Reply-To: <1566470851-4694-1-git-send-email-jan.dakinevich@virtuozzo.com>

From: Jan Dakinevich <jan.dakinevich@virtuozzo.com>
Date: Thu, 22 Aug 2019 10:48:08 +0000

> Currently, userspace is able to initiate costly high-order allocation in 
> kernel sending large broadcast netlink message, which is considered 
> undesirable. At the same time, unicast message are safe in this regard, 
> because they uses vmalloc-ed memory.
> 
> This series introduces changes, that allow broadcast messages to be 
> allocated with vmalloc() as well as unicast.

I'm tossing this series for the same reason I tossed the AF_UNIX change.

^ permalink raw reply

* Re: [PATCH] af_unix: utilize skb's fragment list for sending large datagrams
From: David Miller @ 2019-08-22 19:04 UTC (permalink / raw)
  To: jan.dakinevich
  Cc: linux-kernel, den, khorenko, pabeni, viro, axboe, hare, kgraul,
	kyeongdon.kim, tglx, netdev
In-Reply-To: <1566470311-4089-1-git-send-email-jan.dakinevich@virtuozzo.com>

From: Jan Dakinevich <jan.dakinevich@virtuozzo.com>
Date: Thu, 22 Aug 2019 10:38:39 +0000

> However, paged part can not exceed MAX_SKB_FRAGS * PAGE_SIZE, and large
> datagram causes increasing skb's data buffer. Thus, if any user-space
> program sets send buffer (by calling setsockopt(SO_SNDBUF, ...)) to
> maximum allowed size (wmem_max) it becomes able to cause any amount
> of uncontrolled high-order kernel allocations.

So?  You want huge SKBs you get the high order allocations, seems
rather reasonable to me.

SKBs using fragment lists are the most difficult and cpu intensive
geometry for an SKB to have and we should avoid using it where
feasible.

I don't want to apply this, sorry.

^ permalink raw reply

* Re: [PATCH net-next,v4, 4/6] net/mlx5: Add HV VHCA infrastructure
From: Leon Romanovsky @ 2019-08-22 18:58 UTC (permalink / raw)
  To: Haiyang Zhang
  Cc: sashal@kernel.org, davem@davemloft.net, saeedm@mellanox.com,
	eranbe@mellanox.com, lorenzo.pieralisi@arm.com,
	bhelgaas@google.com, linux-pci@vger.kernel.org,
	linux-hyperv@vger.kernel.org, netdev@vger.kernel.org,
	KY Srinivasan, Stephen Hemminger, linux-kernel@vger.kernel.org
In-Reply-To: <1566450236-36757-5-git-send-email-haiyangz@microsoft.com>

On Thu, Aug 22, 2019 at 05:05:51AM +0000, Haiyang Zhang wrote:
> From: Eran Ben Elisha <eranbe@mellanox.com>
>
> HV VHCA is a layer which provides PF to VF communication channel based on
> HyperV PCI config channel. It implements Mellanox's Inter VHCA control
> communication protocol. The protocol contains control block in order to
> pass messages between the PF and VF drivers, and data blocks in order to
> pass actual data.
>
> The infrastructure is agent based. Each agent will be responsible of
> contiguous buffer blocks in the VHCA config space. This infrastructure will
> bind agents to their blocks, and those agents can only access read/write
> the buffer blocks assigned to them. Each agent will provide three
> callbacks (control, invalidate, cleanup). Control will be invoked when
> block-0 is invalidated with a command that concerns this agent. Invalidate
> callback will be invoked if one of the blocks assigned to this agent was
> invalidated. Cleanup will be invoked before the agent is being freed in
> order to clean all of its open resources or deferred works.
>
> Block-0 serves as the control block. All execution commands from the PF
> will be written by the PF over this block. VF will ack on those by
> writing on block-0 as well. Its format is described by struct
> mlx5_hv_vhca_control_block layout.
>
> Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
> Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
> ---
>  drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   2 +-
>  .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c  | 253 +++++++++++++++++++++
>  .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h  | 102 +++++++++
>  drivers/net/ethernet/mellanox/mlx5/core/main.c     |   7 +
>  include/linux/mlx5/driver.h                        |   2 +
>  5 files changed, 365 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
>  create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h
>
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
> index fd32a5b..8d443fc 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
> @@ -45,7 +45,7 @@ mlx5_core-$(CONFIG_MLX5_ESWITCH)   += eswitch.o eswitch_offloads.o eswitch_offlo
>  mlx5_core-$(CONFIG_MLX5_MPFS)      += lib/mpfs.o
>  mlx5_core-$(CONFIG_VXLAN)          += lib/vxlan.o
>  mlx5_core-$(CONFIG_PTP_1588_CLOCK) += lib/clock.o
> -mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += lib/hv.o
> +mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += lib/hv.o lib/hv_vhca.o
>
>  #
>  # Ipoib netdev
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
> new file mode 100644
> index 0000000..84d1d75
> --- /dev/null
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
> @@ -0,0 +1,253 @@
> +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
> +// Copyright (c) 2018 Mellanox Technologies
> +
> +#include <linux/hyperv.h>
> +#include "mlx5_core.h"
> +#include "lib/hv.h"
> +#include "lib/hv_vhca.h"
> +
> +struct mlx5_hv_vhca {
> +	struct mlx5_core_dev       *dev;
> +	struct workqueue_struct    *work_queue;
> +	struct mlx5_hv_vhca_agent  *agents[MLX5_HV_VHCA_AGENT_MAX];
> +	struct mutex                agents_lock; /* Protect agents array */
> +};
> +
> +struct mlx5_hv_vhca_work {
> +	struct work_struct     invalidate_work;
> +	struct mlx5_hv_vhca   *hv_vhca;
> +	u64                    block_mask;
> +};
> +
> +struct mlx5_hv_vhca_data_block {
> +	u16     sequence;
> +	u16     offset;
> +	u8      reserved[4];
> +	u64     data[15];
> +};
> +
> +struct mlx5_hv_vhca_agent {
> +	enum mlx5_hv_vhca_agent_type	 type;
> +	struct mlx5_hv_vhca		*hv_vhca;
> +	void				*priv;
> +	u16                              seq;
> +	void (*control)(struct mlx5_hv_vhca_agent *agent,
> +			struct mlx5_hv_vhca_control_block *block);
> +	void (*invalidate)(struct mlx5_hv_vhca_agent *agent,
> +			   u64 block_mask);
> +	void (*cleanup)(struct mlx5_hv_vhca_agent *agent);
> +};
> +
> +struct mlx5_hv_vhca *mlx5_hv_vhca_create(struct mlx5_core_dev *dev)
> +{
> +	struct mlx5_hv_vhca *hv_vhca = NULL;
> +
> +	hv_vhca = kzalloc(sizeof(*hv_vhca), GFP_KERNEL);
> +	if (!hv_vhca)
> +		return ERR_PTR(-ENOMEM);
> +
> +	hv_vhca->work_queue = create_singlethread_workqueue("mlx5_hv_vhca");

I was under impression that usage of create_* interfaces is discouraged,
It has WQ_MEMORY_LEGACY flag inside and commit b71ab8c2025ca talks about
this interface as legacy one.

> +	if (!hv_vhca->work_queue) {
> +		kfree(hv_vhca);
> +		return ERR_PTR(-ENOMEM);
> +	}
> +
> +	hv_vhca->dev = dev;
> +	mutex_init(&hv_vhca->agents_lock);
> +
> +	return hv_vhca;
> +}
> +
> +void mlx5_hv_vhca_destroy(struct mlx5_hv_vhca *hv_vhca)
> +{
> +	if (IS_ERR_OR_NULL(hv_vhca))
> +		return;
> +
> +	destroy_workqueue(hv_vhca->work_queue);
> +	kfree(hv_vhca);
> +}
> +
> +static void mlx5_hv_vhca_invalidate_work(struct work_struct *work)
> +{
> +	struct mlx5_hv_vhca_work *hwork;
> +	struct mlx5_hv_vhca *hv_vhca;
> +	int i;
> +
> +	hwork = container_of(work, struct mlx5_hv_vhca_work, invalidate_work);
> +	hv_vhca = hwork->hv_vhca;
> +
> +	mutex_lock(&hv_vhca->agents_lock);
> +	for (i = 0; i < MLX5_HV_VHCA_AGENT_MAX; i++) {
> +		struct mlx5_hv_vhca_agent *agent = hv_vhca->agents[i];
> +
> +		if (!agent || !agent->invalidate)
> +			continue;
> +
> +		if (!(BIT(agent->type) & hwork->block_mask))
> +			continue;
> +
> +		agent->invalidate(agent, hwork->block_mask);
> +	}
> +	mutex_unlock(&hv_vhca->agents_lock);
> +
> +	kfree(hwork);
> +}
> +
> +void mlx5_hv_vhca_invalidate(void *context, u64 block_mask)
> +{
> +	struct mlx5_hv_vhca *hv_vhca = (struct mlx5_hv_vhca *)context;
> +	struct mlx5_hv_vhca_work *work;
> +
> +	work = kzalloc(sizeof(*work), GFP_ATOMIC);
> +	if (!work)
> +		return;
> +
> +	INIT_WORK(&work->invalidate_work, mlx5_hv_vhca_invalidate_work);
> +	work->hv_vhca    = hv_vhca;
> +	work->block_mask = block_mask;
> +
> +	queue_work(hv_vhca->work_queue, &work->invalidate_work);
> +}
> +
> +int mlx5_hv_vhca_init(struct mlx5_hv_vhca *hv_vhca)
> +{
> +	if (IS_ERR_OR_NULL(hv_vhca))
> +		return IS_ERR_OR_NULL(hv_vhca);
> +
> +	return mlx5_hv_register_invalidate(hv_vhca->dev, hv_vhca,
> +					   mlx5_hv_vhca_invalidate);
> +}
> +
> +void mlx5_hv_vhca_cleanup(struct mlx5_hv_vhca *hv_vhca)
> +{
> +	int i;
> +
> +	if (IS_ERR_OR_NULL(hv_vhca))
> +		return;
> +
> +	mutex_lock(&hv_vhca->agents_lock);
> +	for (i = 0; i < MLX5_HV_VHCA_AGENT_MAX; i++)
> +		WARN_ON(hv_vhca->agents[i]);
> +
> +	mutex_unlock(&hv_vhca->agents_lock);
> +
> +	mlx5_hv_unregister_invalidate(hv_vhca->dev);
> +}
> +
> +struct mlx5_hv_vhca_agent *
> +mlx5_hv_vhca_agent_create(struct mlx5_hv_vhca *hv_vhca,
> +			  enum mlx5_hv_vhca_agent_type type,
> +			  void (*control)(struct mlx5_hv_vhca_agent*,
> +					  struct mlx5_hv_vhca_control_block *block),
> +			  void (*invalidate)(struct mlx5_hv_vhca_agent*,
> +					     u64 block_mask),
> +			  void (*cleaup)(struct mlx5_hv_vhca_agent *agent),
> +			  void *priv)
> +{
> +	struct mlx5_hv_vhca_agent *agent;
> +
> +	if (IS_ERR_OR_NULL(hv_vhca))
> +		return ERR_PTR(-ENOMEM);
> +
> +	if (type >= MLX5_HV_VHCA_AGENT_MAX)
> +		return ERR_PTR(-EINVAL);
> +
> +	mutex_lock(&hv_vhca->agents_lock);
> +	if (hv_vhca->agents[type]) {
> +		mutex_unlock(&hv_vhca->agents_lock);
> +		return ERR_PTR(-EINVAL);
> +	}
> +	mutex_unlock(&hv_vhca->agents_lock);
> +
> +	agent = kzalloc(sizeof(*agent), GFP_KERNEL);
> +	if (!agent)
> +		return ERR_PTR(-ENOMEM);
> +
> +	agent->type      = type;
> +	agent->hv_vhca   = hv_vhca;
> +	agent->priv      = priv;
> +	agent->control   = control;
> +	agent->invalidate = invalidate;
> +	agent->cleanup   = cleaup;
> +
> +	mutex_lock(&hv_vhca->agents_lock);
> +	hv_vhca->agents[type] = agent;
> +	mutex_unlock(&hv_vhca->agents_lock);
> +
> +	return agent;
> +}
> +
> +void mlx5_hv_vhca_agent_destroy(struct mlx5_hv_vhca_agent *agent)
> +{
> +	struct mlx5_hv_vhca *hv_vhca = agent->hv_vhca;
> +
> +	mutex_lock(&hv_vhca->agents_lock);
> +
> +	if (WARN_ON(agent != hv_vhca->agents[agent->type])) {
> +		mutex_unlock(&hv_vhca->agents_lock);
> +		return;
> +	}
> +
> +	hv_vhca->agents[agent->type] = NULL;
> +	mutex_unlock(&hv_vhca->agents_lock);
> +
> +	if (agent->cleanup)
> +		agent->cleanup(agent);
> +
> +	kfree(agent);
> +}
> +
> +static int mlx5_hv_vhca_data_block_prepare(struct mlx5_hv_vhca_agent *agent,
> +					   struct mlx5_hv_vhca_data_block *data_block,
> +					   void *src, int len, int *offset)
> +{
> +	int bytes = min_t(int, (int)sizeof(data_block->data), len);
> +
> +	data_block->sequence = agent->seq;
> +	data_block->offset   = (*offset)++;
> +	memcpy(data_block->data, src, bytes);
> +
> +	return bytes;
> +}
> +
> +static void mlx5_hv_vhca_agent_seq_update(struct mlx5_hv_vhca_agent *agent)
> +{
> +	agent->seq++;
> +}
> +
> +int mlx5_hv_vhca_agent_write(struct mlx5_hv_vhca_agent *agent,
> +			     void *buf, int len)
> +{
> +	int offset = agent->type * HV_CONFIG_BLOCK_SIZE_MAX;
> +	int block_offset = 0;
> +	int total = 0;
> +	int err;
> +
> +	while (len) {
> +		struct mlx5_hv_vhca_data_block data_block = {0};
> +		int bytes;
> +
> +		bytes = mlx5_hv_vhca_data_block_prepare(agent, &data_block,
> +							buf + total,
> +							len, &block_offset);
> +		if (!bytes)
> +			return -ENOMEM;
> +
> +		err = mlx5_hv_write_config(agent->hv_vhca->dev, &data_block,
> +					   sizeof(data_block), offset);
> +		if (err)
> +			return err;
> +
> +		total += bytes;
> +		len   -= bytes;
> +	}
> +
> +	mlx5_hv_vhca_agent_seq_update(agent);
> +
> +	return 0;
> +}
> +
> +void *mlx5_hv_vhca_agent_priv(struct mlx5_hv_vhca_agent *agent)
> +{
> +	return agent->priv;
> +}
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h
> new file mode 100644
> index 0000000..cdf1303
> --- /dev/null
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h
> @@ -0,0 +1,102 @@
> +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
> +/* Copyright (c) 2019 Mellanox Technologies. */
> +
> +#ifndef __LIB_HV_VHCA_H__
> +#define __LIB_HV_VHCA_H__
> +
> +#include "en.h"
> +#include "lib/hv.h"
> +
> +struct mlx5_hv_vhca_agent;
> +struct mlx5_hv_vhca;
> +struct mlx5_hv_vhca_control_block;
> +
> +enum mlx5_hv_vhca_agent_type {
> +	MLX5_HV_VHCA_AGENT_MAX = 32,
> +};
> +
> +#if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE)
> +
> +struct mlx5_hv_vhca_control_block {
> +	u32     capabilities;
> +	u32     control;
> +	u16     command;
> +	u16     command_ack;
> +	u16     version;
> +	u16     rings;
> +	u32     reserved1[28];
> +};
> +
> +struct mlx5_hv_vhca *mlx5_hv_vhca_create(struct mlx5_core_dev *dev);
> +void mlx5_hv_vhca_destroy(struct mlx5_hv_vhca *hv_vhca);
> +int mlx5_hv_vhca_init(struct mlx5_hv_vhca *hv_vhca);
> +void mlx5_hv_vhca_cleanup(struct mlx5_hv_vhca *hv_vhca);
> +void mlx5_hv_vhca_invalidate(void *context, u64 block_mask);
> +
> +struct mlx5_hv_vhca_agent *
> +mlx5_hv_vhca_agent_create(struct mlx5_hv_vhca *hv_vhca,
> +			  enum mlx5_hv_vhca_agent_type type,
> +			  void (*control)(struct mlx5_hv_vhca_agent*,
> +					  struct mlx5_hv_vhca_control_block *block),
> +			  void (*invalidate)(struct mlx5_hv_vhca_agent*,
> +					     u64 block_mask),
> +			  void (*cleanup)(struct mlx5_hv_vhca_agent *agent),
> +			  void *context);
> +
> +void mlx5_hv_vhca_agent_destroy(struct mlx5_hv_vhca_agent *agent);
> +int mlx5_hv_vhca_agent_write(struct mlx5_hv_vhca_agent *agent,
> +			     void *buf, int len);
> +void *mlx5_hv_vhca_agent_priv(struct mlx5_hv_vhca_agent *agent);
> +
> +#else
> +
> +static inline struct mlx5_hv_vhca *
> +mlx5_hv_vhca_create(struct mlx5_core_dev *dev)
> +{
> +	return NULL;
> +}
> +
> +static inline void mlx5_hv_vhca_destroy(struct mlx5_hv_vhca *hv_vhca)
> +{
> +}
> +
> +static inline int mlx5_hv_vhca_init(struct mlx5_hv_vhca *hv_vhca)
> +{
> +	return 0;
> +}
> +
> +static inline void mlx5_hv_vhca_cleanup(struct mlx5_hv_vhca *hv_vhca)
> +{
> +}
> +
> +static inline void mlx5_hv_vhca_invalidate(void *context,
> +					   u64 block_mask)
> +{
> +}
> +
> +static inline struct mlx5_hv_vhca_agent *
> +mlx5_hv_vhca_agent_create(struct mlx5_hv_vhca *hv_vhca,
> +			  enum mlx5_hv_vhca_agent_type type,
> +			  void (*control)(struct mlx5_hv_vhca_agent*,
> +					  struct mlx5_hv_vhca_control_block *block),
> +			  void (*invalidate)(struct mlx5_hv_vhca_agent*,
> +					     u64 block_mask),
> +			  void (*cleanup)(struct mlx5_hv_vhca_agent *agent),
> +			  void *context)
> +{
> +	return NULL;
> +}
> +
> +static inline void mlx5_hv_vhca_agent_destroy(struct mlx5_hv_vhca_agent *agent)
> +{
> +}
> +
> +static inline int
> +mlx5_hv_vhca_write_agent(struct mlx5_hv_vhca_agent *agent,
> +			 void *buf, int len)
> +{
> +	return 0;
> +}
> +#endif
> +
> +#endif /* __LIB_HV_VHCA_H__ */
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
> index 0b70b1d..61388ca 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
> @@ -69,6 +69,7 @@
>  #include "lib/pci_vsc.h"
>  #include "diag/fw_tracer.h"
>  #include "ecpf.h"
> +#include "lib/hv_vhca.h"
>
>  MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
>  MODULE_DESCRIPTION("Mellanox 5th generation network adapters (ConnectX series) core driver");
> @@ -870,6 +871,7 @@ static int mlx5_init_once(struct mlx5_core_dev *dev)
>  	}
>
>  	dev->tracer = mlx5_fw_tracer_create(dev);
> +	dev->hv_vhca = mlx5_hv_vhca_create(dev);
>
>  	return 0;
>
> @@ -900,6 +902,7 @@ static int mlx5_init_once(struct mlx5_core_dev *dev)
>
>  static void mlx5_cleanup_once(struct mlx5_core_dev *dev)
>  {
> +	mlx5_hv_vhca_destroy(dev->hv_vhca);
>  	mlx5_fw_tracer_destroy(dev->tracer);
>  	mlx5_fpga_cleanup(dev);
>  	mlx5_eswitch_cleanup(dev->priv.eswitch);
> @@ -1067,6 +1070,8 @@ static int mlx5_load(struct mlx5_core_dev *dev)
>  		goto err_fw_tracer;
>  	}
>
> +	mlx5_hv_vhca_init(dev->hv_vhca);

What is the point to declare this function as "int ..." if you are not
interested in result?

> +
>  	err = mlx5_fpga_device_start(dev);
>  	if (err) {
>  		mlx5_core_err(dev, "fpga device start failed %d\n", err);
> @@ -1122,6 +1127,7 @@ static int mlx5_load(struct mlx5_core_dev *dev)
>  err_ipsec_start:
>  	mlx5_fpga_device_stop(dev);
>  err_fpga_start:
> +	mlx5_hv_vhca_cleanup(dev->hv_vhca);
>  	mlx5_fw_tracer_cleanup(dev->tracer);
>  err_fw_tracer:
>  	mlx5_eq_table_destroy(dev);
> @@ -1142,6 +1148,7 @@ static void mlx5_unload(struct mlx5_core_dev *dev)
>  	mlx5_accel_ipsec_cleanup(dev);
>  	mlx5_accel_tls_cleanup(dev);
>  	mlx5_fpga_device_stop(dev);
> +	mlx5_hv_vhca_cleanup(dev->hv_vhca);
>  	mlx5_fw_tracer_cleanup(dev->tracer);
>  	mlx5_eq_table_destroy(dev);
>  	mlx5_irq_table_destroy(dev);
> diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
> index df23f17..13b4cf2 100644
> --- a/include/linux/mlx5/driver.h
> +++ b/include/linux/mlx5/driver.h
> @@ -659,6 +659,7 @@ struct mlx5_clock {
>  struct mlx5_fw_tracer;
>  struct mlx5_vxlan;
>  struct mlx5_geneve;
> +struct mlx5_hv_vhca;
>
>  struct mlx5_core_dev {
>  	struct device *device;
> @@ -706,6 +707,7 @@ struct mlx5_core_dev {
>  	struct mlx5_ib_clock_info  *clock_info;
>  	struct mlx5_fw_tracer   *tracer;
>  	u32                      vsc_addr;
> +	struct mlx5_hv_vhca	*hv_vhca;
>  };
>
>  struct mlx5_db {
> --
> 1.8.3.1
>

^ permalink raw reply

* Re: [PATCH net-next] r8152: saving the settings of EEE
From: David Miller @ 2019-08-22 18:52 UTC (permalink / raw)
  To: hayeswang; +Cc: netdev, nic_swsd, linux-kernel
In-Reply-To: <1394712342-15778-304-Taiwan-albertk@realtek.com>

From: Hayes Wang <hayeswang@realtek.com>
Date: Thu, 22 Aug 2019 16:07:18 +0800

> +	if (tp->eee_en) {
> +		r8152_eee_en(tp, true);
> +		r8152_mmd_write(tp, MDIO_MMD_AN, MDIO_AN_EEE_ADV, tp->eee_adv);
> +	} else {
> +		r8152_eee_en(tp, false);
> +		r8152_mmd_write(tp, MDIO_MMD_AN, MDIO_AN_EEE_ADV, 0);
> +	}

I see this same exact code sequence at least 4 times in your patch, please
make a helper function.

^ permalink raw reply

* Re: New skb extension for use by LSMs (skb "security blob")?
From: Casey Schaufler @ 2019-08-22 18:50 UTC (permalink / raw)
  To: David Miller, paul; +Cc: netdev, linux-security-module, selinux, casey
In-Reply-To: <20190821.205454.2103510420957943248.davem@davemloft.net>

On 8/21/2019 8:54 PM, David Miller wrote:
> From: Paul Moore <paul@paul-moore.com>
> Date: Wed, 21 Aug 2019 23:27:03 -0400
>
>> On Wed, Aug 21, 2019 at 6:50 PM David Miller <davem@davemloft.net> wrote:
>>> From: Paul Moore <paul@paul-moore.com>
>>> Date: Wed, 21 Aug 2019 18:00:09 -0400
>>>
>>>> I was just made aware of the skb extension work, and it looks very
>>>> appealing from a LSM perspective.  As some of you probably remember,
>>>> we (the LSM folks) have wanted a proper security blob in the skb for
>>>> quite some time, but netdev has been resistant to this idea thus far.
>>>>
>>>> If I were to propose a patchset to add a SKB_EXT_SECURITY skb
>>>> extension (a single extension ID to be shared among the different
>>>> LSMs), would that be something that netdev would consider merging, or
>>>> is there still a philosophical objection to things like this?
>>> Unlike it's main intended user (MPTCP), it sounds like LSM's would use
>>> this in a way such that it would be enabled on most systems all the
>>> time.

Only SELinux and Smack use the networking hooks today,
although I understand that AppArmor has plans to do so
in the not too distant future. Smack enables labeled
networking at all times. While Smack doesn't have the
expansive use that SELinux does because of Android, it
is used extensively in embedded systems via Tizen and
Yocto Project deployments.

>>> That really defeats the whole purpose of making it dynamic. :-/

It argues that fulfilling the needs of LSMs ought to be a basic
feature of the skb, rather than a dynamic extension. When LSMs were
introduced 20 years ago it was assumed their use would be rare, and
it was. Today almost all Linux systems use LSMs, and once AppArmor
adds network labeling it will be quite difficult to find a major
distribution that doesn't need the support.

>> I would be okay with only adding a skb extension when we needed it,
>> which I'm currently thinking would only be when we had labeled
>> networking actually configured at runtime and not just built into the
>> kernel.  In SELinux we do something similar today when it comes to our
>> per-packet access controls; if labeled networking is not configured we
>> bail out of the LSM hooks early to improve performance (we would just
>> be comparing unlabeled_t to unlabeled_t anyway).  I think the other
>> LSMs would be okay with this usage as well.

Smack uses labeled (CIPSO now, CALIPSO 'soon') networking by default
and depends on it heavily for basic system policy enforcement.

>> While a number of distros due enable some form of LSM and the labeled
>> networking bits at build time, vary few (if any?) provide a default
>> configuration so I would expect no additional overhead in the common
>> case.

Tizen isn't a distro, but neither is Android.

>> Would that be acceptable?
> I honestly don't know, I kinda feared that once the SKB extension went in
> people would start dumping things there and that's exactly what's happening.

As Paul has mentioned, the LSM community (Paul and me in particular)
have been looking for a better way to deal with the network stack
for a long time.

> I just so happened to be reviewing:
>
> 	https://patchwork.ozlabs.org/patch/1150091/
>
> while you were writing this email.
>
> It's rediculous, the vultures are out.


^ permalink raw reply

* Re: [PATCHv4 net 1/2] ipv4/icmp: fix rt dst dev null pointer dereference
From: Julian Anastasov @ 2019-08-22 18:46 UTC (permalink / raw)
  To: Hangbin Liu
  Cc: netdev, Stefano Brivio, wenxu, Alexei Starovoitov,
	David S . Miller, Eric Dumazet
In-Reply-To: <20190822141949.29561-2-liuhangbin@gmail.com>


	Hello,

On Thu, 22 Aug 2019, Hangbin Liu wrote:

> In __icmp_send() there is a possibility that the rt->dst.dev is NULL,
> e,g, with tunnel collect_md mode, which will cause kernel crash.
> Here is what the code path looks like, for GRE:
> 
> - ip6gre_tunnel_xmit
>   - ip6gre_xmit_ipv4
>     - __gre6_xmit
>       - ip6_tnl_xmit
>         - if skb->len - t->tun_hlen - eth_hlen > mtu; return -EMSGSIZE
>     - icmp_send
>       - net = dev_net(rt->dst.dev); <-- here
> 
> The reason is __metadata_dst_init() init dst->dev to NULL by default.
> We could not fix it in __metadata_dst_init() as there is no dev supplied.
> On the other hand, the reason we need rt->dst.dev is to get the net.
> So we can just try get it from skb->dev when rt->dst.dev is NULL.
> 
> v4: Julian Anastasov remind skb->dev also could be NULL. We'd better
> still use dst.dev and do a check to avoid crash.
> 
> v3: No changes.
> 
> v2: fix the issue in __icmp_send() instead of updating shared dst dev
> in {ip_md, ip6}_tunnel_xmit.
> 
> Fixes: c8b34e680a09 ("ip_tunnel: Add tnl_update_pmtu in ip_md_tunnel_xmit")
> Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>

	This patch looks good to me, thanks!

Reviewed-by: Julian Anastasov <ja@ssi.bg>

> ---
>  net/ipv4/icmp.c | 8 +++++++-
>  1 file changed, 7 insertions(+), 1 deletion(-)
> 
> diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
> index 1510e951f451..001f03f76bc4 100644
> --- a/net/ipv4/icmp.c
> +++ b/net/ipv4/icmp.c
> @@ -582,7 +582,13 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
>  
>  	if (!rt)
>  		goto out;
> -	net = dev_net(rt->dst.dev);
> +
> +	if (rt->dst.dev)
> +		net = dev_net(rt->dst.dev);
> +	else if (skb_in->dev)
> +		net = dev_net(skb_in->dev);
> +	else
> +		goto out;
>  
>  	/*
>  	 *	Find the original header. It is expected to be valid, of course.
> -- 
> 2.19.2

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply

* Re: [PATCH bpf-next v5 03/11] xsk: add support to allow unaligned chunk placement
From: Jonathan Lemon @ 2019-08-22 18:43 UTC (permalink / raw)
  To: Kevin Laatz
  Cc: netdev, ast, daniel, bjorn.topel, magnus.karlsson, jakub.kicinski,
	saeedm, maximmi, stephen, bruce.richardson, ciara.loftus, bpf,
	intel-wired-lan
In-Reply-To: <20190822014427.49800-4-kevin.laatz@intel.com>



On 21 Aug 2019, at 18:44, Kevin Laatz wrote:

> Currently, addresses are chunk size aligned. This means, we are very
> restricted in terms of where we can place chunk within the umem. For
> example, if we have a chunk size of 2k, then our chunks can only be 
> placed
> at 0,2k,4k,6k,8k... and so on (ie. every 2k starting from 0).
>
> This patch introduces the ability to use unaligned chunks. With these
> changes, we are no longer bound to having to place chunks at a 2k (or
> whatever your chunk size is) interval. Since we are no longer dealing 
> with
> aligned chunks, they can now cross page boundaries. Checks for page
> contiguity have been added in order to keep track of which pages are
> followed by a physically contiguous page.
>
> Signed-off-by: Kevin Laatz <kevin.laatz@intel.com>
> Signed-off-by: Ciara Loftus <ciara.loftus@intel.com>
> Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
>
> ---
> v2:
>   - Add checks for the flags coming from userspace
>   - Fix how we get chunk_size in xsk_diag.c
>   - Add defines for masking the new descriptor format
>   - Modified the rx functions to use new descriptor format
>   - Modified the tx functions to use new descriptor format
>
> v3:
>   - Add helper function to do address/offset masking/addition
>
> v4:
>   - fixed page_start calculation in __xsk_rcv_memcpy().
>   - move offset handling to the xdp_umem_get_* functions
>   - modified the len field in xdp_umem_reg struct. We now use 16 bits 
> from
>     this for the flags field.
>   - removed next_pg_contig field from xdp_umem_page struct. Using low 
> 12
>     bits of addr to store flags instead.
>   - other minor changes based on review comments
>
> v5:
>   - Added accessors for getting addr and offset
>   - Added helper function to add offset to addr
>   - Fixed offset handling in xsk_rcv
>   - Removed bitfields from xdp_umem_reg
>   - Added struct size checking for xdp_umem_reg in xsk_setsockopt to 
> handle
>     different versions of the struct.
>   - fix conflicts after 'bpf-af-xdp-wakeup' was merged.
> ---
>  include/net/xdp_sock.h      | 75 +++++++++++++++++++++++++++--
>  include/uapi/linux/if_xdp.h |  9 ++++
>  net/xdp/xdp_umem.c          | 19 ++++++--
>  net/xdp/xsk.c               | 96 
> +++++++++++++++++++++++++++++--------
>  net/xdp/xsk_diag.c          |  2 +-
>  net/xdp/xsk_queue.h         | 68 ++++++++++++++++++++++----
>  6 files changed, 232 insertions(+), 37 deletions(-)
>
> diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
> index f023b9940d64..c9398ce7960f 100644
> --- a/include/net/xdp_sock.h
> +++ b/include/net/xdp_sock.h
> @@ -16,6 +16,13 @@
>  struct net_device;
>  struct xsk_queue;
>
> +/* Masks for xdp_umem_page flags.
> + * The low 12-bits of the addr will be 0 since this is the page 
> address, so we
> + * can use them for flags.
> + */
> +#define XSK_NEXT_PG_CONTIG_SHIFT 0
> +#define XSK_NEXT_PG_CONTIG_MASK (1ULL << XSK_NEXT_PG_CONTIG_SHIFT)
> +
>  struct xdp_umem_page {
>  	void *addr;
>  	dma_addr_t dma;
> @@ -27,8 +34,12 @@ struct xdp_umem_fq_reuse {
>  	u64 handles[];
>  };
>
> -/* Flags for the umem flags field. */
> -#define XDP_UMEM_USES_NEED_WAKEUP (1 << 0)
> +/* Flags for the umem flags field.
> + *
> + * The NEED_WAKEUP flag is 1 due to the reuse of the flags field for 
> public
> + * flags. See inlude/uapi/include/linux/if_xdp.h.
> + */
> +#define XDP_UMEM_USES_NEED_WAKEUP (1 << 1)
>
>  struct xdp_umem {
>  	struct xsk_queue *fq;
> @@ -124,14 +135,36 @@ void xsk_map_try_sock_delete(struct xsk_map 
> *map, struct xdp_sock *xs,
>  int xsk_map_inc(struct xsk_map *map);
>  void xsk_map_put(struct xsk_map *map);
>
> +static inline u64 xsk_umem_extract_addr(u64 addr)
> +{
> +	return addr & XSK_UNALIGNED_BUF_ADDR_MASK;
> +}
> +
> +static inline u64 xsk_umem_extract_offset(u64 addr)
> +{
> +	return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT;
> +}
> +
> +static inline u64 xsk_umem_add_offset_to_addr(u64 addr)
> +{
> +	return xsk_umem_extract_addr(addr) + xsk_umem_extract_offset(addr);
> +}
> +
>  static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 
> addr)
>  {
> -	return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 
> 1));
> +	unsigned long page_addr;
> +
> +	addr = xsk_umem_add_offset_to_addr(addr);
> +	page_addr = (unsigned long)umem->pages[addr >> PAGE_SHIFT].addr;
> +
> +	return (char *)(page_addr & PAGE_MASK) + (addr & ~PAGE_MASK);
>  }
>
>  static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 
> addr)
>  {
> -	return umem->pages[addr >> PAGE_SHIFT].dma + (addr & (PAGE_SIZE - 
> 1));
> +	addr = xsk_umem_add_offset_to_addr(addr);
> +
> +	return umem->pages[addr >> PAGE_SHIFT].dma + (addr & ~PAGE_MASK);
>  }
>
>  /* Reuse-queue aware version of FILL queue helpers */
> @@ -172,6 +205,19 @@ static inline void xsk_umem_fq_reuse(struct 
> xdp_umem *umem, u64 addr)
>
>  	rq->handles[rq->length++] = addr;
>  }
> +
> +/* Handle the offset appropriately depending on aligned or unaligned 
> mode.
> + * For unaligned mode, we store the offset in the upper 16-bits of 
> the address.
> + * For aligned mode, we simply add the offset to the address.
> + */
> +static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 
> address,
> +					 u64 offset)
> +{
> +	if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG)
> +		return address + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);
> +	else
> +		return address + offset;
> +}
>  #else
>  static inline int xsk_generic_rcv(struct xdp_sock *xs, struct 
> xdp_buff *xdp)
>  {
> @@ -241,6 +287,21 @@ static inline struct xdp_umem 
> *xdp_get_umem_from_qid(struct net_device *dev,
>  	return NULL;
>  }
>
> +static inline u64 xsk_umem_extract_addr(u64 addr)
> +{
> +	return 0;
> +}
> +
> +static inline u64 xsk_umem_extract_offset(u64 addr)
> +{
> +	return 0;
> +}
> +
> +static inline u64 xsk_umem_add_offset_to_addr(u64 addr)
> +{
> +	return 0;
> +}
> +
>  static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 
> addr)
>  {
>  	return NULL;
> @@ -290,6 +351,12 @@ static inline bool 
> xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
>  	return false;
>  }
>
> +static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 
> handle,
> +					 u64 offset)
> +{
> +	return 0;
> +}
> +
>  #endif /* CONFIG_XDP_SOCKETS */
>
>  #endif /* _LINUX_XDP_SOCK_H */
> diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
> index 62b80d57b72a..be328c59389d 100644
> --- a/include/uapi/linux/if_xdp.h
> +++ b/include/uapi/linux/if_xdp.h
> @@ -26,6 +26,9 @@
>   */
>  #define XDP_USE_NEED_WAKEUP (1 << 3)
>
> +/* Flags for xsk_umem_config flags */
> +#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0)
> +
>  struct sockaddr_xdp {
>  	__u16 sxdp_family;
>  	__u16 sxdp_flags;
> @@ -66,6 +69,7 @@ struct xdp_umem_reg {
>  	__u64 len; /* Length of packet data area */
>  	__u32 chunk_size;
>  	__u32 headroom;
> +	__u32 flags;
>  };
>
>  struct xdp_statistics {
> @@ -87,6 +91,11 @@ struct xdp_options {
>  #define XDP_UMEM_PGOFF_FILL_RING	0x100000000ULL
>  #define XDP_UMEM_PGOFF_COMPLETION_RING	0x180000000ULL
>
> +/* Masks for unaligned chunks mode */
> +#define XSK_UNALIGNED_BUF_OFFSET_SHIFT 48
> +#define XSK_UNALIGNED_BUF_ADDR_MASK \
> +	((1ULL << XSK_UNALIGNED_BUF_OFFSET_SHIFT) - 1)
> +
>  /* Rx/Tx descriptor */
>  struct xdp_desc {
>  	__u64 addr;
> diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
> index 2d65779282a1..e997b263a0dd 100644
> --- a/net/xdp/xdp_umem.c
> +++ b/net/xdp/xdp_umem.c
> @@ -340,6 +340,7 @@ static int xdp_umem_account_pages(struct xdp_umem 
> *umem)
>
>  static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg 
> *mr)
>  {
> +	bool unaligned_chunks = mr->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG;
>  	u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
>  	unsigned int chunks, chunks_per_page;
>  	u64 addr = mr->addr, size = mr->len;
> @@ -355,7 +356,11 @@ static int xdp_umem_reg(struct xdp_umem *umem, 
> struct xdp_umem_reg *mr)
>  		return -EINVAL;
>  	}
>
> -	if (!is_power_of_2(chunk_size))
> +	if (mr->flags & ~(XDP_UMEM_UNALIGNED_CHUNK_FLAG |
> +			XDP_UMEM_USES_NEED_WAKEUP))
> +		return -EINVAL;
> +
> +	if (!unaligned_chunks && !is_power_of_2(chunk_size))
>  		return -EINVAL;
>
>  	if (!PAGE_ALIGNED(addr)) {
> @@ -372,9 +377,11 @@ static int xdp_umem_reg(struct xdp_umem *umem, 
> struct xdp_umem_reg *mr)
>  	if (chunks == 0)
>  		return -EINVAL;
>
> -	chunks_per_page = PAGE_SIZE / chunk_size;
> -	if (chunks < chunks_per_page || chunks % chunks_per_page)
> -		return -EINVAL;
> +	if (!unaligned_chunks) {
> +		chunks_per_page = PAGE_SIZE / chunk_size;
> +		if (chunks < chunks_per_page || chunks % chunks_per_page)
> +			return -EINVAL;
> +	}
>
>  	headroom = ALIGN(headroom, 64);
>
> @@ -383,13 +390,15 @@ static int xdp_umem_reg(struct xdp_umem *umem, 
> struct xdp_umem_reg *mr)
>  		return -EINVAL;
>
>  	umem->address = (unsigned long)addr;
> -	umem->chunk_mask = ~((u64)chunk_size - 1);
> +	umem->chunk_mask = unaligned_chunks ? XSK_UNALIGNED_BUF_ADDR_MASK
> +					    : ~((u64)chunk_size - 1);
>  	umem->size = size;
>  	umem->headroom = headroom;
>  	umem->chunk_size_nohr = chunk_size - headroom;
>  	umem->npgs = size / PAGE_SIZE;
>  	umem->pgs = NULL;
>  	umem->user = NULL;
> +	umem->flags = mr->flags;
>  	INIT_LIST_HEAD(&umem->xsk_list);
>  	spin_lock_init(&umem->xsk_list_lock);
>
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index ee4428a892fa..907e5f12338f 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -45,7 +45,7 @@ EXPORT_SYMBOL(xsk_umem_has_addrs);
>
>  u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
>  {
> -	return xskq_peek_addr(umem->fq, addr);
> +	return xskq_peek_addr(umem->fq, addr, umem);
>  }
>  EXPORT_SYMBOL(xsk_umem_peek_addr);
>
> @@ -115,21 +115,43 @@ bool xsk_umem_uses_need_wakeup(struct xdp_umem 
> *umem)
>  }
>  EXPORT_SYMBOL(xsk_umem_uses_need_wakeup);
>
> +/* If a buffer crosses a page boundary, we need to do 2 memcpy's, one 
> for
> + * each page. This is only required in copy mode.
> + */
> +static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void 
> *from_buf,
> +			     u32 len, u32 metalen)
> +{
> +	void *to_buf = xdp_umem_get_data(umem, addr);
> +
> +	addr = xsk_umem_add_offset_to_addr(addr);
> +	if (xskq_crosses_non_contig_pg(umem, addr, len + metalen)) {
> +		void *next_pg_addr = umem->pages[(addr >> PAGE_SHIFT) + 1].addr;
> +		u64 page_start = addr & ~(PAGE_SIZE - 1);
> +		u64 first_len = PAGE_SIZE - (addr - page_start);
> +
> +		memcpy(to_buf, from_buf, first_len + metalen);
> +		memcpy(next_pg_addr, from_buf + first_len, len - first_len);
> +
> +		return;
> +	}
> +
> +	memcpy(to_buf, from_buf, len + metalen);
> +}
> +
>  static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 
> len)
>  {
> -	void *to_buf, *from_buf;
> +	u64 offset = xs->umem->headroom;
> +	u64 addr, memcpy_addr;
> +	void *from_buf;
>  	u32 metalen;
> -	u64 addr;
>  	int err;
>
> -	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
> +	if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) ||
>  	    len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
>  		xs->rx_dropped++;
>  		return -ENOSPC;
>  	}
>
> -	addr += xs->umem->headroom;
> -
>  	if (unlikely(xdp_data_meta_unsupported(xdp))) {
>  		from_buf = xdp->data;
>  		metalen = 0;
> @@ -138,9 +160,11 @@ static int __xsk_rcv(struct xdp_sock *xs, struct 
> xdp_buff *xdp, u32 len)
>  		metalen = xdp->data - xdp->data_meta;
>  	}
>
> -	to_buf = xdp_umem_get_data(xs->umem, addr);
> -	memcpy(to_buf, from_buf, len + metalen);
> -	addr += metalen;
> +	memcpy_addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
> +	__xsk_rcv_memcpy(xs->umem, memcpy_addr, from_buf, len, metalen);
> +
> +	offset += metalen;
> +	addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
>  	err = xskq_produce_batch_desc(xs->rx, addr, len);
>  	if (!err) {
>  		xskq_discard_addr(xs->umem->fq);
> @@ -185,6 +209,7 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct 
> xdp_buff *xdp)
>  {
>  	u32 metalen = xdp->data - xdp->data_meta;
>  	u32 len = xdp->data_end - xdp->data;
> +	u64 offset = xs->umem->headroom;
>  	void *buffer;
>  	u64 addr;
>  	int err;
> @@ -196,17 +221,17 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct 
> xdp_buff *xdp)
>  		goto out_unlock;
>  	}
>
> -	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
> +	if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) ||
>  	    len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
>  		err = -ENOSPC;
>  		goto out_drop;
>  	}
>
> -	addr += xs->umem->headroom;
> -
> -	buffer = xdp_umem_get_data(xs->umem, addr);
> +	buffer = xdp_umem_get_data(xs->umem, addr + offset);
>  	memcpy(buffer, xdp->data_meta, len + metalen);
> -	addr += metalen;
> +	offset += metalen;
> +
> +	addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
>  	err = xskq_produce_batch_desc(xs->rx, addr, len);
>  	if (err)
>  		goto out_drop;

Can't just add address and offset any longer.  This should read:

	addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
	buffer = xdp_umem_get_data(xs->umem, addr);

	addr = xsk_umem_adjust_offset(xs->umem, addr, metalen);
	

so that offset and then metalen are added.  (or preserve the
address across the calls like memcpy_addr earlier).
-- 
Jonathan

> @@ -250,7 +275,7 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, 
> struct xdp_desc *desc)
>
>  	rcu_read_lock();
>  	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
> -		if (!xskq_peek_desc(xs->tx, desc))
> +		if (!xskq_peek_desc(xs->tx, desc, umem))
>  			continue;
>
>  		if (xskq_produce_addr_lazy(umem->cq, desc->addr))
> @@ -304,7 +329,7 @@ static int xsk_generic_xmit(struct sock *sk, 
> struct msghdr *m,
>  	if (xs->queue_id >= xs->dev->real_num_tx_queues)
>  		goto out;
>
> -	while (xskq_peek_desc(xs->tx, &desc)) {
> +	while (xskq_peek_desc(xs->tx, &desc, xs->umem)) {
>  		char *buffer;
>  		u64 addr;
>  		u32 len;
> @@ -333,7 +358,7 @@ static int xsk_generic_xmit(struct sock *sk, 
> struct msghdr *m,
>  		skb->dev = xs->dev;
>  		skb->priority = sk->sk_priority;
>  		skb->mark = sk->sk_mark;
> -		skb_shinfo(skb)->destructor_arg = (void *)(long)addr;
> +		skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
>  		skb->destructor = xsk_destruct_skb;
>
>  		err = dev_direct_xmit(skb, xs->queue_id);
> @@ -526,6 +551,24 @@ static struct socket *xsk_lookup_xsk_from_fd(int 
> fd)
>  	return sock;
>  }
>
> +/* Check if umem pages are contiguous.
> + * If zero-copy mode, use the DMA address to do the page contiguity 
> check
> + * For all other modes we use addr (kernel virtual address)
> + * Store the result in the low bits of addr.
> + */
> +static void xsk_check_page_contiguity(struct xdp_umem *umem, u32 
> flags)
> +{
> +	struct xdp_umem_page *pgs = umem->pages;
> +	int i, is_contig;
> +
> +	for (i = 0; i < umem->npgs - 1; i++) {
> +		is_contig = (flags & XDP_ZEROCOPY) ?
> +			(pgs[i].dma + PAGE_SIZE == pgs[i + 1].dma) :
> +			(pgs[i].addr + PAGE_SIZE == pgs[i + 1].addr);
> +		pgs[i].addr += is_contig << XSK_NEXT_PG_CONTIG_SHIFT;
> +	}
> +}
> +
>  static int xsk_bind(struct socket *sock, struct sockaddr *addr, int 
> addr_len)
>  {
>  	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
> @@ -616,6 +659,8 @@ static int xsk_bind(struct socket *sock, struct 
> sockaddr *addr, int addr_len)
>  		err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
>  		if (err)
>  			goto out_unlock;
> +
> +		xsk_check_page_contiguity(xs->umem, flags);
>  	}
>
>  	xs->dev = dev;
> @@ -636,6 +681,13 @@ static int xsk_bind(struct socket *sock, struct 
> sockaddr *addr, int addr_len)
>  	return err;
>  }
>
> +struct xdp_umem_reg_v1 {
> +	__u64 addr; /* Start of packet data area */
> +	__u64 len; /* Length of packet data area */
> +	__u32 chunk_size;
> +	__u32 headroom;
> +};
> +
>  static int xsk_setsockopt(struct socket *sock, int level, int 
> optname,
>  			  char __user *optval, unsigned int optlen)
>  {
> @@ -673,10 +725,16 @@ static int xsk_setsockopt(struct socket *sock, 
> int level, int optname,
>  	}
>  	case XDP_UMEM_REG:
>  	{
> -		struct xdp_umem_reg mr;
> +		size_t mr_size = sizeof(struct xdp_umem_reg);
> +		struct xdp_umem_reg mr = {};
>  		struct xdp_umem *umem;
>
> -		if (copy_from_user(&mr, optval, sizeof(mr)))
> +		if (optlen < sizeof(struct xdp_umem_reg_v1))
> +			return -EINVAL;
> +		else if (optlen < sizeof(mr))
> +			mr_size = sizeof(struct xdp_umem_reg_v1);
> +
> +		if (copy_from_user(&mr, optval, mr_size))
>  			return -EFAULT;
>
>  		mutex_lock(&xs->mutex);
> diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c
> index d5e06c8e0cbf..9986a759fe06 100644
> --- a/net/xdp/xsk_diag.c
> +++ b/net/xdp/xsk_diag.c
> @@ -56,7 +56,7 @@ static int xsk_diag_put_umem(const struct xdp_sock 
> *xs, struct sk_buff *nlskb)
>  	du.id = umem->id;
>  	du.size = umem->size;
>  	du.num_pages = umem->npgs;
> -	du.chunk_size = (__u32)(~umem->chunk_mask + 1);
> +	du.chunk_size = umem->chunk_size_nohr + umem->headroom;
>  	du.headroom = umem->headroom;
>  	du.ifindex = umem->dev ? umem->dev->ifindex : 0;
>  	du.queue_id = umem->queue_id;
> diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
> index dd9e985c2461..6c67c9d0294f 100644
> --- a/net/xdp/xsk_queue.h
> +++ b/net/xdp/xsk_queue.h
> @@ -134,6 +134,17 @@ static inline bool xskq_has_addrs(struct 
> xsk_queue *q, u32 cnt)
>
>  /* UMEM queue */
>
> +static inline bool xskq_crosses_non_contig_pg(struct xdp_umem *umem, 
> u64 addr,
> +					      u64 length)
> +{
> +	bool cross_pg = (addr & (PAGE_SIZE - 1)) + length > PAGE_SIZE;
> +	bool next_pg_contig =
> +		(unsigned long)umem->pages[(addr >> PAGE_SHIFT)].addr &
> +			XSK_NEXT_PG_CONTIG_MASK;
> +
> +	return cross_pg && !next_pg_contig;
> +}
> +
>  static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr)
>  {
>  	if (addr >= q->size) {
> @@ -144,23 +155,49 @@ static inline bool xskq_is_valid_addr(struct 
> xsk_queue *q, u64 addr)
>  	return true;
>  }
>
> -static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr)
> +static inline bool xskq_is_valid_addr_unaligned(struct xsk_queue *q, 
> u64 addr,
> +						u64 length,
> +						struct xdp_umem *umem)
> +{
> +	addr = xsk_umem_add_offset_to_addr(addr);
> +	if (addr >= q->size ||
> +	    xskq_crosses_non_contig_pg(umem, addr, length)) {
> +		q->invalid_descs++;
> +		return false;
> +	}
> +
> +	return true;
> +}
> +
> +static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr,
> +				      struct xdp_umem *umem)
>  {
>  	while (q->cons_tail != q->cons_head) {
>  		struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
>  		unsigned int idx = q->cons_tail & q->ring_mask;
>
>  		*addr = READ_ONCE(ring->desc[idx]) & q->chunk_mask;
> +
> +		if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) {
> +			if (xskq_is_valid_addr_unaligned(q, *addr,
> +							 umem->chunk_size_nohr,
> +							 umem))
> +				return addr;
> +			goto out;
> +		}
> +
>  		if (xskq_is_valid_addr(q, *addr))
>  			return addr;
>
> +out:
>  		q->cons_tail++;
>  	}
>
>  	return NULL;
>  }
>
> -static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr)
> +static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr,
> +				  struct xdp_umem *umem)
>  {
>  	if (q->cons_tail == q->cons_head) {
>  		smp_mb(); /* D, matches A */
> @@ -171,7 +208,7 @@ static inline u64 *xskq_peek_addr(struct xsk_queue 
> *q, u64 *addr)
>  		smp_rmb();
>  	}
>
> -	return xskq_validate_addr(q, addr);
> +	return xskq_validate_addr(q, addr, umem);
>  }
>
>  static inline void xskq_discard_addr(struct xsk_queue *q)
> @@ -230,8 +267,21 @@ static inline int xskq_reserve_addr(struct 
> xsk_queue *q)
>
>  /* Rx/Tx queue */
>
> -static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct 
> xdp_desc *d)
> +static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct 
> xdp_desc *d,
> +				      struct xdp_umem *umem)
>  {
> +	if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) {
> +		if (!xskq_is_valid_addr_unaligned(q, d->addr, d->len, umem))
> +			return false;
> +
> +		if (d->len > umem->chunk_size_nohr || d->options) {
> +			q->invalid_descs++;
> +			return false;
> +		}
> +
> +		return true;
> +	}
> +
>  	if (!xskq_is_valid_addr(q, d->addr))
>  		return false;
>
> @@ -245,14 +295,15 @@ static inline bool xskq_is_valid_desc(struct 
> xsk_queue *q, struct xdp_desc *d)
>  }
>
>  static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue 
> *q,
> -						  struct xdp_desc *desc)
> +						  struct xdp_desc *desc,
> +						  struct xdp_umem *umem)
>  {
>  	while (q->cons_tail != q->cons_head) {
>  		struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
>  		unsigned int idx = q->cons_tail & q->ring_mask;
>
>  		*desc = READ_ONCE(ring->desc[idx]);
> -		if (xskq_is_valid_desc(q, desc))
> +		if (xskq_is_valid_desc(q, desc, umem))
>  			return desc;
>
>  		q->cons_tail++;
> @@ -262,7 +313,8 @@ static inline struct xdp_desc 
> *xskq_validate_desc(struct xsk_queue *q,
>  }
>
>  static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q,
> -					      struct xdp_desc *desc)
> +					      struct xdp_desc *desc,
> +					      struct xdp_umem *umem)
>  {
>  	if (q->cons_tail == q->cons_head) {
>  		smp_mb(); /* D, matches A */
> @@ -273,7 +325,7 @@ static inline struct xdp_desc 
> *xskq_peek_desc(struct xsk_queue *q,
>  		smp_rmb(); /* C, matches B */
>  	}
>
> -	return xskq_validate_desc(q, desc);
> +	return xskq_validate_desc(q, desc, umem);
>  }
>
>  static inline void xskq_discard_desc(struct xsk_queue *q)
> -- 
> 2.17.1

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox