Netdev List

Netdev List
 help / color / mirror / Atom feed

* [patch net-next 09/16] mlxsw: spectrum_switchdev: Disable mdb when mc is disabled
From: Jiri Pirko @ 2017-09-20 14:15 UTC (permalink / raw)
  To: netdev; +Cc: davem, nogahf, idosch, mlxsw
In-Reply-To: <20170920141516.1402-1-jiri@resnulli.us>

From: Nogah Frankel <nogahf@mellanox.com>

Remove all the mdb entries from the HW when mc is being disabled and
re-write them when it is being enabled.

Signed-off-by: Nogah Frankel <nogahf@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 .../ethernet/mellanox/mlxsw/spectrum_switchdev.c   | 41 +++++++++++++++++++---
 1 file changed, 37 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
index cea257a..79806af 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
@@ -121,6 +121,11 @@ mlxsw_sp_bridge_port_fdb_flush(struct mlxsw_sp *mlxsw_sp,
 			       struct mlxsw_sp_bridge_port *bridge_port,
 			       u16 fid_index);
 
+static void
+mlxsw_sp_bridge_mdb_mc_enable_sync(struct mlxsw_sp_port *mlxsw_sp_port,
+				   struct mlxsw_sp_bridge_device
+				   *bridge_device);
+
 static struct mlxsw_sp_bridge_device *
 mlxsw_sp_bridge_device_find(const struct mlxsw_sp_bridge *bridge,
 			    const struct net_device *br_dev)
@@ -757,6 +762,12 @@ static int mlxsw_sp_port_mc_disabled_set(struct mlxsw_sp_port *mlxsw_sp_port,
 	if (!bridge_device)
 		return 0;
 
+	if (bridge_device->multicast_enabled != !mc_disabled) {
+		bridge_device->multicast_enabled = !mc_disabled;
+		mlxsw_sp_bridge_mdb_mc_enable_sync(mlxsw_sp_port,
+						   bridge_device);
+	}
+
 	list_for_each_entry(bridge_port, &bridge_device->ports_list, list) {
 		enum mlxsw_sp_flood_type packet_type = MLXSW_SP_FLOOD_TYPE_MC;
 		bool member = mc_disabled ? true : bridge_port->mrouter;
@@ -1207,9 +1218,8 @@ static int mlxsw_sp_port_mdb_op(struct mlxsw_sp *mlxsw_sp, const char *addr,
 	return err;
 }
 
-/* clean the an entry from the HW and write there a full new entry */
-static int mlxsw_sp_port_smid_full_entry(struct mlxsw_sp *mlxsw_sp,
-					 u16 mid_idx)
+static int mlxsw_sp_port_smid_full_entry(struct mlxsw_sp *mlxsw_sp, u16 mid_idx,
+					 long *ports_bitmap)
 {
 	char *smid_pl;
 	int err, i;
@@ -1224,6 +1234,9 @@ static int mlxsw_sp_port_smid_full_entry(struct mlxsw_sp *mlxsw_sp,
 			mlxsw_reg_smid_port_mask_set(smid_pl, i, 1);
 	}
 
+	for_each_set_bit(i, ports_bitmap, mlxsw_core_max_ports(mlxsw_sp->core))
+		mlxsw_reg_smid_port_set(smid_pl, i, 1);
+
 	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(smid), smid_pl);
 	kfree(smid_pl);
 	return err;
@@ -1273,7 +1286,8 @@ mlxsw_sp_mc_write_mdb_entry(struct mlxsw_sp *mlxsw_sp,
 		return false;
 
 	mid->mid = mid_idx;
-	err = mlxsw_sp_port_smid_full_entry(mlxsw_sp, mid_idx);
+	err = mlxsw_sp_port_smid_full_entry(mlxsw_sp, mid_idx,
+					    mid->ports_in_mid);
 	if (err)
 		return false;
 
@@ -1414,6 +1428,25 @@ static int mlxsw_sp_port_mdb_add(struct mlxsw_sp_port *mlxsw_sp_port,
 	return err;
 }
 
+static void
+mlxsw_sp_bridge_mdb_mc_enable_sync(struct mlxsw_sp_port *mlxsw_sp_port,
+				   struct mlxsw_sp_bridge_device
+				   *bridge_device)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	struct mlxsw_sp_mid *mid;
+	bool mc_enabled;
+
+	mc_enabled = bridge_device->multicast_enabled;
+
+	list_for_each_entry(mid, &bridge_device->mids_list, list) {
+		if (mc_enabled)
+			mlxsw_sp_mc_write_mdb_entry(mlxsw_sp, mid);
+		else
+			mlxsw_sp_mc_remove_mdb_entry(mlxsw_sp, mid);
+	}
+}
+
 static int mlxsw_sp_port_obj_add(struct net_device *dev,
 				 const struct switchdev_obj *obj,
 				 struct switchdev_trans *trans)
-- 
2.9.5

^ permalink raw reply related

* [patch net-next 10/16] mlxsw: spectrum_switchdev: Use generic mc flood function
From: Jiri Pirko @ 2017-09-20 14:15 UTC (permalink / raw)
  To: netdev; +Cc: davem, nogahf, idosch, mlxsw
In-Reply-To: <20170920141516.1402-1-jiri@resnulli.us>

From: Nogah Frankel <nogahf@mellanox.com>

Use the generic mc flood function to decide whether to flood mc to a port
when mc is being enabled / disabled.
Move this function in the file to avoid forward declaration.

Signed-off-by: Nogah Frankel <nogahf@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 .../net/ethernet/mellanox/mlxsw/spectrum_switchdev.c   | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
index 79806af..19ac206 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
@@ -742,6 +742,14 @@ static int mlxsw_sp_port_attr_mrouter_set(struct mlxsw_sp_port *mlxsw_sp_port,
 	return 0;
 }
 
+static bool mlxsw_sp_mc_flood(const struct mlxsw_sp_bridge_port *bridge_port)
+{
+	const struct mlxsw_sp_bridge_device *bridge_device;
+
+	bridge_device = bridge_port->bridge_device;
+	return !bridge_device->multicast_enabled ? true : bridge_port->mrouter;
+}
+
 static int mlxsw_sp_port_mc_disabled_set(struct mlxsw_sp_port *mlxsw_sp_port,
 					 struct switchdev_trans *trans,
 					 struct net_device *orig_dev,
@@ -770,7 +778,7 @@ static int mlxsw_sp_port_mc_disabled_set(struct mlxsw_sp_port *mlxsw_sp_port,
 
 	list_for_each_entry(bridge_port, &bridge_device->ports_list, list) {
 		enum mlxsw_sp_flood_type packet_type = MLXSW_SP_FLOOD_TYPE_MC;
-		bool member = mc_disabled ? true : bridge_port->mrouter;
+		bool member = mlxsw_sp_mc_flood(bridge_port);
 
 		err = mlxsw_sp_bridge_port_flood_table_set(mlxsw_sp_port,
 							   bridge_port,
@@ -829,14 +837,6 @@ static int mlxsw_sp_port_attr_set(struct net_device *dev,
 	return err;
 }
 
-static bool mlxsw_sp_mc_flood(const struct mlxsw_sp_bridge_port *bridge_port)
-{
-	const struct mlxsw_sp_bridge_device *bridge_device;
-
-	bridge_device = bridge_port->bridge_device;
-	return !bridge_device->multicast_enabled ? true : bridge_port->mrouter;
-}
-
 static int
 mlxsw_sp_port_vlan_fid_join(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan,
 			    struct mlxsw_sp_bridge_port *bridge_port)
-- 
2.9.5

^ permalink raw reply related

* [patch net-next 11/16] mlxsw: spectrum_switchdev: Flood mc when mc is disabled by user flag
From: Jiri Pirko @ 2017-09-20 14:15 UTC (permalink / raw)
  To: netdev; +Cc: davem, nogahf, idosch, mlxsw
In-Reply-To: <20170920141516.1402-1-jiri@resnulli.us>

From: Nogah Frankel <nogahf@mellanox.com>

When multicast is disabled, flood mc packets only to ports that are marked
BR_MCAST_FLOOD (instead of to all ports).

Signed-off-by: Nogah Frankel <nogahf@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 .../ethernet/mellanox/mlxsw/spectrum_switchdev.c    | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
index 19ac206..50c4d7c 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
@@ -262,7 +262,8 @@ mlxsw_sp_bridge_port_create(struct mlxsw_sp_bridge_device *bridge_device,
 	bridge_port->dev = brport_dev;
 	bridge_port->bridge_device = bridge_device;
 	bridge_port->stp_state = BR_STATE_DISABLED;
-	bridge_port->flags = BR_LEARNING | BR_FLOOD | BR_LEARNING_SYNC;
+	bridge_port->flags = BR_LEARNING | BR_FLOOD | BR_LEARNING_SYNC |
+			     BR_MCAST_FLOOD;
 	INIT_LIST_HEAD(&bridge_port->vlans_list);
 	list_add(&bridge_port->list, &bridge_device->ports_list);
 	bridge_port->ref_count = 1;
@@ -468,7 +469,8 @@ static int mlxsw_sp_port_attr_get(struct net_device *dev,
 					       &attr->u.brport_flags);
 		break;
 	case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS_SUPPORT:
-		attr->u.brport_flags_support = BR_LEARNING | BR_FLOOD;
+		attr->u.brport_flags_support = BR_LEARNING | BR_FLOOD |
+					       BR_MCAST_FLOOD;
 		break;
 	default:
 		return -EOPNOTSUPP;
@@ -653,8 +655,18 @@ static int mlxsw_sp_port_attr_br_flags_set(struct mlxsw_sp_port *mlxsw_sp_port,
 	if (err)
 		return err;
 
-	memcpy(&bridge_port->flags, &brport_flags, sizeof(brport_flags));
+	if (bridge_port->bridge_device->multicast_enabled)
+		goto out;
 
+	err = mlxsw_sp_bridge_port_flood_table_set(mlxsw_sp_port, bridge_port,
+						   MLXSW_SP_FLOOD_TYPE_MC,
+						   brport_flags &
+						   BR_MCAST_FLOOD);
+	if (err)
+		return err;
+
+out:
+	memcpy(&bridge_port->flags, &brport_flags, sizeof(brport_flags));
 	return 0;
 }
 
@@ -747,7 +759,8 @@ static bool mlxsw_sp_mc_flood(const struct mlxsw_sp_bridge_port *bridge_port)
 	const struct mlxsw_sp_bridge_device *bridge_device;
 
 	bridge_device = bridge_port->bridge_device;
-	return !bridge_device->multicast_enabled ? true : bridge_port->mrouter;
+	return bridge_device->multicast_enabled ? bridge_port->mrouter :
+					bridge_port->flags & BR_MCAST_FLOOD;
 }
 
 static int mlxsw_sp_port_mc_disabled_set(struct mlxsw_sp_port *mlxsw_sp_port,
-- 
2.9.5

^ permalink raw reply related

* [patch net-next 12/16] mlxsw: spectrum_switchdev: Flush the mdb when a port is being removed
From: Jiri Pirko @ 2017-09-20 14:15 UTC (permalink / raw)
  To: netdev; +Cc: davem, nogahf, idosch, mlxsw
In-Reply-To: <20170920141516.1402-1-jiri@resnulli.us>

From: Nogah Frankel <nogahf@mellanox.com>

When a port is being removed from a bridge, flush the bridge mdb to remove
the mids of that port.

Signed-off-by: Nogah Frankel <nogahf@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 .../ethernet/mellanox/mlxsw/spectrum_switchdev.c   | 39 ++++++++++++++++------
 1 file changed, 29 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
index 50c4d7c..bc07873 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
@@ -122,6 +122,10 @@ mlxsw_sp_bridge_port_fdb_flush(struct mlxsw_sp *mlxsw_sp,
 			       u16 fid_index);
 
 static void
+mlxsw_sp_bridge_port_mdb_flush(struct mlxsw_sp_port *mlxsw_sp_port,
+			       struct mlxsw_sp_bridge_port *bridge_port);
+
+static void
 mlxsw_sp_bridge_mdb_mc_enable_sync(struct mlxsw_sp_port *mlxsw_sp_port,
 				   struct mlxsw_sp_bridge_device
 				   *bridge_device);
@@ -176,17 +180,11 @@ static void
 mlxsw_sp_bridge_device_destroy(struct mlxsw_sp_bridge *bridge,
 			       struct mlxsw_sp_bridge_device *bridge_device)
 {
-	struct mlxsw_sp_mid *mid, *tmp;
-
 	list_del(&bridge_device->list);
 	if (bridge_device->vlan_enabled)
 		bridge->vlan_enabled_exists = false;
 	WARN_ON(!list_empty(&bridge_device->ports_list));
-	list_for_each_entry_safe(mid, tmp, &bridge_device->mids_list, list) {
-		list_del(&mid->list);
-		clear_bit(mid->mid, bridge->mids_bitmap);
-		kfree(mid);
-	}
+	WARN_ON(!list_empty(&bridge_device->mids_list));
 	kfree(bridge_device);
 }
 
@@ -987,24 +985,28 @@ mlxsw_sp_port_vlan_bridge_leave(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan)
 	struct mlxsw_sp_bridge_vlan *bridge_vlan;
 	struct mlxsw_sp_bridge_port *bridge_port;
 	u16 vid = mlxsw_sp_port_vlan->vid;
-	bool last;
+	bool last_port, last_vlan;
 
 	if (WARN_ON(mlxsw_sp_fid_type(fid) != MLXSW_SP_FID_TYPE_8021Q &&
 		    mlxsw_sp_fid_type(fid) != MLXSW_SP_FID_TYPE_8021D))
 		return;
 
 	bridge_port = mlxsw_sp_port_vlan->bridge_port;
+	last_vlan = list_is_singular(&bridge_port->vlans_list);
 	bridge_vlan = mlxsw_sp_bridge_vlan_find(bridge_port, vid);
-	last = list_is_singular(&bridge_vlan->port_vlan_list);
+	last_port = list_is_singular(&bridge_vlan->port_vlan_list);
 
 	list_del(&mlxsw_sp_port_vlan->bridge_vlan_node);
 	mlxsw_sp_bridge_vlan_put(bridge_vlan);
 	mlxsw_sp_port_vid_stp_set(mlxsw_sp_port, vid, BR_STATE_DISABLED);
 	mlxsw_sp_port_vid_learning_set(mlxsw_sp_port, vid, false);
-	if (last)
+	if (last_port)
 		mlxsw_sp_bridge_port_fdb_flush(mlxsw_sp_port->mlxsw_sp,
 					       bridge_port,
 					       mlxsw_sp_fid_index(fid));
+	if (last_vlan)
+		mlxsw_sp_bridge_port_mdb_flush(mlxsw_sp_port, bridge_port);
+
 	mlxsw_sp_port_vlan_fid_leave(mlxsw_sp_port_vlan);
 
 	mlxsw_sp_bridge_port_put(mlxsw_sp_port->mlxsw_sp->bridge, bridge_port);
@@ -1580,6 +1582,23 @@ static int mlxsw_sp_port_mdb_del(struct mlxsw_sp_port *mlxsw_sp_port,
 	return __mlxsw_sp_port_mdb_del(mlxsw_sp_port, bridge_port, mid);
 }
 
+static void
+mlxsw_sp_bridge_port_mdb_flush(struct mlxsw_sp_port *mlxsw_sp_port,
+			       struct mlxsw_sp_bridge_port *bridge_port)
+{
+	struct mlxsw_sp_bridge_device *bridge_device;
+	struct mlxsw_sp_mid *mid, *tmp;
+
+	bridge_device = bridge_port->bridge_device;
+
+	list_for_each_entry_safe(mid, tmp, &bridge_device->mids_list, list) {
+		if (test_bit(mlxsw_sp_port->local_port, mid->ports_in_mid)) {
+			__mlxsw_sp_port_mdb_del(mlxsw_sp_port, bridge_port,
+						mid);
+		}
+	}
+}
+
 static int mlxsw_sp_port_obj_del(struct net_device *dev,
 				 const struct switchdev_obj *obj)
 {
-- 
2.9.5

^ permalink raw reply related

* [patch net-next 13/16] mlxsw: spectrum_switchdev: Flood all mc packets to mrouter ports
From: Jiri Pirko @ 2017-09-20 14:15 UTC (permalink / raw)
  To: netdev; +Cc: davem, nogahf, idosch, mlxsw
In-Reply-To: <20170920141516.1402-1-jiri@resnulli.us>

From: Nogah Frankel <nogahf@mellanox.com>

When mc is enabled, whenever an mc packet doesn't hit any mdb entry it is
flooded to the ports marked as mrouters. However, all mc packets should
be flooded to them even if they match an entry in the mdb.
This patch adds the mrouter ports to every mdb entry that is being written
to the HW.

Signed-off-by: Nogah Frankel <nogahf@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 .../ethernet/mellanox/mlxsw/spectrum_switchdev.c   | 65 ++++++++++++++++++++--
 1 file changed, 60 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
index bc07873..146beaa 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
@@ -1288,10 +1288,55 @@ mlxsw_sp_mid *__mlxsw_sp_mc_get(struct mlxsw_sp_bridge_device *bridge_device,
 	return NULL;
 }
 
+static void
+mlxsw_sp_bridge_port_get_ports_bitmap(struct mlxsw_sp *mlxsw_sp,
+				      struct mlxsw_sp_bridge_port *bridge_port,
+				      unsigned long *ports_bitmap)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port;
+	u64 max_lag_members, i;
+	int lag_id;
+
+	if (!bridge_port->lagged) {
+		set_bit(bridge_port->system_port, ports_bitmap);
+	} else {
+		max_lag_members = MLXSW_CORE_RES_GET(mlxsw_sp->core,
+						     MAX_LAG_MEMBERS);
+		lag_id = bridge_port->lag_id;
+		for (i = 0; i < max_lag_members; i++) {
+			mlxsw_sp_port = mlxsw_sp_port_lagged_get(mlxsw_sp,
+								 lag_id, i);
+			if (mlxsw_sp_port)
+				set_bit(mlxsw_sp_port->local_port,
+					ports_bitmap);
+		}
+	}
+}
+
+static void
+mlxsw_sp_mc_get_mrouters_bitmap(unsigned long *flood_bitmap,
+				struct mlxsw_sp_bridge_device *bridge_device,
+				struct mlxsw_sp *mlxsw_sp)
+{
+	struct mlxsw_sp_bridge_port *bridge_port;
+
+	list_for_each_entry(bridge_port, &bridge_device->ports_list, list) {
+		if (bridge_port->mrouter) {
+			mlxsw_sp_bridge_port_get_ports_bitmap(mlxsw_sp,
+							      bridge_port,
+							      flood_bitmap);
+		}
+	}
+}
+
 static bool
 mlxsw_sp_mc_write_mdb_entry(struct mlxsw_sp *mlxsw_sp,
-			    struct mlxsw_sp_mid *mid)
+			    struct mlxsw_sp_mid *mid,
+			    struct mlxsw_sp_bridge_device *bridge_device)
 {
+	long *flood_bitmap;
+	int num_of_ports;
+	int alloc_size;
 	u16 mid_idx;
 	int err;
 
@@ -1300,9 +1345,18 @@ mlxsw_sp_mc_write_mdb_entry(struct mlxsw_sp *mlxsw_sp,
 	if (mid_idx == MLXSW_SP_MID_MAX)
 		return false;
 
+	num_of_ports = mlxsw_core_max_ports(mlxsw_sp->core);
+	alloc_size = sizeof(long) * BITS_TO_LONGS(num_of_ports);
+	flood_bitmap = kzalloc(alloc_size, GFP_KERNEL);
+	if (!flood_bitmap)
+		return false;
+
+	bitmap_copy(flood_bitmap,  mid->ports_in_mid, num_of_ports);
+	mlxsw_sp_mc_get_mrouters_bitmap(flood_bitmap, bridge_device, mlxsw_sp);
+
 	mid->mid = mid_idx;
-	err = mlxsw_sp_port_smid_full_entry(mlxsw_sp, mid_idx,
-					    mid->ports_in_mid);
+	err = mlxsw_sp_port_smid_full_entry(mlxsw_sp, mid_idx, flood_bitmap);
+	kfree(flood_bitmap);
 	if (err)
 		return false;
 
@@ -1355,7 +1409,7 @@ mlxsw_sp_mid *__mlxsw_sp_mc_alloc(struct mlxsw_sp *mlxsw_sp,
 	if (!bridge_device->multicast_enabled)
 		goto out;
 
-	if (!mlxsw_sp_mc_write_mdb_entry(mlxsw_sp, mid))
+	if (!mlxsw_sp_mc_write_mdb_entry(mlxsw_sp, mid, bridge_device))
 		goto err_write_mdb_entry;
 
 out:
@@ -1456,7 +1510,8 @@ mlxsw_sp_bridge_mdb_mc_enable_sync(struct mlxsw_sp_port *mlxsw_sp_port,
 
 	list_for_each_entry(mid, &bridge_device->mids_list, list) {
 		if (mc_enabled)
-			mlxsw_sp_mc_write_mdb_entry(mlxsw_sp, mid);
+			mlxsw_sp_mc_write_mdb_entry(mlxsw_sp, mid,
+						    bridge_device);
 		else
 			mlxsw_sp_mc_remove_mdb_entry(mlxsw_sp, mid);
 	}
-- 
2.9.5

^ permalink raw reply related

* [patch net-next 14/16] mlxsw: spectrum_switchdev: Update the mdb of mrouter port change
From: Jiri Pirko @ 2017-09-20 14:15 UTC (permalink / raw)
  To: netdev; +Cc: davem, nogahf, idosch, mlxsw
In-Reply-To: <20170920141516.1402-1-jiri@resnulli.us>

From: Nogah Frankel <nogahf@mellanox.com>

Whenever a port starts / stops being mrouter, update all the mdb entries
in the HW to flood / stop flooding mc packets there.
The change should happen only if the port is not in the mid. (If it is,
the mid should flood mc packets to this port anyway)

Signed-off-by: Nogah Frankel <nogahf@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 .../ethernet/mellanox/mlxsw/spectrum_switchdev.c   | 23 ++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
index 146beaa..bf1a175 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
@@ -130,6 +130,11 @@ mlxsw_sp_bridge_mdb_mc_enable_sync(struct mlxsw_sp_port *mlxsw_sp_port,
 				   struct mlxsw_sp_bridge_device
 				   *bridge_device);
 
+static void
+mlxsw_sp_port_mrouter_update_mdb(struct mlxsw_sp_port *mlxsw_sp_port,
+				 struct mlxsw_sp_bridge_port *bridge_port,
+				 bool add);
+
 static struct mlxsw_sp_bridge_device *
 mlxsw_sp_bridge_device_find(const struct mlxsw_sp_bridge *bridge,
 			    const struct net_device *br_dev)
@@ -747,6 +752,8 @@ static int mlxsw_sp_port_attr_mrouter_set(struct mlxsw_sp_port *mlxsw_sp_port,
 	if (err)
 		return err;
 
+	mlxsw_sp_port_mrouter_update_mdb(mlxsw_sp_port, bridge_port,
+					 is_port_mrouter);
 out:
 	bridge_port->mrouter = is_port_mrouter;
 	return 0;
@@ -1517,6 +1524,22 @@ mlxsw_sp_bridge_mdb_mc_enable_sync(struct mlxsw_sp_port *mlxsw_sp_port,
 	}
 }
 
+static void
+mlxsw_sp_port_mrouter_update_mdb(struct mlxsw_sp_port *mlxsw_sp_port,
+				 struct mlxsw_sp_bridge_port *bridge_port,
+				 bool add)
+{
+	struct mlxsw_sp_bridge_device *bridge_device;
+	struct mlxsw_sp_mid *mid;
+
+	bridge_device = bridge_port->bridge_device;
+
+	list_for_each_entry(mid, &bridge_device->mids_list, list) {
+		if (!test_bit(mlxsw_sp_port->local_port, mid->ports_in_mid))
+			mlxsw_sp_port_smid_set(mlxsw_sp_port, mid->mid, add);
+	}
+}
+
 static int mlxsw_sp_port_obj_add(struct net_device *dev,
 				 const struct switchdev_obj *obj,
 				 struct switchdev_trans *trans)
-- 
2.9.5

^ permalink raw reply related

* [patch net-next 15/16] mlxsw: spectrum_switchdev: Remove mrouter flood in mdb flush
From: Jiri Pirko @ 2017-09-20 14:15 UTC (permalink / raw)
  To: netdev; +Cc: davem, nogahf, idosch, mlxsw
In-Reply-To: <20170920141516.1402-1-jiri@resnulli.us>

From: Nogah Frankel <nogahf@mellanox.com>

In mdb flush the port is removed from all the mids it is registered
to. But if the port is an mrouter, all the mids flood to it.
This patch also removes an mrouter port from the mids it is not
registered to in the mdb flush.

Signed-off-by: Nogah Frankel <nogahf@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
index bf1a175..459cedc 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
@@ -1673,6 +1673,9 @@ mlxsw_sp_bridge_port_mdb_flush(struct mlxsw_sp_port *mlxsw_sp_port,
 		if (test_bit(mlxsw_sp_port->local_port, mid->ports_in_mid)) {
 			__mlxsw_sp_port_mdb_del(mlxsw_sp_port, bridge_port,
 						mid);
+		} else if (bridge_device->multicast_enabled &&
+			   bridge_port->mrouter) {
+			mlxsw_sp_port_smid_set(mlxsw_sp_port, mid->mid, false);
 		}
 	}
 }
-- 
2.9.5

^ permalink raw reply related

* [patch net-next 16/16] mlxsw: spectrum_switchdev: Consider mrouter status for mdb changes
From: Jiri Pirko @ 2017-09-20 14:15 UTC (permalink / raw)
  To: netdev; +Cc: davem, nogahf, idosch, mlxsw
In-Reply-To: <20170920141516.1402-1-jiri@resnulli.us>

From: Nogah Frankel <nogahf@mellanox.com>

When an mrouter port is registered to or leaves a mid, don't update the HW.

Signed-off-by: Nogah Frankel <nogahf@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
index 459cedc..0f9eac5 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
@@ -1491,6 +1491,9 @@ static int mlxsw_sp_port_mdb_add(struct mlxsw_sp_port *mlxsw_sp_port,
 	if (!bridge_device->multicast_enabled)
 		return 0;
 
+	if (bridge_port->mrouter)
+		return 0;
+
 	err = mlxsw_sp_port_smid_set(mlxsw_sp_port, mid->mid, true);
 	if (err) {
 		netdev_err(dev, "Unable to set SMID\n");
@@ -1613,10 +1616,12 @@ __mlxsw_sp_port_mdb_del(struct mlxsw_sp_port *mlxsw_sp_port,
 	int err;
 
 	if (bridge_port->bridge_device->multicast_enabled) {
-		err = mlxsw_sp_port_smid_set(mlxsw_sp_port, mid->mid, false);
-
-		if (err)
-			netdev_err(dev, "Unable to remove port from SMID\n");
+		if (bridge_port->bridge_device->multicast_enabled) {
+			err = mlxsw_sp_port_smid_set(mlxsw_sp_port, mid->mid,
+						     false);
+			if (err)
+				netdev_err(dev, "Unable to remove port from SMID\n");
+		}
 	}
 
 	err = mlxsw_sp_port_remove_from_mid(mlxsw_sp_port, mid);
-- 
2.9.5

^ permalink raw reply related

* Re: [RFC PATCH] net: Introduce a socket option to enable picking tx queue based on rx queue.
From: Tom Herbert @ 2017-09-20 14:18 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Samudrala, Sridhar, Alexander Duyck,
	Linux Kernel Network Developers
In-Reply-To: <1505884427.29839.84.camel@edumazet-glaptop3.roam.corp.google.com>

On Tue, Sep 19, 2017 at 10:13 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Tue, 2017-09-19 at 21:59 -0700, Samudrala, Sridhar wrote:
>> On 9/19/2017 5:48 PM, Tom Herbert wrote:
>> > On Tue, Sep 19, 2017 at 5:34 PM, Samudrala, Sridhar
>> > <sridhar.samudrala@intel.com> wrote:
>> > > On 9/12/2017 3:53 PM, Tom Herbert wrote:
>> > > > On Tue, Sep 12, 2017 at 3:31 PM, Samudrala, Sridhar
>> > > > <sridhar.samudrala@intel.com> wrote:
>> > > > >
>> > > > > On 9/12/2017 8:47 AM, Eric Dumazet wrote:
>> > > > > > On Mon, 2017-09-11 at 23:27 -0700, Samudrala, Sridhar wrote:
>> > > > > > > On 9/11/2017 8:53 PM, Eric Dumazet wrote:
>> > > > > > > > On Mon, 2017-09-11 at 20:12 -0700, Tom Herbert wrote:
>> > > > > > > >
>> > > > > > > > > Two ints in sock_common for this purpose is quite expensive and the
>> > > > > > > > > use case for this is limited-- even if a RX->TX queue mapping were
>> > > > > > > > > introduced to eliminate the queue pair assumption this still won't
>> > > > > > > > > help if the receive and transmit interfaces are different for the
>> > > > > > > > > connection. I think we really need to see some very compelling
>> > > > > > > > > results
>> > > > > > > > > to be able to justify this.
>> > > > > > > Will try to collect and post some perf data with symmetric queue
>> > > > > > > configuration.
>> > >
>> > > Here is some performance data i collected with memcached workload over
>> > > ixgbe 10Gb NIC with mcblaster benchmark.
>> > > ixgbe is configured with 16 queues and rx-usecs is set to 1000 for a very
>> > > low
>> > > interrupt rate.
>> > >       ethtool -L p1p1 combined 16
>> > >       ethtool -C p1p1 rx-usecs 1000
>> > > and busy poll is set to 1000usecs
>> > >       sysctl net.core.busy_poll = 1000
>> > >
>> > > 16 threads  800K requests/sec
>> > > =============================
>> > >                   rtt(min/avg/max)usecs     intr/sec contextswitch/sec
>> > > -----------------------------------------------------------------------
>> > > Default                2/182/10641            23391 61163
>> > > Symmetric Queues       2/50/6311              20457 32843
>> > >
>> > > 32 threads  800K requests/sec
>> > > =============================
>> > >                  rtt(min/avg/max)usecs     intr/sec contextswitch/sec
>> > > ------------------------------------------------------------------------
>> > > Default                2/162/6390            32168 69450
>> > > Symmetric Queues        2/50/3853            35044 35847
>> > >
>> > No idea what "Default" configuration is. Please report how xps_cpus is
>> > being set, how many RSS queues there are, and what the mapping is
>> > between RSS queues and CPUs and shared caches. Also, whether and
>> > threads are pinned.
>> Default is linux 4.13 with the settings i listed above.
>>         ethtool -L p1p1 combined 16
>>         ethtool -C p1p1 rx-usecs 1000
>>         sysctl net.core.busy_poll = 1000
>>
>> # ethtool -x p1p1
>> RX flow hash indirection table for p1p1 with 16 RX ring(s):
>>     0:      0     1     2     3     4     5     6     7
>>     8:      8     9    10    11    12    13    14    15
>>    16:      0     1     2     3     4     5     6     7
>>    24:      8     9    10    11    12    13    14    15
>>    32:      0     1     2     3     4     5     6     7
>>    40:      8     9    10    11    12    13    14    15
>>    48:      0     1     2     3     4     5     6     7
>>    56:      8     9    10    11    12    13    14    15
>>    64:      0     1     2     3     4     5     6     7
>>    72:      8     9    10    11    12    13    14    15
>>    80:      0     1     2     3     4     5     6     7
>>    88:      8     9    10    11    12    13    14    15
>>    96:      0     1     2     3     4     5     6     7
>>   104:      8     9    10    11    12    13    14    15
>>   112:      0     1     2     3     4     5     6     7
>>   120:      8     9    10    11    12    13    14    15
>>
>> smp_affinity for the 16 queuepairs
>>         141 p1p1-TxRx-0 0000,00000001
>>         142 p1p1-TxRx-1 0000,00000002
>>         143 p1p1-TxRx-2 0000,00000004
>>         144 p1p1-TxRx-3 0000,00000008
>>         145 p1p1-TxRx-4 0000,00000010
>>         146 p1p1-TxRx-5 0000,00000020
>>         147 p1p1-TxRx-6 0000,00000040
>>         148 p1p1-TxRx-7 0000,00000080
>>         149 p1p1-TxRx-8 0000,00000100
>>         150 p1p1-TxRx-9 0000,00000200
>>         151 p1p1-TxRx-10 0000,00000400
>>         152 p1p1-TxRx-11 0000,00000800
>>         153 p1p1-TxRx-12 0000,00001000
>>         154 p1p1-TxRx-13 0000,00002000
>>         155 p1p1-TxRx-14 0000,00004000
>>         156 p1p1-TxRx-15 0000,00008000
>> xps_cpus for the 16 Tx queues
>>         0000,00000001
>>         0000,00000002
>>         0000,00000004
>>         0000,00000008
>>         0000,00000010
>>         0000,00000020
>>         0000,00000040
>>         0000,00000080
>>         0000,00000100
>>         0000,00000200
>>         0000,00000400
>>         0000,00000800
>>         0000,00001000
>>         0000,00002000
>>         0000,00004000
>>         0000,00008000
>> memcached threads are not pinned.
>>
>
> ...
>
> I urge you to take the time to properly tune this host.
>
> linux kernel does not do automagic configuration. This is user policy.
>
> Documentation/networking/scaling.txt has everything you need.
>
Yes, tuning a system for optimal performance is difficult. Even if you
find a performance benefit for a configuration on one system, that
might not translate to another. In other words, if you've produced
some code that seems to perform better than previous implementation on
a test machine it's not enough to be satisfied with that. We want to
understand _why_ there is a difference. If you can show there are
intrinsic benefits to the queue-pair model that we can't achieve with
existing implementation _and_ can show there are ill effects in other
circumstances, then you should have a good case to make changes.

In the case of memcached, threads inevitably migrate off the CPU they
were created on, the data follows the thread but the RX-queue does not
change, which means that the receive path crosses CPUs or caches.
But, then in the queuepair case that also means transmit completions
are crossing CPUs. We don't normally expect that to be a good thing.
However, transmit completion processing does not happen in the
critical path, so if that work is being deferred to a less busy CPU
there may be benefits. That's only a theory; analysis and experimentation
should be able to get to the root cause.

Thanks,
Tom

^ permalink raw reply

* Re: [PATCH net-next] net: dsa: Utilize dsa_slave_dev_check()
From: Vivien Didelot @ 2017-09-20 14:19 UTC (permalink / raw)
  To: Florian Fainelli, netdev
  Cc: Florian Fainelli, Andrew Lunn, David S. Miller, open list
In-Reply-To: <20170920010038.12393-1-f.fainelli@gmail.com>

Hi Florian,

Florian Fainelli <f.fainelli@gmail.com> writes:

> Instead of open coding the check.
>
> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>

If we do need to use it outside one day, we may think about renaming
netdev_uses_dsa() to netdev_is_dsa_master() and renaming
dsa_slave_dev_check() to netdev_is_dsa_slave().

In the meantime, looks good!

Reviewed-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>

^ permalink raw reply

* Re: [PATCH net-next 08/10] net/smc: introduce a delay
From: Ursula Braun @ 2017-09-20 14:37 UTC (permalink / raw)
  To: Leon Romanovsky
  Cc: davem, netdev, linux-rdma, linux-s390, jwi, schwidefsky,
	heiko.carstens, raspl
In-Reply-To: <20170920140315.GR5788@mtr-leonro.local>

[-- Attachment #1.1.1: Type: text/plain, Size: 1004 bytes --]

On 09/20/2017 04:03 PM, Leon Romanovsky wrote:
> On Wed, Sep 20, 2017 at 01:58:11PM +0200, Ursula Braun wrote:
>> The number of outstanding work requests is limited. If all work
>> requests are in use, tx processing is postponed to another scheduling
>> of the tx worker. Switch to a delayed worker to have a gap for tx
>> completion queue events before the next retry.
>>
> 
> How will delay prevent and protect the resource exhausting?
> 
> Thanks
> 

SMC runs with a fixed number of in-flight work requests per QP (constant
SMC_WR_BUF_CNT) to prevent resource exhausting. If all work requests are
currently in use, sending of another work request has to wait till some
outstanding work request is confirmed via send completion queue. If sending
is done in a context which is not allowed to wait, the tx_worker is
scheduled instead.
With this patch a small delay is added to avoid too many unsuccessful send
retries due to a still ongoing "all work requests in use" condition.

[-- Attachment #1.1.2: 0xC5ED6645.asc --]
[-- Type: application/pgp-keys, Size: 9949 bytes --]

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply

* Re: Latest net-next from GIT panic
From: Eric Dumazet @ 2017-09-20 14:40 UTC (permalink / raw)
  To: Paweł Staszewski; +Cc: Wei Wang, Linux Kernel Network Developers, edumazet
In-Reply-To: <cf94cafb-07b0-f04d-016d-cbdb46b557c1@itcare.pl>

On Wed, 2017-09-20 at 16:03 +0200, Paweł Staszewski wrote:
> Not much more after adding this patch
> 
> https://bugzilla.kernel.org/attachment.cgi?id=258529
> 

This is why I suggested to replace the BUG() in another mail

So :

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f535779d9dc1dfe36934c2abba4e43d053ac5d6f..220cd12456754876edf2d3ef13195e82d70d5c74 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3331,7 +3331,15 @@ void netdev_run_todo(void);
  */
 static inline void dev_put(struct net_device *dev)
 {
-	this_cpu_dec(*dev->pcpu_refcnt);
+	int __percpu *pref = READ_ONCE(dev->pcpu_refcnt);
+
+	if (!pref) {
+		pr_err("no pcpu_refcnt on dev %p(%s) state %d dismantle %d\n",
+		       dev, dev->name, dev->reg_state, dev->dismantle);
+		for (;;)
+			cpu_relax();
+	}
+	this_cpu_dec(*pref);
 }
 
 /**

^ permalink raw reply related

* vhost_net: VM loses network when using vhost over time
From: Bernd Naumann @ 2017-09-20 14:44 UTC (permalink / raw)
  To: qemu-discuss; +Cc: Linux Kernel Network Developers

Hi @all,

We have encountered/experience a bug which is more or less reproducible, but we do not know how to do it exactly or how to debug the issue in the first place.

# Background

In our setup we have a Ganeti cluster (kvm) with atm ~60 nodes running ~500 VMs, we are using tap interfaces on L2 bridges, L3 routed tap interfaces, and tap interfaces on a bridge with a VTEP attached to it. (For the vxlan setup we have a home grown daemon to maintain the FDB).

# The issue

On some VMs we lose network-connectivity under certain/unknown circumstances. 
"Losing" means that the VM is not reachable and can therefore not reach any other host in the network.

However with `tcpdump` on the host (phy NIC + bridge) we can see the traffic going in; but with `tcpdump` on the VM we only see arp goes in, but nothing goes out. Manually setting the ARP entry does not help at all, or only for a moment, like `ip link set $DEV set arp off; ip link set $DEV arp on`. The only way we found to "fix" it, is rebooting the VM, or do `modprobe -r virtio_net; modprobe virtio_net`, but this seems also not the best workaround and can fail in a short time again. Also it is difficult to determine when the issue is kicking in. Counting 'FAILED' neighbors is an indicator but nothing to rely on.

The frequency of the issue ranges from once in a few days, to multiple times per day or even after some minutes after boot. Most impact we see on VMs with higher network traffic like our gateway-VMs (multiple NICs in different networks, IPsec, iptables, ...); ha-proxy-VMs (similar to our gateways), but also (with reduced frequency) on /normal/ application VMs.

For what we have found so far, it looks like kind of: 
* https://bugs.launchpad.net/ubuntu/+source/qemu-kvm/+bug/997978 -- Bug #997978 “KVM images lose connectivity with bridged network” : Bugs : qemu-kvm package : Ubuntu
* https://bugs.centos.org/view.php?id=5526 -- 0005526: KVM Guest with virtio network loses network connectivity - CentOS Bug Tracker

Via `rtmon` we can observe that it starts with some "FAILED" neighbor entries and that they increase over time. As we know that this is only one consequence of not sending ARP replys to the requester; or that requested ARP is unanswered (cause the packet is not leaving the VM), the increasing count of 'FAILED' neighbors is /normal/. BUT: This can start on any interface, bridged tap interface for WAN, bridged tap in VXLAN, routed tap; it does not matter, or is not directly linked to the "kind" of interface.

# General overview of the setup

* ganeti-cluster with ~60 nodes
* each node has 2 x 50G (mlnx5 dual-port) connected to 2 x MLNX SN2700 switches
* each node runs `bird` with OSPF and ECMP (and OSPF with ECMP on SN2700 too)
* each VM has one or more vNICs in a bridged or routed network
* networks: bridged tap in WAN; bridged tap with attached VTEP; routed tap
* host OS: Ubuntu 16.04.3 with Ubuntu Kernel 4.12.13; first tested with qemu-kvm 1:2.5+dfsg-5ubuntu10.15, and later upgraded to qemu-kvm 2.10~rc3+dfsg-0ubuntu1, same issue; guest OS Ubuntu 14.04, Ubuntu 16.04 and Ubuntu 16.04 with latest Ubuntu mainline kernel PPA

# So far we can "verify" it is 'vhost'

Without "vhost=on" for the kvm process we can not observe this issue. While using "vhost=on", an affected VM can be "fixed" by `rmmod` and `insmod virtio_net`, but reboot seems to provide a "fix" for a "longer" period. (But as you may know, virtio has not the performance we expect.)

So we have some questions:

* How can we debug the main issue to provide a meaningful bug report? Debug flags on the kernel but where to hang gdb on it? Sadly we are no kernel hackers :/, but we can compile our own kernel and qemu-kvm to test also release candidates and/or put patches in place.
* Does someone have seen this too? Can provide a better workaround, or patch or anything?
* Where to file/reopen this issue? qemu, netdev?
* Is qemu-kvm even the right place to look for answers?

We are happy to provide more information or collect debug information if someone wants to investigate.

Thanks for your time!
Best,
Bernd Naumann

Spreadshirt 
Bernd Naumann 
Systems Engineer, Networking & Operations 
bernd.naumann@spreadshirt.net 

http://www.spreadshirt.com 

sprd.net AG 
Gießerstraße 27 
D-04229 Leipzig 

Fon: +49 341 594 00 - 5900 
Fax: +49 341 594 00 - 5149 

Vorstand / executive board: Philip Rooke (CEO/Vorsitzender) · Tobias Schaugg 
Aufsichtsratsvorsitzender / chairman of the supervisory board: Lukasz Gadowski 
Handelsregister / trade register: Amtsgericht Leipzig, HRB 22478 
Umsatzsteuer-IdentNummer / VAT-ID: DE 8138 7149 4

^ permalink raw reply

* Re: Latest net-next from GIT panic
From: Paweł Staszewski @ 2017-09-20 15:05 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Wei Wang, Linux Kernel Network Developers, edumazet
In-Reply-To: <1505918451.29839.97.camel@edumazet-glaptop3.roam.corp.google.com>

W dniu 2017-09-20 o 16:40, Eric Dumazet pisze:
> On Wed, 2017-09-20 at 16:03 +0200, Paweł Staszewski wrote:
>> Not much more after adding this patch
>>
>> https://bugzilla.kernel.org/attachment.cgi?id=258529
>>
> This is why I suggested to replace the BUG() in another mail
>
> So :
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index f535779d9dc1dfe36934c2abba4e43d053ac5d6f..220cd12456754876edf2d3ef13195e82d70d5c74 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -3331,7 +3331,15 @@ void netdev_run_todo(void);
>    */
>   static inline void dev_put(struct net_device *dev)
>   {
> -	this_cpu_dec(*dev->pcpu_refcnt);
> +	int __percpu *pref = READ_ONCE(dev->pcpu_refcnt);
> +
> +	if (!pref) {
> +		pr_err("no pcpu_refcnt on dev %p(%s) state %d dismantle %d\n",
> +		       dev, dev->name, dev->reg_state, dev->dismantle);
> +		for (;;)
> +			cpu_relax();
> +	}
> +	this_cpu_dec(*pref);
>   }
>   
>   /**
>
>
>

Full panic

https://bugzilla.kernel.org/attachment.cgi?id=258531


I will change patch and apply but later today cause now can't use backup 
router as testlab - Internet rush hours if something happens this will 
be bad when second router will have bugged kernel :)

^ permalink raw reply

* RFC iproute2 doc files
From: Stephen Hemminger @ 2017-09-20 15:11 UTC (permalink / raw)
  To: netdev

I noticed that the iproute man pages are up to date but the LaTeX documentation
is very out of date. Rarely updated since the Linux 2.2 days.

Either someone needs to do a massive editing job on them, or they should just
be dropped. My preference would be to just drop everything in the doc/ directory.
The current versions are so old, they can't be helping.

^ permalink raw reply

* Re: [PATCH net-next 5/5] tls: Add generic NIC offload infrastructure.
From: Hannes Frederic Sowa @ 2017-09-20 15:16 UTC (permalink / raw)
  To: Boris Pismenny
  Cc: Ilya Lesokhin, netdev@vger.kernel.org, davem@davemloft.net,
	davejwatson@fb.com, tom@herbertland.com, Aviad Yehezkel,
	Liran Liss
In-Reply-To: <DB6PR05MB3176E39DDA1771A6883860FBB0600@DB6PR05MB3176.eurprd05.prod.outlook.com>

Hello,

Boris Pismenny <borisp@mellanox.com> writes:

> Hello,
>
> Hannes Frederic Sowa <hannes@stressinduktion.org> writes:
>> Hello,
>> 
>> Ilya Lesokhin <ilyal@mellanox.com> writes:
>> 
>> > Hannes Frederic Sowa <hannes@stressinduktion.org> writes:
>> >
>> >> The user should be aware of that they can't migrate the socket to
>> >> another interface if they got hw offloaded. This is not the case for
>> software offload.
>> >> Thus I think the user has to opt in and it shouldn't be a heuristic
>> >> until we can switch back to sw offload path.
>> >>
>> >> Maybe change flowi_oif to sk_bound_dev_if and somehow lock it against
>> >> further changes if hw tls is in use?
>> >>
>> >
>> > I'm not sure I follow.
>> > We do set sk->sk_bound_dev_if to prevent further changes.
>> >
>> > Do you recommend we enable TLS offload only if SO_BINDTODEVICE
>> > was previously used on that socket?
>> > and prevent even users with CAP_NET_RAW from unbinding it?
>> >
>> > I would rather avoid requiring CAP_NET_RAW to use TLS offload.
>> > But admittedly I'm not sure setting sk->sk_bound_dev_if without
>> > CAP_NET_RAW like we do is legit either.
>> >
>> > Finally, the reason we made HW offload the default is that the user
>> > can use sudo ethtool -K enp0s4 tls-hw-tx-offload off to opt out of HW
>> > offload and we currently don't have anything equivalent for opting out of
>> SW KTLS.
>> 
>> IMHO the decision if a TCP flow should be bounded to hw and thus never
>> push traffic to another interface should a decision the administrator and the
>> application should opt in. You might have your management application
>> which is accessible over multiple interfaces and your production application
>> which might want to use hw offloaded tls. Thus I don't think only a single
>> ethtool knob will do it.
>
> IMO the configuration knob should be at the kTLS level and not at the
> HW vs. SW level. The management application shouldn't be using kTLS.
> I'd like to view TLS offload similarly to LSO. The default is opt-in if
> possible, and the Kernel decides that based on device capabilities.
>
>> 
>> I agree that SO_BINDTODEVICE is bad for this use case. First, the
>> CAP_NET_RAW limitation seems annoying and we don't want to enforce TLS
>> apps to have this capability. Second, the user space application doesn't care
>> which interface it should talk to (maybe?) but leave the routing decision to
>> the kernel and just opt in to TLS. SO_BINDTODEVICE doesn't allow this.
>> 
>> sk_bound_dev_if can be rebound later with CAP_NET_RAW privileges, will
>> this be a problem?
>
> Yes it is a problem and we have some ideas for a software fallback that should
> catch this. 

Ok.

> Is the software fallback a prerequisite for kTLS offload in Kernel?

I don't know. I would assume yes because it will change how uAPI will
look like?

>> 
>> Have you thought how the user space will configure the various offloading
>> features (sw, hw, none)? Will it in e.g. OpenSSL be part of the Cipher Spec or
>> will there be new functions around SSL_CTX to do so?
>> 
>> Maybe an enhancement of the TLS_TX setsockopt with a boolean for hw
>> offload is a solution?
>
> Yes, we think that OpenSSL should first configure whether it complies with
> kTLS support. Next, we thought of using an environment variable to control
> kTLS globally in OpenSSL as follows:

0. no kernel tls at all but use e.g. OpenSSL crypto code.

> 1. only software kTLS
> 2. only hardware kTLS - no fallback to software.
> 3. Try to use hardware kTLS and if it isn't supported fallback to
> software kTLS.

Hmm, environment variable and global control contradicts itself. ;)

In some form or another there is a need to have all options for
debugging. I also wonder if it makes sense to disable ktls based on
reordering and fast path vs. slow path hit ratio. But that is something
to think about later.

> The above is something we plan for the future, assuming that kTLS
> wouldn't fit for all use-cases. What do you think?
>
> If you'd like to have more fine-grained control of kTLS, e.g. per socket,
> then the application would need to be modified to configure that,
> which is something we try to avoid.

That is why I proposed signaling over ciphers(1) for openssl. If you
e.g. look at apache/mod_ssl, they loop the cipher list from the
configuration file directly to OpenSSL. Same for a lot of other web
servers, nginx etc. Thus you just need to modify openssl and don't need
to touch the users of the library.

E.g. in Fedora/RHEL the crypto libs load a default cipher list from
/etc/crypto-policies/, which you can update centrally with
update-crypto-policies. Maybe the kTLS switches fit nicely in there?

For that to do, OpenSSL needs still to have more fine grain control over
which kTLS sw/hw to use, right?

>> 
>> Another question:
>> 
>> How is the dependency management done between socket layer and driver
>> layer? It seems a bit cyclic but judging from this code you don't hold
>> references to the device (dev_hold) (which is good, you don't want to have
>> users creating refs to devices). OTOH you somehow need to match sockets
>> from the device layer up to the socket. Will those be reference counted or
>> does that work without?
>
> Not sure I follow your question.
> We use the socket from the device layer through the SKB that carries it,
> so I think it should work without.
> We don't attempt to perform a socket lookup or anything of this sort.

The socket from skb is only valid as long as you have the skb. Basically
the question is: do you ever increase the ref counter of sockets from
the device drivers?

Thanks,
Hannes

^ permalink raw reply

* IP Expo show Europe 2017 Attendees List
From: Aspen Ella @ 2017-09-20 14:44 UTC (permalink / raw)
  To: netdev@vger.kernel.org

Hi,
Would you be interested in the "IP Expo show Europe 2017 Attendees List ?"

Please Let me know your interest to send you the number of attendees and cost.
Just let me know if you have any questions.
Awaiting your reply
 
Regards,
Aspen
Marketing Executive
 
 To remove from this mailing: reply with subject line as "leave out."


^ permalink raw reply

* Re: [PATCH net-next 00/10] net/smc: updates 2017-09-20
From: Bart Van Assche @ 2017-09-20 15:22 UTC (permalink / raw)
  To: davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org,
	ubraun-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org
  Cc: raspl-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org,
	netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	heiko.carstens-tA70FqPdS9bQT0dZR+AlfA@public.gmane.org,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	jwi-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org,
	schwidefsky-tA70FqPdS9bQT0dZR+AlfA@public.gmane.org,
	linux-s390-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <20170920115813.63745-1-ubraun-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 535 bytes --]

On Wed, 2017-09-20 at 13:58 +0200, Ursula Braun wrote:
> here is a collection of small smc-patches built for net-next improving
> the smc code in different areas.

Hello Ursula,

Can you provide us an update for the timeline of the plan to transition from
PF_SMC to PF_INET/PF_INET6 + SOCK_STREAM? See also
https://www.mail-archive.com/netdev@vger.kernel.org/msg166744.html.

Thanks,

Bart.

^ permalink raw reply

* Re: [PATCH] VSOCK: fix uapi/linux/vm_sockets.h incomplete types
From: Stefan Hajnoczi @ 2017-09-20 15:22 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, jhansen
In-Reply-To: <20170919.103840.379059301652331060.davem@davemloft.net>

On Tue, Sep 19, 2017 at 10:38:40AM -0700, David Miller wrote:
> From: Stefan Hajnoczi <stefanha@redhat.com>
> Date: Mon, 18 Sep 2017 16:21:00 +0100
> 
> > On Fri, Sep 15, 2017 at 02:14:32PM -0700, David Miller wrote:
> >> > diff --git a/include/uapi/linux/vm_sockets.h b/include/uapi/linux/vm_sockets.h
> >> > index b4ed5d895699..4ae5c625ac56 100644
> >> > --- a/include/uapi/linux/vm_sockets.h
> >> > +++ b/include/uapi/linux/vm_sockets.h
> >> > @@ -18,6 +18,10 @@
> >> >  
> >> >  #include <linux/socket.h>
> >> >  
> >> > +#ifndef __KERNEL__
> >> > +#include <sys/socket.h> /* struct sockaddr */
> >> > +#endif
> >> > +
> >> 
> >> There is no precedent whatsoever to include sys/socket.h in _any_ UAPI
> >> header file provided by the kernel.
> > 
> > <linux/if.h> does it for the same reason:
> > 
> > include/uapi/linux/if.h:#include <sys/socket.h>                 /* for struct sockaddr.         */
> 
> You don't need it for struct sockaddr, you need it for sa_family_t,
> the comment is very misleading.
> 
> Please do as I have instructed and it will fix this problem.

No, you really cannot rely on struct sockaddr from <linux/socket.h> in
uapi headers.  You can check this yourself:

  $ cd /tmp && gcc -o a.o -c /usr/include/linux/vm_sockets.h
  /usr/include/linux/vm_sockets.h:148:32: error: invalid application of ‘sizeof’ to incomplete type ‘struct sockaddr’
  unsigned char svm_zero[sizeof(struct sockaddr) -
                                ^~~~~~

The weird situation is:

1. When compiling the kernel, <linux/socket.h> brings in struct sockaddr
   because the compiler finds include/linux/socket.h first before
   include/uapi/linux/socket.h.

2. When compiling a userspace application, <linux/socket.h> does not
   bring in struct sockaddr because include/uapi/linux/socket.h is
   found.

This is why I added the #include <sys/socket.h> when !__KERNEL__.  Sorry
that the commit description wasn't clear on this.

Am I misunderstanding something?

Stefan

^ permalink raw reply

* RE: [PATCH v4 net 2/3] lan78xx: Allow EEPROM write for less than MAX_EEPROM_SIZE
From: Nisar.Sayed @ 2017-09-20 15:27 UTC (permalink / raw)
  To: sergei.shtylyov, davem; +Cc: UNGLinuxDriver, netdev
In-Reply-To: <17bc270e-9cc1-485a-d6bb-0ebfa143e65e@cogentembedded.com>

Thanks Sergei, I will update it and submit next version.

- Nisar

 > Hello!
> 
> On 09/19/2017 01:02 AM, Nisar Sayed wrote:
> 
> > Allow EEPROM write for less than MAX_EEPROM_SIZE
> >
> > Fixes: 55d7de9de6c3 ("Microchip's LAN7800 family USB 2/3 to
> > 10/100/1000 Ethernet device driver")
> > Signed-off-by: Nisar Sayed <Nisar.Sayed@microchip.com>
> > ---
> >   drivers/net/usb/lan78xx.c | 9 ++++-----
> >   1 file changed, 4 insertions(+), 5 deletions(-)
> >
> > diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
> > index fcf85ae37435..3292f56ffe02 100644
> > --- a/drivers/net/usb/lan78xx.c
> > +++ b/drivers/net/usb/lan78xx.c
> > @@ -1290,11 +1290,10 @@ static int lan78xx_ethtool_set_eeprom(struct
> net_device *netdev,
> >   	if (ret)
> >   		return ret;
> >
> > -	/* Allow entire eeprom update only */
> > -	if ((ee->magic == LAN78XX_EEPROM_MAGIC) &&
> > -	    (ee->offset == 0) &&
> > -	    (ee->len == 512) &&
> > -	    (data[0] == EEPROM_INDICATOR))
> > +	/* Invalid EEPROM_INDICATOR at offset zero will result in fail to
> 
>     s/fail/a failure/.
> 
> > +	 * load data from EEPROM
> > +	 */
> > +	if (ee->magic == LAN78XX_EEPROM_MAGIC)
> >   		ret = lan78xx_write_raw_eeprom(dev, ee->offset, ee->len,
> data);
> >   	else if ((ee->magic == LAN78XX_OTP_MAGIC) &&
> >   		 (ee->offset == 0) &&
> >
> 
> MBR, Sergei

^ permalink raw reply

* Re: [RFC PATCH] net: Introduce a socket option to enable picking tx queue based on rx queue.
From: Hannes Frederic Sowa @ 2017-09-20 15:30 UTC (permalink / raw)
  To: Sridhar Samudrala; +Cc: alexander.h.duyck, netdev
In-Reply-To: <1504222032-6337-1-git-send-email-sridhar.samudrala@intel.com>

Sridhar Samudrala <sridhar.samudrala@intel.com> writes:

> This patch introduces a new socket option SO_SYMMETRIC_QUEUES that can be used
> to enable symmetric tx and rx queues on a socket.
>
> This option is specifically useful for epoll based multi threaded workloads
> where each thread handles packets received on a single RX queue . In this model,
> we have noticed that it helps to send the packets on the same TX queue
> corresponding to the queue-pair associated with the RX queue specifically when
> busy poll is enabled with epoll().
>
> Two new fields are added to struct sock_common to cache the last rx ifindex and
> the rx queue in the receive path of an SKB. __netdev_pick_tx() returns the cached
> rx queue when this option is enabled and the TX is happening on the same device.

Would it help to make the rx and tx skb hashes symmetric
(skb_get_hash_symmetric) on request?

^ permalink raw reply

* Re: [PATCH net-next 00/14] gtp: Additional feature support
From: Andreas Schultz @ 2017-09-20 15:34 UTC (permalink / raw)
  To: Harald Welte, Tom Herbert
  Cc: Tom Herbert, David S. Miller, Linux Kernel Network Developers,
	Pablo Neira Ayuso, Rohit Seth
In-Reply-To: <20170919231947.bt4am3els3p26l6p@nataraja>

Hi Harald,

On 20/09/17 01:19, Harald Welte wrote:
> Hi Tom,
> 
> On Tue, Sep 19, 2017 at 08:59:28AM -0700, Tom Herbert wrote:
>> On Tue, Sep 19, 2017 at 5:43 AM, Harald Welte <laforge@gnumonks.org>
>> wrote:
>>> On Mon, Sep 18, 2017 at 05:38:50PM -0700, Tom Herbert wrote:
>>>>    - IPv6 support
>>>
>>> see my detailed comments in other mails.  It's unfortunately only
>>> support for the already "deprecated" IPv6-only PDP contexts, not the
>>> more modern v4v6 type.  In order to interoperate with old and new
>>> approach, all three cases (v4, v6 and v4v6) should be supported from
>>> one code base.
>>>
>> It sounds like something that can be subsequently added.
> 
> Not entirely, at least on the netlink (and any other configuration
> interface) you will have to reflect this from the very beginning.  You
> have to have an explicit PDP type and cannot rely on the address type to
> specify the type of PDP context.  Whatever interfaces are introduced
> now will have to remain compatible to any future change.
> 
> My strategy to avoid any such possible 'road blocks' from being
> introduced would be to simply add v4v6 and v6 support in one go.  The
> differences are marginal (having both an IPv6 prefix and a v4 address in
> parallel, rather than mutually exclusive only).
> 
>> Do you have a reference to the spec?
> 
> See http://osmocom.org/issues/2418#note-7 which lists Section 11.2.1.3.2
> of 3GPP TS 29.061 in combination with RFC3314, RFC7066, RFC6459 and
> 3GPP TS 23.060 9.2.1 as well as a summary of my understanding of it some
> months ago.
> 
>>>>    - Configurable networking interfaces so that GTP kernel can be
>>>>    used and tested without needing GSN network emulation (i.e. no
>>>>    user space daemon needed).
>>>
>>> We have some pretty decent userspace utilities for configuring the
>>> GTP interfaces and tunnels in the libgtpnl repository, but if it
>>> helps people to have another way of configuration, I won't be
>>> against it.
>>>
>> AFAIK those userspace utilities don't support IPv6.
> 
> Of course not [yet]. libgtpnl and the command line tools have been
> implemented specifically for the in-kernel GTP driver, and you have to
> make sure to add related support on both the kernel and the userspace
> side (libgtpnl). So there's little point in adding features on either
> side before the other side.  There would be no way to test...
> 
>> Being able to configure GTP like any other encapsulation will
>> facilitate development of IPv6 and other features.
> 
> That may very well be the case, but adding "IPv6 support" to kernel GTP
> in a way that is not in line with the existing userspace libraries and
> control-plane implementations means that you're developing those
> features in an artificial environment that doesn't resemble real 3GPP
> interoperable networks out there.
> 
> As indicated, I'm not against adding additional interfaces, but we have
> to make sure that we add IPv6 support (or any new feature support) to at
> least libgtpnl, and to make sure we test interoperability with existing
> 3GPP network equipment such as real IPv6 capable phones and SGSNs.
> 
>>> I'm not sure if this is a useful feature.  GTP is used only in
>>> operator-controlled networks and only on standard ports.  It's not
>>> possible to negotiate any non-standard ports on the signaling plane
>>> either.
>>>
>> Bear in mind that we're not required to do everything the GTP spec
>> says.
> 
> Yes, we are, at least as long as it affects interoperability with other
> implementations out there.
> 
> GTP uses well-known port numbers on *both* sides of the tunnel, and you
> cannot deviate from that.

Actually, the well-known port is only mandatory for the receiving side.
The sending side can choose any port it wishes as long as it is prepared
to receive possible error indication on the well-known port.

Of course, it makes the implementation simple to use only one port, but 
for scalability it might be a good idea to support per PDP context 
sending ports.

Regards
Andreas

> There's no point in having all kinds of features in the GTP user plane
> which are not interoperable with other implementations, and which are
> completely outside of the information model / architecture of GTP.
> 
> In the real world, GTP-U is only used in combination with GTP-C.  And in
> GTP-C you can only negotiate the IP address of both sides of GTP-U, and
> not the port number information.  As a result, the port numbers are
> static on both sides.
> 
>> My impression is GTP designers probably didn't think in terms of
>> getting best performance. But we can ;-)
> 
> I think it's wasted efforts if it's about "random udp ports" as no
> standards-compliant implementation out there with which you will have to
> interoperate will be able to support it.
> 
> GTP is used between home and roaming operator.  If you want to introduce
> changes to how it works, you will have to have control over both sides
> of the implementation of both the GTP-C and the GTP-u plane, which is
> very unlikely and rather the exception in the hundreds of operators you
> interoperate with.  Also keep in mind that there often are various
> "middleboxes" that will suddenly have to reflect your changes.  That
> starts from packet filters at various locations in the operator networks
> and/or roaming hubs, down to GTP hubs and others.
> 
> My opinion is: Non-standard GTP ports are not going to happen.
> 
>> I also brought up open_ggsn. ggsn to sgsn.
> 
> That's good to hear.  For both v4 and v6 PDP contexts?  Which phones
> did you use for testing?  Particularly given how convolved the address
> allocation is (see below), I'm surprised it would work.
> 
>>> For IPv6 (and v4v6) PDP contexts there is quite a bit of extra headache
>>> related to the way how router solicitation/advertisements are modified
>>> in the 3GPP world.
>>>
>>> The address allocation in v4 is simple:
>>> * MS/UE requests dynamic or fixed IPv4 address via EUA IE of PDP context
>>>    activation
>>> * GGSN responds with IPv4 address in EUA of Activate PDP context
>>>    response (and then uses netlink to tell the kernel about that
>>>    IPv4 address)
>>>
>>> In v6 or the v6 portion of v4v6 it works differently:
>>> * MS/UE requests dynamic or fixed IPv4 address in EUA IE of PDP context
>>>    activation
>>> * GGSN responds with an IPv6 address, but that address is *not* used
>>>    for communication, but simply used as an "interface identifier" to
>>>    build a link-local address.
>>> * MS then uses router solicitation using that link-local address
>>> * GGSN responds with router advertisement, allocating a single /64
>>>    prefix, from which the MS then generates a fully-qualified IPv6
>>>    source address for communication.
>>>
>>> How did you envision this to be done with the v6 support you just added?
>>> At the very least, the /64 prefix matching would have to be implemented
>>> so that in fact all addresses within that /64 prefix are matched +
>>> encapsulated for a given PDP context in the downlink (to phone)
>>> direction.
>>>
>>> [...]
>> I would hope all the above you're describing is mostly control plane
>> matters.
> 
> It is not.  The control plane is GTP-C and runs on different UDP ports
> (at least for GTPv1/v2).  The user plane is GTP-U and is what's done in
> the kernel.  And by its very nature, IPv6 router
> solicitations/advertisements (as well as neighbor
> solicitations/advertisements) are part of the user plane and thus
> handled in GTP-U.
> 
>> At least a good design decouples data plane and control
>> plane. I know that GTP is a bit convoluted in this regard.
> 
> The problem is that IPv6 has never been specified properly for
> point-to-point links.  There's no decent PPP specs for IPv6.  So the
> 3GPP folks had to try to be as close as possible to the existing
> (broadcast) link layer model to facilitate existing IPv6 implementations
> to work over 3GPP bearers.  That's why they kept whatever possible to
> re-use in terms of neighbor/router discovery.
> 
> So the problem is now: Unless you handle GTP-U *entirely* in the kernel
> (including router + neighbor advertisement/solicitation), you will have
> a "split GTP-U" plane between kernel and userspace.  And in that context
> the question is who owns the sequence numbers, how will you avoid race
> conditions, ... - my simple suggestion is thus to keep with the current
> split and do everything GTP-U related inside the kernel and everything
> GTP-C related in userspace.
> 
> I think there has to be a clear plan/architecture on how to implement
> those bits in terms of the kernel/userspace split, and at least a proof
> of concept implementation that we can show works with some real phones
> out there - otherwise there's no point in having IPv6 support that works
> well with some custom tools.
> 
> Regards,
> 	Harald
> 

^ permalink raw reply

* Re: [PATCH net-next 09/14] gtp: Allow configuring GTP interface as standalone
From: Andreas Schultz @ 2017-09-20 15:27 UTC (permalink / raw)
  To: Tom Herbert, davem; +Cc: netdev, pablo, laforge, rohit
In-Reply-To: <20170919003904.5124-10-tom@quantonium.net>

On 19/09/17 02:38, Tom Herbert wrote:
> Add new configuration of GTP interfaces that allow specifying a port to
> listen on (as opposed to having to get sockets from a userspace control
> plane). This allows GTP interfaces to be configured and the data path
> tested without requiring a GTP-C daemon.

This would imply that you can have multiple independent GTP sockets on 
the same IP address. That is not permitted by the GTP specifications. 
3GPP TS 29.281, section 4.3 states clearly that there is "only" one GTP 
entity per IP address. A PDP context is defined by the destination IP and 
the TEID. The destination port is not part of the identity of a PDP context.

Even the source IP and source port are not part of the tunnel identity. 
This makes it possible to send traffic from a new SGSN/SGW during 
handover before the control protocol has announced the handover.

At this point the usual response is: THAT IS NOT SAFE. Yes, GTP has been 
designed for cooperative networks only and should not be used on 
hostile/unsecured networks.

On the sending side, using multiple ports is permitted as long as the 
default GTP port is always able to receive incoming messages.

Andreas

[...]

^ permalink raw reply

* Re: [PATCH net-next 03/14] gtp: Call common functions to get tunnel routes and add dst_cache
From: Andreas Schultz @ 2017-09-20 15:37 UTC (permalink / raw)
  To: Harald Welte, David Miller; +Cc: tom, netdev, pablo, rohit
In-Reply-To: <20170919120942.dpy5kmkhzws7pqd5@nataraja>



On 19/09/17 14:09, Harald Welte wrote:
> Hi Dave,
> 
> On Mon, Sep 18, 2017 at 09:17:51PM -0700, David Miller wrote:
>> This and the new dst caching code ignores any source address selection
>> done by ip_route_output_key() or the new tunnel route lookup helpers.
>>
>> Either source address selection should be respected, or if saddr will
>> never be modified by a route lookup for some specific reason here,
>> that should be documented.
> 
> The IP source address is fixed by signaling on the GTP-C control plane
> and nothing that the kernel can unilaterally decide to change.  Such a
> change of address would have to be decided by and first be signaled on
> GTP-C to the peer by the userspace daemon, which would then update the
> PDP context in the kernel.

I think we had this discussion before. The sending IP and port are not 
part of the identity of the PDP context. So IMHO the sender is permitted
to change the source IP at random.

Regards
Andreas

> 
> So I guess you're asking us to document that rationale as form of a
> source code comment ?
> 

^ permalink raw reply

* Re: [PATCH,net-next,0/2] Improve code coverage of syzkaller
From: Willem de Bruijn @ 2017-09-20 15:38 UTC (permalink / raw)
  To: David Miller; +Cc: peterpenkov96, Network Development
In-Reply-To: <20170919.230853.540610280718334640.davem@davemloft.net>

On Wed, Sep 20, 2017 at 2:08 AM, David Miller <davem@davemloft.net> wrote:
> From: Petar Penkov <peterpenkov96@gmail.com>
> Date: Tue, 19 Sep 2017 21:26:14 -0700
>
>> Furthermore, in a way testing already requires specific kernel
>> configuration.  In this particular example, syzkaller prefers
>> synchronous operation and therefore needs 4KSTACKS disabled. Other
>> features that require rebuilding are KASAN and dbx. From this point
>> of view, I still think that having the TUN_NAPI flag has value.
>
> Then I think this path could be enabled/disabled with a runtime flag
> just as easily, no?

I think that the compile time option was chosen because of the ns_capable
check, so that with user namespaces unprivileged processes can control this
path. Perhaps we can require capable() only to set IFF_NAPI_FRAGS.

Then we can convert the napi_gro_receive path to be conditional on a new
IFF_NAPI flag instead of this compile time option.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox