Netdev List
 help / color / mirror / Atom feed
* [PATCH net 10/11] vxlan: add adjacent link to limit depth level
From: Taehee Yoo @ 2019-09-04 18:41 UTC (permalink / raw)
  To: davem, netdev, j.vosburgh, vfalico, andy, jiri, sd, roopa, saeedm,
	manishc, rahulv, kys, haiyangz, sthemmin, sashal, hare, varun,
	ubraun, kgraul
  Cc: ap420073

Current vxlan code doesn't limit the number of nested devices.
Nested devices would be handled recursively and this routine needs
huge stack memory. So, unlimited nested devices could make
stack overflow.

In order to fix this issue, this patch adds adjacent links.
The adjacent link APIs internally check the depth level.

Test commands:
    ip link add dummy0 type dummy
    ip link add vxlan0 type vxlan id 0 group 239.1.1.1 dev dummy0 \
	    dstport 4789
    for i in {1..100}
    do
            let A=$i-1
            ip link add vxlan$i type vxlan id $i group 239.1.1.1 \
		    dev vxlan$A dstport 4789
    done
    ip link del dummy0

The top upper link is vxlan100 and the lowest link is vxlan0.
When vxlan0 is deleting, the upper devices will be deleted recursively.
It needs huge stack memory so it makes stack overflow.

Splat looks like:
[  229.628477] =============================================================================
[  229.629785] BUG page->ptl (Not tainted): Padding overwritten. 0x0000000026abf214-0x0000000091f6abb2
[  229.629785] -----------------------------------------------------------------------------
[  229.629785]
[  229.655439] ==================================================================
[  229.629785] INFO: Slab 0x00000000ff7cfda8 objects=19 used=19 fp=0x00000000fe33776c flags=0x200000000010200
[  229.655688] BUG: KASAN: stack-out-of-bounds in unmap_single_vma+0x25a/0x2e0
[  229.655688] Read of size 8 at addr ffff888113076928 by task vlan-network-in/2334
[  229.655688]
[  229.629785] Padding 0000000026abf214: 00 80 14 0d 81 88 ff ff 68 91 81 14 81 88 ff ff  ........h.......
[  229.629785] Padding 0000000001e24790: 38 91 81 14 81 88 ff ff 68 91 81 14 81 88 ff ff  8.......h.......
[  229.629785] Padding 00000000b39397c8: 33 30 62 a7 ff ff ff ff ff eb 60 22 10 f1 ff 1f  30b.......`"....
[  229.629785] Padding 00000000bc98f53a: 80 60 07 13 81 88 ff ff 00 80 14 0d 81 88 ff ff  .`..............
[  229.629785] Padding 000000002aa8123d: 68 91 81 14 81 88 ff ff f7 21 17 a7 ff ff ff ff  h........!......
[  229.629785] Padding 000000001c8c2369: 08 81 14 0d 81 88 ff ff 03 02 00 00 00 00 00 00  ................
[  229.629785] Padding 000000004e290c5d: 21 90 a2 21 10 ed ff ff 00 00 00 00 00 fc ff df  !..!............
[  229.629785] Padding 000000000e25d731: 18 60 07 13 81 88 ff ff c0 8b 13 05 81 88 ff ff  .`..............
[  229.629785] Padding 000000007adc7ab3: b3 8a b5 41 00 00 00 00                          ...A....
[  229.629785] FIX page->ptl: Restoring 0x0000000026abf214-0x0000000091f6abb2=0x5a
[  ... ]


Fixes: acaf4e70997f ("net: vxlan: when lower dev unregisters remove vxlan dev as well")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
---
 drivers/net/vxlan.c | 71 ++++++++++++++++++++++++++++++++++++++-------
 include/net/vxlan.h |  1 +
 2 files changed, 62 insertions(+), 10 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 3d9bcc957f7d..0d5c8d22d8a4 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -3567,6 +3567,8 @@ static int __vxlan_dev_create(struct net *net, struct net_device *dev,
 	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct vxlan_fdb *f = NULL;
+	struct net_device *remote_dev = NULL;
+	struct vxlan_rdst *dst = &vxlan->default_dst;
 	bool unregister = false;
 	int err;
 
@@ -3577,14 +3579,14 @@ static int __vxlan_dev_create(struct net *net, struct net_device *dev,
 	dev->ethtool_ops = &vxlan_ethtool_ops;
 
 	/* create an fdb entry for a valid default destination */
-	if (!vxlan_addr_any(&vxlan->default_dst.remote_ip)) {
+	if (!vxlan_addr_any(&dst->remote_ip)) {
 		err = vxlan_fdb_create(vxlan, all_zeros_mac,
-				       &vxlan->default_dst.remote_ip,
+				       &dst->remote_ip,
 				       NUD_REACHABLE | NUD_PERMANENT,
 				       vxlan->cfg.dst_port,
-				       vxlan->default_dst.remote_vni,
-				       vxlan->default_dst.remote_vni,
-				       vxlan->default_dst.remote_ifindex,
+				       dst->remote_vni,
+				       dst->remote_vni,
+				       dst->remote_ifindex,
 				       NTF_SELF, &f);
 		if (err)
 			return err;
@@ -3595,26 +3597,43 @@ static int __vxlan_dev_create(struct net *net, struct net_device *dev,
 		goto errout;
 	unregister = true;
 
+	if (dst->remote_ifindex) {
+		remote_dev = __dev_get_by_index(net, dst->remote_ifindex);
+		if (!remote_dev)
+			goto errout;
+
+		err = netdev_upper_dev_link(remote_dev, dev, extack);
+		if (err)
+			goto errout;
+	}
+
 	err = rtnl_configure_link(dev, NULL);
 	if (err)
-		goto errout;
+		goto unlink;
 
 	if (f) {
-		vxlan_fdb_insert(vxlan, all_zeros_mac,
-				 vxlan->default_dst.remote_vni, f);
+		vxlan_fdb_insert(vxlan, all_zeros_mac, dst->remote_vni, f);
 
 		/* notify default fdb entry */
 		err = vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f),
 				       RTM_NEWNEIGH, true, extack);
 		if (err) {
 			vxlan_fdb_destroy(vxlan, f, false, false);
+			if (remote_dev)
+				netdev_upper_dev_unlink(remote_dev, dev);
 			goto unregister;
 		}
 	}
 
 	list_add(&vxlan->next, &vn->vxlan_list);
+	if (remote_dev) {
+		dst->remote_dev = remote_dev;
+		dev_hold(remote_dev);
+	}
 	return 0;
-
+unlink:
+	if (remote_dev)
+		netdev_upper_dev_unlink(remote_dev, dev);
 errout:
 	/* unregister_netdevice() destroys the default FDB entry with deletion
 	 * notification. But the addition notification was not sent yet, so
@@ -3936,6 +3955,8 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[],
 	struct net_device *lowerdev;
 	struct vxlan_config conf;
 	int err;
+	bool linked = false;
+	bool disabled = false;
 
 	err = vxlan_nl2conf(tb, data, dev, &conf, true, extack);
 	if (err)
@@ -3946,6 +3967,16 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[],
 	if (err)
 		return err;
 
+	if (lowerdev) {
+		if (dst->remote_dev && lowerdev != dst->remote_dev) {
+			netdev_adjacent_dev_disable(dst->remote_dev, dev);
+			disabled = true;
+		}
+		err = netdev_upper_dev_link(lowerdev, dev, extack);
+		if (err)
+			goto err;
+		linked = true;
+	}
 	/* handle default dst entry */
 	if (!vxlan_addr_equal(&conf.remote_ip, &dst->remote_ip)) {
 		u32 hash_index = fdb_head_index(vxlan, all_zeros_mac, conf.vni);
@@ -3962,7 +3993,7 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[],
 					       NTF_SELF, true, extack);
 			if (err) {
 				spin_unlock_bh(&vxlan->hash_lock[hash_index]);
-				return err;
+				goto err;
 			}
 		}
 		if (!vxlan_addr_any(&dst->remote_ip))
@@ -3979,8 +4010,24 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[],
 	if (conf.age_interval != vxlan->cfg.age_interval)
 		mod_timer(&vxlan->age_timer, jiffies);
 
+	if (disabled) {
+		netdev_adjacent_dev_enable(dst->remote_dev, dev);
+		netdev_upper_dev_unlink(dst->remote_dev, dev);
+		dev_put(dst->remote_dev);
+	}
+	if (linked) {
+		dst->remote_dev = lowerdev;
+		dev_hold(dst->remote_dev);
+	}
+
 	vxlan_config_apply(dev, &conf, lowerdev, vxlan->net, true);
 	return 0;
+err:
+	if (linked)
+		netdev_upper_dev_unlink(lowerdev, dev);
+	if (disabled)
+		netdev_adjacent_dev_enable(dst->remote_dev, dev);
+	return err;
 }
 
 static void vxlan_dellink(struct net_device *dev, struct list_head *head)
@@ -3991,6 +4038,10 @@ static void vxlan_dellink(struct net_device *dev, struct list_head *head)
 
 	list_del(&vxlan->next);
 	unregister_netdevice_queue(dev, head);
+	if (vxlan->default_dst.remote_dev) {
+		netdev_upper_dev_unlink(vxlan->default_dst.remote_dev, dev);
+		dev_put(vxlan->default_dst.remote_dev);
+	}
 }
 
 static size_t vxlan_get_size(const struct net_device *dev)
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index dc1583a1fb8a..08e237d7aa73 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -197,6 +197,7 @@ struct vxlan_rdst {
 	u8			 offloaded:1;
 	__be32			 remote_vni;
 	u32			 remote_ifindex;
+	struct net_device	 *remote_dev;
 	struct list_head	 list;
 	struct rcu_head		 rcu;
 	struct dst_cache	 dst_cache;
-- 
2.17.1


^ permalink raw reply related

* [PATCH net 09/11] net: core: add ignore flag to netdev_adjacent structure
From: Taehee Yoo @ 2019-09-04 18:40 UTC (permalink / raw)
  To: davem, netdev, j.vosburgh, vfalico, andy, jiri, sd, roopa, saeedm,
	manishc, rahulv, kys, haiyangz, sthemmin, sashal, hare, varun,
	ubraun, kgraul
  Cc: ap420073

In order to link an adjacent node, netdev_upper_dev_link() is used
and in order to unlink an adjacent node, netdev_upper_dev_unlink() is used.
unlink operation does not fail, but link operation can fail.

In order to exchange adjacent nodes, we should unlink an old adjacent
node first. then, link a new adjacent node.
If link operation is failed, we should link an old adjacent node again.
But this link operation can fail too.
It eventually breaks the adjacent link relationship.

This patch adds an ignore flag into the netdev_adjacent structure.
If this flag is set, netdev_upper_dev_link() ignores an old adjacent
node for a moment.
So we can skip unlink operation before link operation.

Signed-off-by: Taehee Yoo <ap420073@gmail.com>
---
 include/linux/netdevice.h |   4 +
 net/core/dev.c            | 160 +++++++++++++++++++++++++++++++++-----
 2 files changed, 144 insertions(+), 20 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a2c47f43e54b..d9ca4a79f715 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4322,6 +4322,10 @@ int netdev_master_upper_dev_link(struct net_device *dev,
 				 struct netlink_ext_ack *extack);
 void netdev_upper_dev_unlink(struct net_device *dev,
 			     struct net_device *upper_dev);
+void netdev_adjacent_dev_disable(struct net_device *upper_dev,
+				struct net_device *lower_dev);
+void netdev_adjacent_dev_enable(struct net_device *upper_dev,
+				struct net_device *lower_dev);
 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname);
 void *netdev_lower_dev_get_private(struct net_device *dev,
 				   struct net_device *lower_dev);
diff --git a/net/core/dev.c b/net/core/dev.c
index 6a4b4ce62204..ac055b531c96 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6448,6 +6448,9 @@ struct netdev_adjacent {
 	/* upper master flag, there can only be one master device per list */
 	bool master;
 
+	/* lookup ignore flag */
+	bool ignore;
+
 	/* counter for the number of times this device was added to us */
 	u16 ref_nr;
 
@@ -6553,6 +6556,22 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
 }
 EXPORT_SYMBOL(netdev_master_upper_dev_get);
 
+struct net_device *netdev_master_upper_dev_get_ignore(struct net_device *dev)
+{
+	struct netdev_adjacent *upper;
+
+	ASSERT_RTNL();
+
+	if (list_empty(&dev->adj_list.upper))
+		return NULL;
+
+	upper = list_first_entry(&dev->adj_list.upper,
+				 struct netdev_adjacent, list);
+	if (likely(upper->master) && !upper->ignore)
+		return upper->dev;
+	return NULL;
+}
+
 /**
  * netdev_has_any_lower_dev - Check if device is linked to some device
  * @dev: device
@@ -6603,8 +6622,9 @@ struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
 }
 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
 
-static struct net_device *netdev_next_upper_dev(struct net_device *dev,
-						struct list_head **iter)
+static struct net_device *netdev_next_upper_dev_ignore(struct net_device *dev,
+						       struct list_head **iter,
+						       bool *ignore)
 {
 	struct netdev_adjacent *upper;
 
@@ -6614,6 +6634,7 @@ static struct net_device *netdev_next_upper_dev(struct net_device *dev,
 		return NULL;
 
 	*iter = &upper->list;
+	*ignore = upper->ignore;
 
 	return upper->dev;
 }
@@ -6635,26 +6656,29 @@ static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
 	return upper->dev;
 }
 
-int netdev_walk_all_upper_dev(struct net_device *dev,
-			      int (*fn)(struct net_device *dev,
-					void *data),
-			      void *data)
+int netdev_walk_all_upper_dev_ignore(struct net_device *dev,
+				     int (*fn)(struct net_device *dev,
+					       void *data),
+				     void *data)
 {
 	struct net_device *udev;
 	struct list_head *iter;
 	int ret;
+	bool ignore;
 
 	for (iter = &dev->adj_list.upper,
-	     udev = netdev_next_upper_dev(dev, &iter);
+	     udev = netdev_next_upper_dev_ignore(dev, &iter, &ignore);
 	     udev;
-	     udev = netdev_next_upper_dev(dev, &iter)) {
+	     udev = netdev_next_upper_dev_ignore(dev, &iter, &ignore)) {
+		if (ignore)
+			continue;
 		/* first is the upper device itself */
 		ret = fn(udev, data);
 		if (ret)
 			return ret;
 
 		/* then look at all of its upper devices */
-		ret = netdev_walk_all_upper_dev(udev, fn, data);
+		ret = netdev_walk_all_upper_dev_ignore(udev, fn, data);
 		if (ret)
 			return ret;
 	}
@@ -6690,6 +6714,15 @@ int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
 }
 EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
 
+bool netdev_has_upper_dev_ignore(struct net_device *dev,
+				 struct net_device *upper_dev)
+{
+	ASSERT_RTNL();
+
+	return netdev_walk_all_upper_dev_ignore(dev, __netdev_has_upper_dev,
+						upper_dev);
+}
+
 /**
  * netdev_lower_get_next_private - Get the next ->private from the
  *				   lower neighbour list
@@ -6786,6 +6819,23 @@ static struct net_device *netdev_next_lower_dev(struct net_device *dev,
 	return lower->dev;
 }
 
+static struct net_device *netdev_next_lower_dev_ignore(struct net_device *dev,
+						       struct list_head **iter,
+						       bool *ignore)
+{
+	struct netdev_adjacent *lower;
+
+	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
+
+	if (&lower->list == &dev->adj_list.lower)
+		return NULL;
+
+	*iter = &lower->list;
+	*ignore = lower->ignore;
+
+	return lower->dev;
+}
+
 int netdev_walk_all_lower_dev(struct net_device *dev,
 			      int (*fn)(struct net_device *dev,
 					void *data),
@@ -6814,6 +6864,36 @@ int netdev_walk_all_lower_dev(struct net_device *dev,
 }
 EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
 
+int netdev_walk_all_lower_dev_ignore(struct net_device *dev,
+				     int (*fn)(struct net_device *dev,
+					       void *data),
+				     void *data)
+{
+	struct net_device *ldev;
+	struct list_head *iter;
+	int ret;
+	bool ignore;
+
+	for (iter = &dev->adj_list.lower,
+	     ldev = netdev_next_lower_dev_ignore(dev, &iter, &ignore);
+	     ldev;
+	     ldev = netdev_next_lower_dev_ignore(dev, &iter, &ignore)) {
+		if (ignore)
+			continue;
+		/* first is the lower device itself */
+		ret = fn(ldev, data);
+		if (ret)
+			return ret;
+
+		/* then look at all of its lower devices */
+		ret = netdev_walk_all_lower_dev_ignore(ldev, fn, data);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
 static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
 						    struct list_head **iter)
 {
@@ -6833,11 +6913,14 @@ static u8 __netdev_upper_depth(struct net_device *dev)
 	struct net_device *udev;
 	struct list_head *iter;
 	u8 max_depth = 0;
+	bool ignore;
 
 	for (iter = &dev->adj_list.upper,
-	     udev = netdev_next_upper_dev(dev, &iter);
+	     udev = netdev_next_upper_dev_ignore(dev, &iter, &ignore);
 	     udev;
-	     udev = netdev_next_upper_dev(dev, &iter)) {
+	     udev = netdev_next_upper_dev_ignore(dev, &iter, &ignore)) {
+		if (ignore)
+			continue;
 		if (max_depth < udev->upper_level)
 			max_depth = udev->upper_level;
 	}
@@ -6850,11 +6933,14 @@ static u8 __netdev_lower_depth(struct net_device *dev)
 	struct net_device *ldev;
 	struct list_head *iter;
 	u8 max_depth = 0;
+	bool ignore;
 
 	for (iter = &dev->adj_list.lower,
-	     ldev = netdev_next_lower_dev(dev, &iter);
+	     ldev = netdev_next_lower_dev_ignore(dev, &iter, &ignore);
 	     ldev;
-	     ldev = netdev_next_lower_dev(dev, &iter)) {
+	     ldev = netdev_next_lower_dev_ignore(dev, &iter, &ignore)) {
+		if (ignore)
+			continue;
 		if (max_depth < ldev->lower_level)
 			max_depth = ldev->lower_level;
 	}
@@ -6999,6 +7085,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev,
 	adj->master = master;
 	adj->ref_nr = 1;
 	adj->private = private;
+	adj->ignore = false;
 	dev_hold(adj_dev);
 
 	pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
@@ -7149,17 +7236,17 @@ static int __netdev_upper_dev_link(struct net_device *dev,
 		return -EBUSY;
 
 	/* To prevent loops, check if dev is not upper device to upper_dev. */
-	if (netdev_has_upper_dev(upper_dev, dev))
+	if (netdev_has_upper_dev_ignore(upper_dev, dev))
 		return -EBUSY;
 
 	if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
 		return -EMLINK;
 
 	if (!master) {
-		if (netdev_has_upper_dev(dev, upper_dev))
+		if (netdev_has_upper_dev_ignore(dev, upper_dev))
 			return -EEXIST;
 	} else {
-		master_dev = netdev_master_upper_dev_get(dev);
+		master_dev = netdev_master_upper_dev_get_ignore(dev);
 		if (master_dev)
 			return master_dev == upper_dev ? -EEXIST : -EBUSY;
 	}
@@ -7182,10 +7269,12 @@ static int __netdev_upper_dev_link(struct net_device *dev,
 		goto rollback;
 
 	__netdev_update_upper_level(dev, NULL);
-	netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
+	netdev_walk_all_lower_dev_ignore(dev, __netdev_update_upper_level,
+					 NULL);
 
 	__netdev_update_lower_level(upper_dev, NULL);
-	netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, NULL);
+	netdev_walk_all_upper_dev_ignore(upper_dev,
+					 __netdev_update_lower_level, NULL);
 
 	return 0;
 
@@ -7271,13 +7360,44 @@ void netdev_upper_dev_unlink(struct net_device *dev,
 				      &changeupper_info.info);
 
 	__netdev_update_upper_level(dev, NULL);
-	netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
+	netdev_walk_all_lower_dev_ignore(dev, __netdev_update_upper_level,
+					 NULL);
 
 	__netdev_update_lower_level(upper_dev, NULL);
-	netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, NULL);
+	netdev_walk_all_upper_dev_ignore(upper_dev,
+					 __netdev_update_lower_level, NULL);
 }
 EXPORT_SYMBOL(netdev_upper_dev_unlink);
 
+void __netdev_adjacent_dev_set(struct net_device *upper_dev,
+			       struct net_device *lower_dev,
+			       bool val)
+{
+	struct netdev_adjacent *adj;
+
+	adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
+	if (adj)
+		adj->ignore = val;
+
+	adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
+	if (adj)
+		adj->ignore = val;
+}
+
+void netdev_adjacent_dev_disable(struct net_device *upper_dev,
+				 struct net_device *lower_dev)
+{
+	__netdev_adjacent_dev_set(upper_dev, lower_dev, true);
+}
+EXPORT_SYMBOL(netdev_adjacent_dev_disable);
+
+void netdev_adjacent_dev_enable(struct net_device *upper_dev,
+				struct net_device *lower_dev)
+{
+	__netdev_adjacent_dev_set(upper_dev, lower_dev, false);
+}
+EXPORT_SYMBOL(netdev_adjacent_dev_enable);
+
 /**
  * netdev_bonding_info_change - Dispatch event about slave change
  * @dev: device
-- 
2.17.1


^ permalink raw reply related

* [PATCH net 08/11] macsec: fix refcnt leak in module exit routine
From: Taehee Yoo @ 2019-09-04 18:40 UTC (permalink / raw)
  To: davem, netdev, j.vosburgh, vfalico, andy, jiri, sd, roopa, saeedm,
	manishc, rahulv, kys, haiyangz, sthemmin, sashal, hare, varun,
	ubraun, kgraul
  Cc: ap420073

When a macsec interface is created, it increases a refcnt to a lower
device(real device). when macsec interface is deleted, the refcnt is
decreased in macsec_free_netdev(), which is ->priv_destructor() of
macsec interface.

The problem scenario is this.
When nested macsec interfaces are exiting, the exit routine of the
macsec module makes refcnt leaks.

Test commands:
    ip link add dummy0 type dummy
    ip link add macsec0 link dummy0 type macsec
    ip link add macsec1 link macsec0 type macsec
    modprobe -rv macsec

[  208.629433] unregister_netdevice: waiting for macsec0 to become free. Usage count = 1

Steps of exit routine of macsec module are below.
1. Calls ->dellink() in __rtnl_link_unregister().
2. Checks refcnt and wait refcnt to be 0 if refcnt is not 0 in
netdev_run_todo().
3. Calls ->priv_destruvtor() in netdev_run_todo().

Step2 checks refcnt, but step3 decreases refcnt.
So, step2 waits forever.

This patch makes the macsec module do not hold a refcnt of the lower
device because it already holds a refcnt of the lower device with
netdev_upper_dev_link().

Fixes: c09440f7dcb3 ("macsec: introduce IEEE 802.1AE driver")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
---
 drivers/net/macsec.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c
index 25a4fc88145d..41ec1ed0d545 100644
--- a/drivers/net/macsec.c
+++ b/drivers/net/macsec.c
@@ -3031,12 +3031,10 @@ static const struct nla_policy macsec_rtnl_policy[IFLA_MACSEC_MAX + 1] = {
 static void macsec_free_netdev(struct net_device *dev)
 {
 	struct macsec_dev *macsec = macsec_priv(dev);
-	struct net_device *real_dev = macsec->real_dev;
 
 	free_percpu(macsec->stats);
 	free_percpu(macsec->secy.tx_sc.stats);
 
-	dev_put(real_dev);
 }
 
 static void macsec_setup(struct net_device *dev)
@@ -3291,8 +3289,6 @@ static int macsec_newlink(struct net *net, struct net_device *dev,
 	if (err < 0)
 		return err;
 
-	dev_hold(real_dev);
-
 	macsec->nest_level = dev_get_nest_level(real_dev) + 1;
 
 	err = netdev_upper_dev_link(real_dev, dev, extack);
-- 
2.17.1


^ permalink raw reply related

* [PATCH net 07/11] macvlan: use dynamic lockdep key instead of subclass
From: Taehee Yoo @ 2019-09-04 18:40 UTC (permalink / raw)
  To: davem, netdev, j.vosburgh, vfalico, andy, jiri, sd, roopa, saeedm,
	manishc, rahulv, kys, haiyangz, sthemmin, sashal, hare, varun,
	ubraun, kgraul
  Cc: ap420073

All macvlan device has same lockdep key and subclass is initialized with
nest_level.
But actual nest_level value can be changed when a lower device is attached.
And at this moment, the subclass should be updated but it seems to be
unsafe.
So this patch makes macvlan use dynamic lockdep key instead of the
subclass.

Test commands:
    ip link add bond0 type bond
    ip link add dummy0 type dummy
    ip link add macvlan0 link bond0 type macvlan mode bridge
    ip link add macvlan1 link dummy0 type macvlan mode bridge
    ip link set bond0 mtu 1000
    ip link set macvlan1 master bond0

    ip link set bond0 up
    ip link set macvlan0 up
    ip link set dummy0 up
    ip link set macvlan1 up

Splat looks like:
[  165.677603] ============================================
[  165.679642] WARNING: possible recursive locking detected
[  165.679642] 5.3.0-rc7+ #322 Not tainted
[  165.679642] --------------------------------------------
[  165.679642] ip/1812 is trying to acquire lock:
[  165.679642] 00000000ae6a8a03 (&macvlan_netdev_addr_lock_key/1){+...}, at: dev_uc_sync_multiple+0xfa/0x1a0
[  165.679642]
[  165.679642] but task is already holding lock:
[  165.679642] 00000000cec5da0b (&macvlan_netdev_addr_lock_key/1){+...}, at: dev_set_rx_mode+0x19/0x30
[  165.679642]
[  165.679642] other info that might help us debug this:
[  165.679642]  Possible unsafe locking scenario:
[  165.679642]
[  165.679642]        CPU0
[  165.679642]        ----
[  165.679642]   lock(&macvlan_netdev_addr_lock_key/1);
[  165.679642]   lock(&macvlan_netdev_addr_lock_key/1);
[  165.679642]
[  165.679642]  *** DEADLOCK ***
[  165.679642]
[  165.679642]  May be due to missing lock nesting notation
[  165.679642]
[  165.679642] 4 locks held by ip/1812:
[  165.679642]  #0: 0000000088d10bd8 (rtnl_mutex){+.+.}, at: rtnetlink_rcv_msg+0x466/0x8a0
[  165.679642]  #1: 00000000cec5da0b (&macvlan_netdev_addr_lock_key/1){+...}, at: dev_set_rx_mode+0x19/0x30
[  165.679642]  #2: 000000000ca6fdb5 (&dev_addr_list_lock_key/3){+...}, at: dev_uc_sync+0xfa/0x1a0
[  165.679642]  #3: 00000000dc1495a2 (rcu_read_lock){....}, at: bond_set_rx_mode+0x5/0x3c0 [bonding]
[  165.679642]
[  165.679642] stack backtrace:
[  165.679642] CPU: 1 PID: 1812 Comm: ip Not tainted 5.3.0-rc7+ #322
[  165.679642] Call Trace:
[  165.679642]  dump_stack+0x7c/0xbb
[  165.679642]  __lock_acquire+0x26a9/0x3de0
[  165.679642]  ? register_lock_class+0x14d0/0x14d0
[  165.679642]  ? mark_held_locks+0xa5/0xe0
[  165.679642]  ? trace_hardirqs_on_thunk+0x1a/0x20
[  165.679642]  ? register_lock_class+0x14d0/0x14d0
[  165.679642]  lock_acquire+0x164/0x3b0
[  165.679642]  ? dev_uc_sync_multiple+0xfa/0x1a0
[  165.679642]  _raw_spin_lock_nested+0x2e/0x60
[  165.679642]  ? dev_uc_sync_multiple+0xfa/0x1a0
[  165.679642]  dev_uc_sync_multiple+0xfa/0x1a0
[  165.679642]  bond_set_rx_mode+0x269/0x3c0 [bonding]
[  165.679642]  ? bond_init+0x6f0/0x6f0 [bonding]
[  165.679642]  dev_uc_sync+0x15a/0x1a0
[  165.679642]  macvlan_set_mac_lists+0x55/0x110 [macvlan]
[  165.679642]  dev_set_rx_mode+0x21/0x30
[  165.679642]  __dev_open+0x202/0x310
[  165.679642]  ? dev_set_rx_mode+0x30/0x30
[  165.679642]  ? mark_held_locks+0xa5/0xe0
[  165.679642]  ? __local_bh_enable_ip+0xe9/0x1b0
[  165.679642]  __dev_change_flags+0x3c3/0x500
[  165.679642]  ? dev_set_allmulti+0x10/0x10
[  165.679642]  dev_change_flags+0x7a/0x160
[  ...]

Fixes: c674ac30c549 ("macvlan: Fix lockdep warnings with stacked macvlan devices")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
---
 drivers/net/macvlan.c      | 35 +++++++++++++++++++++++++++--------
 include/linux/if_macvlan.h |  2 ++
 2 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 940192c057b6..dae368a2e8d1 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -852,8 +852,6 @@ static int macvlan_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
  * "super class" of normal network devices; split their locks off into a
  * separate class since they always nest.
  */
-static struct lock_class_key macvlan_netdev_addr_lock_key;
-
 #define ALWAYS_ON_OFFLOADS \
 	(NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE | \
 	 NETIF_F_GSO_ROBUST | NETIF_F_GSO_ENCAP_ALL)
@@ -874,12 +872,30 @@ static int macvlan_get_nest_level(struct net_device *dev)
 	return ((struct macvlan_dev *)netdev_priv(dev))->nest_level;
 }
 
-static void macvlan_set_lockdep_class(struct net_device *dev)
+static void macvlan_dev_set_lockdep_one(struct net_device *dev,
+					struct netdev_queue *txq,
+					void *_unused)
+{
+	struct macvlan_dev *macvlan = netdev_priv(dev);
+
+	lockdep_set_class(&txq->_xmit_lock, &macvlan->xmit_lock_key);
+}
+
+static struct lock_class_key qdisc_tx_busylock_key;
+static struct lock_class_key qdisc_running_key;
+
+static void macvlan_dev_set_lockdep_class(struct net_device *dev)
 {
-	netdev_lockdep_set_classes(dev);
-	lockdep_set_class_and_subclass(&dev->addr_list_lock,
-				       &macvlan_netdev_addr_lock_key,
-				       macvlan_get_nest_level(dev));
+	struct macvlan_dev *macvlan = netdev_priv(dev);
+
+	dev->qdisc_tx_busylock = &qdisc_tx_busylock_key;
+	dev->qdisc_running_key = &qdisc_running_key;
+
+	lockdep_register_key(&macvlan->addr_lock_key);
+	lockdep_set_class(&dev->addr_list_lock, &macvlan->addr_lock_key);
+
+	lockdep_register_key(&macvlan->xmit_lock_key);
+	netdev_for_each_tx_queue(dev, macvlan_dev_set_lockdep_one, NULL);
 }
 
 static int macvlan_init(struct net_device *dev)
@@ -900,7 +916,7 @@ static int macvlan_init(struct net_device *dev)
 	dev->gso_max_segs	= lowerdev->gso_max_segs;
 	dev->hard_header_len	= lowerdev->hard_header_len;
 
-	macvlan_set_lockdep_class(dev);
+	macvlan_dev_set_lockdep_class(dev);
 
 	vlan->pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats);
 	if (!vlan->pcpu_stats)
@@ -922,6 +938,9 @@ static void macvlan_uninit(struct net_device *dev)
 	port->count -= 1;
 	if (!port->count)
 		macvlan_port_destroy(port->dev);
+
+	lockdep_unregister_key(&vlan->addr_lock_key);
+	lockdep_unregister_key(&vlan->xmit_lock_key);
 }
 
 static void macvlan_dev_get_stats64(struct net_device *dev,
diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
index 2e55e4cdbd8a..ea5b41823287 100644
--- a/include/linux/if_macvlan.h
+++ b/include/linux/if_macvlan.h
@@ -31,6 +31,8 @@ struct macvlan_dev {
 	u16			flags;
 	int			nest_level;
 	unsigned int		macaddr_count;
+	struct lock_class_key xmit_lock_key;
+	struct lock_class_key addr_lock_key;
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	struct netpoll		*netpoll;
 #endif
-- 
2.17.1


^ permalink raw reply related

* [PATCH net 06/11] macsec: use dynamic lockdep key instead of subclass
From: Taehee Yoo @ 2019-09-04 18:40 UTC (permalink / raw)
  To: davem, netdev, j.vosburgh, vfalico, andy, jiri, sd, roopa, saeedm,
	manishc, rahulv, kys, haiyangz, sthemmin, sashal, hare, varun,
	ubraun, kgraul
  Cc: ap420073

All macsec device has same lockdep key and subclass is initialized with
nest_level.
But actual nest_level value can be changed when a lower device is attached.
And at this moment, the subclass should be updated but it seems to be
unsafe.
So this patch makes macsec use dynamic lockdep key instead of the subclass.

Test commands:
    ip link add bond0 type bond
    ip link add dummy0 type dummy
    ip link add macsec0 link bond0 type macsec
    ip link add macsec1 link dummy0 type macsec
    ip link set bond0 mtu 1000
    ip link set macsec1 master bond0

    ip link set bond0 up
    ip link set macsec0 up
    ip link set dummy0 up
    ip link set macsec1 up

Splat looks like:
[  146.540123] ============================================
[  146.540123] WARNING: possible recursive locking detected
[  146.540123] 5.3.0-rc7+ #322 Not tainted
[  146.540123] --------------------------------------------
[  146.540123] ip/1340 is trying to acquire lock:
[  146.540123] 00000000446fd8bd (&macsec_netdev_addr_lock_key/1){+...}, at: dev_uc_sync_multiple+0xfa/0x1a0
[  146.540123]
[  146.540123] but task is already holding lock:
[  146.540123] 00000000a9ab6378 (&macsec_netdev_addr_lock_key/1){+...}, at: dev_set_rx_mode+0x19/0x30
[  146.540123]
[  146.540123] other info that might help us debug this:
[  146.540123]  Possible unsafe locking scenario:
[  146.540123]
[  146.540123]        CPU0
[  146.540123]        ----
[  146.540123]   lock(&macsec_netdev_addr_lock_key/1);
[  146.540123]   lock(&macsec_netdev_addr_lock_key/1);
[  146.623155]
[  146.623155]  *** DEADLOCK ***
[  146.623155]
[  146.623155]  May be due to missing lock nesting notation
[  146.623155]
[  146.623155] 4 locks held by ip/1340:
[  146.623155]  #0: 0000000026436ef0 (rtnl_mutex){+.+.}, at: rtnetlink_rcv_msg+0x466/0x8a0
[  146.623155]  #1: 00000000a9ab6378 (&macsec_netdev_addr_lock_key/1){+...}, at: dev_set_rx_mode+0x19/0x30
[  146.623155]  #2: 00000000a8947dd0 (&dev_addr_list_lock_key/3){+...}, at: dev_mc_sync+0xfa/0x1a0
[  146.623155]  #3: 00000000b62011e9 (rcu_read_lock){....}, at: bond_set_rx_mode+0x5/0x3c0 [bonding]
[  146.674970]
[  146.674970] stack backtrace:
[  146.687145] CPU: 0 PID: 1340 Comm: ip Not tainted 5.3.0-rc7+ #322
[  146.693024] Call Trace:
[  146.693024]  dump_stack+0x7c/0xbb
[  146.693024]  __lock_acquire+0x26a9/0x3de0
[  146.693024]  ? register_lock_class+0x14d0/0x14d0
[  146.693024]  ? register_lock_class+0x14d0/0x14d0
[  146.693024]  lock_acquire+0x164/0x3b0
[  146.693024]  ? dev_uc_sync_multiple+0xfa/0x1a0
[  146.693024]  _raw_spin_lock_nested+0x2e/0x60
[  146.693024]  ? dev_uc_sync_multiple+0xfa/0x1a0
[  146.693024]  dev_uc_sync_multiple+0xfa/0x1a0
[  146.693024]  bond_set_rx_mode+0x269/0x3c0 [bonding]
[  146.751163]  ? bond_init+0x6f0/0x6f0 [bonding]
[  146.757006]  ? do_raw_spin_trylock+0xa9/0x170
[  146.757006]  dev_mc_sync+0x15a/0x1a0
[  146.757006]  macsec_dev_set_rx_mode+0x3a/0x50 [macsec]
[  146.757006]  dev_set_rx_mode+0x21/0x30
[  146.757006]  __dev_open+0x202/0x310
[  146.757006]  ? dev_set_rx_mode+0x30/0x30
[  146.757006]  ? mark_held_locks+0xa5/0xe0
[  146.757006]  ? __local_bh_enable_ip+0xe9/0x1b0
[  146.757006]  __dev_change_flags+0x3c3/0x500
[  146.757006]  ? dev_set_allmulti+0x10/0x10
[  146.757006]  ? sched_clock_local+0xd4/0x140
[  146.757006]  ? check_chain_key+0x236/0x5d0
[  146.757006]  dev_change_flags+0x7a/0x160
[  146.757006]  do_setlink+0xa26/0x2f20
[  146.757006]  ? sched_clock_local+0xd4/0x140
[  ... ]

Fixes: e20038724552 ("macsec: fix lockdep splats when nesting devices")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
---
 drivers/net/macsec.c | 37 ++++++++++++++++++++++++++++++++-----
 1 file changed, 32 insertions(+), 5 deletions(-)

diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c
index 8f46aa1ddec0..25a4fc88145d 100644
--- a/drivers/net/macsec.c
+++ b/drivers/net/macsec.c
@@ -267,6 +267,8 @@ struct macsec_dev {
 	struct pcpu_secy_stats __percpu *stats;
 	struct list_head secys;
 	struct gro_cells gro_cells;
+	struct lock_class_key xmit_lock_key;
+	struct lock_class_key addr_lock_key;
 	unsigned int nest_level;
 };
 
@@ -2749,7 +2751,32 @@ static netdev_tx_t macsec_start_xmit(struct sk_buff *skb,
 
 #define MACSEC_FEATURES \
 	(NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST)
-static struct lock_class_key macsec_netdev_addr_lock_key;
+
+static void macsec_dev_set_lockdep_one(struct net_device *dev,
+				       struct netdev_queue *txq,
+				       void *_unused)
+{
+	struct macsec_dev *macsec = macsec_priv(dev);
+
+	lockdep_set_class(&txq->_xmit_lock, &macsec->xmit_lock_key);
+}
+
+static struct lock_class_key qdisc_tx_busylock_key;
+static struct lock_class_key qdisc_running_key;
+
+static void macsec_dev_set_lockdep_class(struct net_device *dev)
+{
+	struct macsec_dev *macsec = macsec_priv(dev);
+
+	dev->qdisc_tx_busylock = &qdisc_tx_busylock_key;
+	dev->qdisc_running_key = &qdisc_running_key;
+
+	lockdep_register_key(&macsec->addr_lock_key);
+	lockdep_set_class(&dev->addr_list_lock, &macsec->addr_lock_key);
+
+	lockdep_register_key(&macsec->xmit_lock_key);
+	netdev_for_each_tx_queue(dev, macsec_dev_set_lockdep_one, NULL);
+}
 
 static int macsec_dev_init(struct net_device *dev)
 {
@@ -2780,6 +2807,7 @@ static int macsec_dev_init(struct net_device *dev)
 	if (is_zero_ether_addr(dev->broadcast))
 		memcpy(dev->broadcast, real_dev->broadcast, dev->addr_len);
 
+	macsec_dev_set_lockdep_class(dev);
 	return 0;
 }
 
@@ -2789,6 +2817,9 @@ static void macsec_dev_uninit(struct net_device *dev)
 
 	gro_cells_destroy(&macsec->gro_cells);
 	free_percpu(dev->tstats);
+
+	lockdep_unregister_key(&macsec->addr_lock_key);
+	lockdep_unregister_key(&macsec->xmit_lock_key);
 }
 
 static netdev_features_t macsec_fix_features(struct net_device *dev,
@@ -3263,10 +3294,6 @@ static int macsec_newlink(struct net *net, struct net_device *dev,
 	dev_hold(real_dev);
 
 	macsec->nest_level = dev_get_nest_level(real_dev) + 1;
-	netdev_lockdep_set_classes(dev);
-	lockdep_set_class_and_subclass(&dev->addr_list_lock,
-				       &macsec_netdev_addr_lock_key,
-				       macsec_get_nest_level(dev));
 
 	err = netdev_upper_dev_link(real_dev, dev, extack);
 	if (err < 0)
-- 
2.17.1


^ permalink raw reply related

* [PATCH net 05/11] team: use dynamic lockdep key instead of static key
From: Taehee Yoo @ 2019-09-04 18:39 UTC (permalink / raw)
  To: davem, netdev, j.vosburgh, vfalico, andy, jiri, sd, roopa, saeedm,
	manishc, rahulv, kys, haiyangz, sthemmin, sashal, hare, varun,
	ubraun, kgraul
  Cc: ap420073

In the current code, all team devices have same static lockdep key
and team devices could be nested so that it makes unnecessary
lockdep warning.

Test commands:
    ip link add team0 type team
    for i in {1..7}
    do
	    let A=$i-1
	    ip link add team$i type team
	    ip link set team$i master team$A
    done
    ip link del team0

Splat looks like:
[  137.406730] ============================================
[  137.412685] WARNING: possible recursive locking detected
[  137.418642] 5.3.0-rc7+ #322 Not tainted
[  137.422941] --------------------------------------------
[  137.428886] ip/1383 is trying to acquire lock:
[  137.433869] 0000000089571080 (&dev_addr_list_lock_key/1){+...}, at: dev_uc_sync_multiple+0xfa/0x1a0
[  137.444034]
[  137.444034] but task is already holding lock:
[  137.450572] 00000000d9597252 (&dev_addr_list_lock_key/1){+...}, at: dev_uc_unsync+0x10c/0x1b0
[  137.460142]
[  137.460142] other info that might help us debug this:
[  137.467458]  Possible unsafe locking scenario:
[  137.467458]
[  137.474096]        CPU0
[  137.476828]        ----
[  137.479569]   lock(&dev_addr_list_lock_key/1);
[  137.484554]   lock(&dev_addr_list_lock_key/1);
[  137.489539]
[  137.489539]  *** DEADLOCK ***
[  137.489539]
[  137.496178]  May be due to missing lock nesting notation
[  137.496178]
[  137.503789] 5 locks held by ip/1383:
[  137.507797]  #0: 00000000d497f415 (rtnl_mutex){+.+.}, at: rtnetlink_rcv_msg+0x466/0x8a0
[  137.516786]  #1: 000000008e4b4656 (&team->lock){+.+.}, at: team_uninit+0x3a/0x1a0 [team]
[  137.525882]  #2: 000000005cf248d1 (&dev_addr_list_lock_key){+...}, at: dev_uc_unsync+0x98/0x1b0
[  137.535649]  #3: 00000000d9597252 (&dev_addr_list_lock_key/1){+...}, at: dev_uc_unsync+0x10c/0x1b0
[  137.545709]  #4: 00000000bec134c3 (rcu_read_lock){....}, at: team_set_rx_mode+0x5/0x1d0 [team]
[  137.555384]
[  137.555384] stack backtrace:
[  137.560277] CPU: 0 PID: 1383 Comm: ip Not tainted 5.3.0-rc7+ #322
[  137.577826] Call Trace:
[  137.580586]  dump_stack+0x7c/0xbb
[  137.584307]  __lock_acquire+0x26a9/0x3de0
[  137.588820]  ? register_lock_class+0x14d0/0x14d0
[  137.594008]  ? register_lock_class+0x14d0/0x14d0
[  137.599194]  lock_acquire+0x164/0x3b0
[  137.603310]  ? dev_uc_sync_multiple+0xfa/0x1a0
[  137.608307]  _raw_spin_lock_nested+0x2e/0x60
[  137.613105]  ? dev_uc_sync_multiple+0xfa/0x1a0
[  137.618095]  dev_uc_sync_multiple+0xfa/0x1a0
[  137.622900]  team_set_rx_mode+0xa9/0x1d0 [team]
[  137.627993]  dev_uc_unsync+0x151/0x1b0
[  137.632205]  team_port_del+0x304/0x790 [team]
[  137.637110]  team_uninit+0xb0/0x1a0 [team]
[  137.641717]  rollback_registered_many+0x728/0xda0
[  137.647005]  ? generic_xdp_install+0x310/0x310
[  137.651994]  ? __set_pages_p+0xf4/0x150
[  137.656306]  ? check_chain_key+0x236/0x5d0
[  137.660914]  ? __nla_validate_parse+0x98/0x1ad0
[  137.666006]  unregister_netdevice_many.part.120+0x13/0x1b0
[  137.672167]  rtnl_delete_link+0xbc/0x100
[  137.676575]  ? rtnl_af_register+0xc0/0xc0
[  137.681084]  rtnl_dellink+0x2e7/0x870
[  137.685204]  ? find_held_lock+0x39/0x1d0
[  ... ]

Fixes: 3d249d4ca7d0 ("net: introduce ethernet teaming device")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
---
 drivers/net/team/team.c | 61 ++++++++++++++++++++++++++++++++++++++---
 include/linux/if_team.h |  5 ++++
 2 files changed, 62 insertions(+), 4 deletions(-)

diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index e8089def5a46..bfcd6ed57493 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -1607,6 +1607,34 @@ static const struct team_option team_options[] = {
 	},
 };
 
+static void team_dev_set_lockdep_one(struct net_device *dev,
+				     struct netdev_queue *txq,
+				     void *_unused)
+{
+	struct team *team = netdev_priv(dev);
+
+	lockdep_set_class(&txq->_xmit_lock, &team->xmit_lock_key);
+}
+
+static struct lock_class_key qdisc_tx_busylock_key;
+static struct lock_class_key qdisc_running_key;
+
+static void team_dev_set_lockdep_class(struct net_device *dev)
+{
+	struct team *team = netdev_priv(dev);
+
+	dev->qdisc_tx_busylock = &qdisc_tx_busylock_key;
+	dev->qdisc_running_key = &qdisc_running_key;
+
+	lockdep_register_key(&team->team_lock_key);
+	__mutex_init(&team->lock, "team->team_lock_key", &team->team_lock_key);
+
+	lockdep_register_key(&team->addr_lock_key);
+	lockdep_set_class(&dev->addr_list_lock, &team->addr_lock_key);
+
+	lockdep_register_key(&team->xmit_lock_key);
+	netdev_for_each_tx_queue(dev, team_dev_set_lockdep_one, NULL);
+}
 
 static int team_init(struct net_device *dev)
 {
@@ -1615,7 +1643,6 @@ static int team_init(struct net_device *dev)
 	int err;
 
 	team->dev = dev;
-	mutex_init(&team->lock);
 	team_set_no_mode(team);
 
 	team->pcpu_stats = netdev_alloc_pcpu_stats(struct team_pcpu_stats);
@@ -1642,7 +1669,7 @@ static int team_init(struct net_device *dev)
 		goto err_options_register;
 	netif_carrier_off(dev);
 
-	netdev_lockdep_set_classes(dev);
+	team_dev_set_lockdep_class(dev);
 
 	return 0;
 
@@ -1673,6 +1700,11 @@ static void team_uninit(struct net_device *dev)
 	team_queue_override_fini(team);
 	mutex_unlock(&team->lock);
 	netdev_change_features(dev);
+
+	lockdep_unregister_key(&team->team_lock_key);
+	lockdep_unregister_key(&team->addr_lock_key);
+	lockdep_unregister_key(&team->xmit_lock_key);
+
 }
 
 static void team_destructor(struct net_device *dev)
@@ -1967,6 +1999,23 @@ static int team_add_slave(struct net_device *dev, struct net_device *port_dev,
 	return err;
 }
 
+static void team_update_lock_key(struct net_device *dev)
+{
+	struct team *team = netdev_priv(dev);
+
+	lockdep_unregister_key(&team->team_lock_key);
+	lockdep_unregister_key(&team->addr_lock_key);
+	lockdep_unregister_key(&team->xmit_lock_key);
+
+	lockdep_register_key(&team->team_lock_key);
+	lockdep_register_key(&team->addr_lock_key);
+	lockdep_register_key(&team->xmit_lock_key);
+
+	lockdep_set_class(&team->lock, &team->team_lock_key);
+	lockdep_set_class(&dev->addr_list_lock, &team->addr_lock_key);
+	netdev_for_each_tx_queue(dev, team_dev_set_lockdep_one, NULL);
+}
+
 static int team_del_slave(struct net_device *dev, struct net_device *port_dev)
 {
 	struct team *team = netdev_priv(dev);
@@ -1976,8 +2025,12 @@ static int team_del_slave(struct net_device *dev, struct net_device *port_dev)
 	err = team_port_del(team, port_dev);
 	mutex_unlock(&team->lock);
 
-	if (!err)
-		netdev_change_features(dev);
+	if (err)
+		return err;
+
+	if (netif_is_team_master(port_dev))
+		team_update_lock_key(port_dev);
+	netdev_change_features(dev);
 
 	return err;
 }
diff --git a/include/linux/if_team.h b/include/linux/if_team.h
index 06faa066496f..9c97bb19ed34 100644
--- a/include/linux/if_team.h
+++ b/include/linux/if_team.h
@@ -223,6 +223,11 @@ struct team {
 		atomic_t count_pending;
 		struct delayed_work dw;
 	} mcast_rejoin;
+
+	struct lock_class_key team_lock_key;
+	struct lock_class_key xmit_lock_key;
+	struct lock_class_key addr_lock_key;
+
 	long mode_priv[TEAM_MODE_PRIV_LONGS];
 };
 
-- 
2.17.1


^ permalink raw reply related

* [PATCH net 04/11] bonding: use dynamic lockdep key instead of subclass
From: Taehee Yoo @ 2019-09-04 18:39 UTC (permalink / raw)
  To: davem, netdev, j.vosburgh, vfalico, andy, jiri, sd, roopa, saeedm,
	manishc, rahulv, kys, haiyangz, sthemmin, sashal, hare, varun,
	ubraun, kgraul
  Cc: ap420073

All bonding device has same lockdep key and subclass is initialized with
nest_level.
But actual nest_level value can be changed when a lower device is attached.
And at this moment, the subclass should be updated but it seems to be
unsafe.
So this patch makes bonding use dynamic lockdep key instead of the
subclass.

Test commands:
    ip link add bond0 type bond

    for i in {1..5}
    do
	    let A=$i-1
	    ip link add bond$i type bond
	    ip link set bond$i master bond$A
    done
    ip link set bond5 master bond0

Splat looks like:
[  327.477830] ============================================
[  327.477830] WARNING: possible recursive locking detected
[  327.477830] 5.3.0-rc7+ #322 Not tainted
[  327.477830] --------------------------------------------
[  327.477830] ip/1399 is trying to acquire lock:
[  327.477830] 00000000f604be63 (&(&bond->stats_lock)->rlock#2/2){+.+.}, at: bond_get_stats+0xb8/0x500 [bonding]
[  327.477830]
[  327.477830] but task is already holding lock:
[  327.477830] 00000000e9d31238 (&(&bond->stats_lock)->rlock#2/2){+.+.}, at: bond_get_stats+0xb8/0x500 [bonding]
[  327.477830]
[  327.477830] other info that might help us debug this:
[  327.477830]  Possible unsafe locking scenario:
[  327.477830]
[  327.477830]        CPU0
[  327.477830]        ----
[  327.477830]   lock(&(&bond->stats_lock)->rlock#2/2);
[  327.477830]   lock(&(&bond->stats_lock)->rlock#2/2);
[  327.477830]
[  327.477830]  *** DEADLOCK ***
[  327.477830]
[  327.477830]  May be due to missing lock nesting notation
[  327.477830]
[  327.477830] 3 locks held by ip/1399:
[  327.477830]  #0: 00000000a762c4e3 (rtnl_mutex){+.+.}, at: rtnetlink_rcv_msg+0x466/0x8a0
[  327.477830]  #1: 00000000e9d31238 (&(&bond->stats_lock)->rlock#2/2){+.+.}, at: bond_get_stats+0xb8/0x500 [bonding]
[  327.477830]  #2: 000000008f7ebff4 (rcu_read_lock){....}, at: bond_get_stats+0x9f/0x500 [bonding]
[  327.477830]
[  327.477830] stack backtrace:
[  327.477830] CPU: 0 PID: 1399 Comm: ip Not tainted 5.3.0-rc7+ #322
[  327.477830] Call Trace:
[  327.477830]  dump_stack+0x7c/0xbb
[  327.477830]  __lock_acquire+0x26a9/0x3de0
[  327.477830]  ? __change_page_attr_set_clr+0x133b/0x1d20
[  327.477830]  ? register_lock_class+0x14d0/0x14d0
[  327.477830]  lock_acquire+0x164/0x3b0
[  327.477830]  ? bond_get_stats+0xb8/0x500 [bonding]
[  327.666914]  _raw_spin_lock_nested+0x2e/0x60
[  327.666914]  ? bond_get_stats+0xb8/0x500 [bonding]
[  327.678302]  bond_get_stats+0xb8/0x500 [bonding]
[  327.678302]  ? bond_arp_rcv+0xf10/0xf10 [bonding]
[  327.678302]  ? register_lock_class+0x14d0/0x14d0
[  327.678302]  ? bond_get_stats+0xb8/0x500 [bonding]
[  327.678302]  dev_get_stats+0x1ec/0x270
[  327.678302]  bond_get_stats+0x1d1/0x500 [bonding]
[  327.678302]  ? lock_acquire+0x164/0x3b0
[  327.678302]  ? bond_arp_rcv+0xf10/0xf10 [bonding]
[  327.678302]  ? rtnl_is_locked+0x16/0x30
[  327.678302]  ? devlink_compat_switch_id_get+0x18/0x140
[  327.678302]  ? dev_get_alias+0xe2/0x190
[  327.731145]  ? dev_get_port_parent_id+0x12a/0x340
[  327.731145]  ? rtnl_phys_switch_id_fill+0x88/0xe0
[  327.731145]  dev_get_stats+0x1ec/0x270
[  327.731145]  rtnl_fill_stats+0x44/0xbe0
[  327.731145]  ? nla_put+0xc2/0x140
[  ... ]

Fixes: d3fff6c443fe ("net: add netdev_lockdep_set_classes() helper")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
---
 drivers/net/bonding/bond_main.c | 60 ++++++++++++++++++++++++++++++---
 include/net/bonding.h           |  3 ++
 2 files changed, 59 insertions(+), 4 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index abd008c31c9a..2b16683bb8b8 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1856,6 +1856,32 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev,
 	return res;
 }
 
+static void bond_dev_set_lockdep_one(struct net_device *dev,
+				     struct netdev_queue *txq,
+				     void *_unused)
+{
+	struct bonding *bond = netdev_priv(dev);
+
+	lockdep_set_class(&txq->_xmit_lock, &bond->xmit_lock_key);
+}
+
+static void bond_update_lock_key(struct net_device *dev)
+{
+	struct bonding *bond = netdev_priv(dev);
+
+	lockdep_unregister_key(&bond->stats_lock_key);
+	lockdep_unregister_key(&bond->addr_lock_key);
+	lockdep_unregister_key(&bond->xmit_lock_key);
+
+	lockdep_register_key(&bond->stats_lock_key);
+	lockdep_register_key(&bond->addr_lock_key);
+	lockdep_register_key(&bond->xmit_lock_key);
+
+	lockdep_set_class(&bond->stats_lock, &bond->stats_lock_key);
+	lockdep_set_class(&dev->addr_list_lock, &bond->addr_lock_key);
+	netdev_for_each_tx_queue(dev, bond_dev_set_lockdep_one, NULL);
+}
+
 /* Try to release the slave device <slave> from the bond device <master>
  * It is legal to access curr_active_slave without a lock because all the function
  * is RTNL-locked. If "all" is true it means that the function is being called
@@ -2020,6 +2046,8 @@ static int __bond_release_one(struct net_device *bond_dev,
 	slave_dev->priv_flags &= ~IFF_BONDING_SLAVE;
 
 	bond_free_slave(slave);
+	if (netif_is_bond_master(slave_dev))
+		bond_update_lock_key(slave_dev);
 
 	return 0;
 }
@@ -3454,7 +3482,7 @@ static void bond_get_stats(struct net_device *bond_dev,
 	struct list_head *iter;
 	struct slave *slave;
 
-	spin_lock_nested(&bond->stats_lock, bond_get_nest_level(bond_dev));
+	spin_lock(&bond->stats_lock);
 	memcpy(stats, &bond->bond_stats, sizeof(*stats));
 
 	rcu_read_lock();
@@ -4292,8 +4320,6 @@ void bond_setup(struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 
-	spin_lock_init(&bond->mode_lock);
-	spin_lock_init(&bond->stats_lock);
 	bond->params = bonding_defaults;
 
 	/* Initialize pointers */
@@ -4362,6 +4388,9 @@ static void bond_uninit(struct net_device *bond_dev)
 
 	list_del(&bond->bond_list);
 
+	lockdep_unregister_key(&bond->stats_lock_key);
+	lockdep_unregister_key(&bond->addr_lock_key);
+	lockdep_unregister_key(&bond->xmit_lock_key);
 	bond_debug_unregister(bond);
 }
 
@@ -4753,6 +4782,29 @@ static int bond_check_params(struct bond_params *params)
 	return 0;
 }
 
+static struct lock_class_key qdisc_tx_busylock_key;
+static struct lock_class_key qdisc_running_key;
+
+static void bond_dev_set_lockdep_class(struct net_device *dev)
+{
+	struct bonding *bond = netdev_priv(dev);
+
+	dev->qdisc_tx_busylock = &qdisc_tx_busylock_key;
+	dev->qdisc_running_key = &qdisc_running_key;
+
+	spin_lock_init(&bond->mode_lock);
+
+	spin_lock_init(&bond->stats_lock);
+	lockdep_register_key(&bond->stats_lock_key);
+	lockdep_set_class(&bond->stats_lock, &bond->stats_lock_key);
+
+	lockdep_register_key(&bond->addr_lock_key);
+	lockdep_set_class(&dev->addr_list_lock, &bond->addr_lock_key);
+
+	lockdep_register_key(&bond->xmit_lock_key);
+	netdev_for_each_tx_queue(dev, bond_dev_set_lockdep_one, NULL);
+}
+
 /* Called from registration process */
 static int bond_init(struct net_device *bond_dev)
 {
@@ -4766,7 +4818,7 @@ static int bond_init(struct net_device *bond_dev)
 		return -ENOMEM;
 
 	bond->nest_level = SINGLE_DEPTH_NESTING;
-	netdev_lockdep_set_classes(bond_dev);
+	bond_dev_set_lockdep_class(bond_dev);
 
 	list_add_tail(&bond->bond_list, &bn->dev_list);
 
diff --git a/include/net/bonding.h b/include/net/bonding.h
index f7fe45689142..c39ac7061e41 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -239,6 +239,9 @@ struct bonding {
 	struct	 dentry *debug_dir;
 #endif /* CONFIG_DEBUG_FS */
 	struct rtnl_link_stats64 bond_stats;
+	struct lock_class_key stats_lock_key;
+	struct lock_class_key xmit_lock_key;
+	struct lock_class_key addr_lock_key;
 };
 
 #define bond_slave_get_rcu(dev) \
-- 
2.17.1


^ permalink raw reply related

* [PATCH net 03/11] bonding: split IFF_BONDING into IFF_BONDING and IFF_BONDING_SLAVE
From: Taehee Yoo @ 2019-09-04 18:39 UTC (permalink / raw)
  To: davem, netdev, j.vosburgh, vfalico, andy, jiri, sd, roopa, saeedm,
	manishc, rahulv, kys, haiyangz, sthemmin, sashal, hare, varun,
	ubraun, kgraul
  Cc: ap420073

The IFF_BONDING means bonding master or bonding slave device.

->ndo_add_slave() sets IFF_BONDING flag and ->ndo_del_slave() removes
IFF_BONDING flag.
This routine makes a problem in the nesting bonding structure.

bond1<--bond2

Both bond0 and bond1 are bonding device and these should keep having
IFF_BONDING flag until they are removed.
But bond1 would lose IFF_BONDING at ->ndo_del_slave because that routine
can not check whether the slave device is the bonding type or not.
So that this patch splits the IFF_BONDING into theIFF_BONDING and
the IFF_BONDING_SLAVE. The IFF_BONDING is bonding master flag and
IFF_BONDING_SLAVE is bonding slave flag.

Test commands:
    ip link add bond0 type bond
    ip link add bond1 type bond
    ip link set bond1 master bond0
    ip link set bond1 nomaster
    ip link del bond1 type bond
    ip link add bond1 type bond

Splat looks like:
[  149.201107] proc_dir_entry 'bonding/bond1' already registered
[  149.208013] WARNING: CPU: 1 PID: 1308 at fs/proc/generic.c:361 proc_register+0x2a9/0x3e0
[  149.208866] Modules linked in: bonding veth openvswitch nsh nf_conncount nf_nat nf_conntrack nf_defrag_ipv4 ip_tables6
[  149.208866] CPU: 1 PID: 1308 Comm: ip Not tainted 5.3.0-rc7+ #322
[  149.208866] RIP: 0010:proc_register+0x2a9/0x3e0
[  149.208866] Code: 89 fa 48 c1 ea 03 80 3c 02 00 0f 85 39 01 00 00 48 8b 04 24 48 89 ea 48 c7 c7 a0 a0 13 89 48 8b b0 0
[  149.208866] RSP: 0018:ffff88810df9f098 EFLAGS: 00010286
[  149.208866] RAX: dffffc0000000008 RBX: ffff8880b5d3aa50 RCX: ffffffff87cdec92
[  149.208866] RDX: 0000000000000001 RSI: 0000000000000008 RDI: ffff888116bf6a8c
[  149.208866] RBP: ffff8880b5d3acd3 R08: ffffed1022d7ff71 R09: ffffed1022d7ff71
[  149.208866] R10: 0000000000000001 R11: ffffed1022d7ff70 R12: ffff8880b5d3abe8
[  149.208866] R13: ffff8880b5d3acd2 R14: dffffc0000000000 R15: ffffed1016ba759a
[  149.208866] FS:  00007f4bd1f650c0(0000) GS:ffff888116a00000(0000) knlGS:0000000000000000
[  149.208866] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  149.208866] CR2: 000055e7ca686118 CR3: 0000000106fd4000 CR4: 00000000001006e0
[  149.208866] Call Trace:
[  149.208866]  proc_create_seq_private+0xb3/0xf0
[  149.208866]  bond_create_proc_entry+0x1b3/0x3f0 [bonding]
[  149.208866]  bond_netdev_event+0x433/0x970 [bonding]
[  149.208866]  ? __module_text_address+0x13/0x140
[  149.208866]  notifier_call_chain+0x90/0x160
[  149.208866]  register_netdevice+0x9b3/0xd70
[  149.208866]  ? alloc_netdev_mqs+0x854/0xc10
[  149.208866]  ? netdev_change_features+0xa0/0xa0
[  149.208866]  ? rtnl_create_link+0x2ed/0xad0
[  149.208866]  bond_newlink+0x2a/0x60 [bonding]
[  149.208866]  __rtnl_newlink+0xb75/0x1180
[  ... ]

Fixes: 0b680e753724 ("[PATCH] bonding: Add priv_flag to avoid event mishandling")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
---
 drivers/net/bonding/bond_main.c                     | 13 +++++--------
 .../net/ethernet/qlogic/netxen/netxen_nic_main.c    |  2 +-
 drivers/net/hyperv/netvsc_drv.c                     |  3 +--
 drivers/scsi/fcoe/fcoe.c                            |  2 +-
 drivers/target/iscsi/cxgbit/cxgbit_cm.c             |  2 +-
 include/linux/netdevice.h                           |  9 ++++++---
 6 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 931d9d935686..abd008c31c9a 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1560,7 +1560,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev,
 		goto err_restore_mac;
 	}
 
-	slave_dev->priv_flags |= IFF_BONDING;
+	slave_dev->priv_flags |= IFF_BONDING_SLAVE;
 	/* initialize slave stats */
 	dev_get_stats(new_slave->dev, &new_slave->slave_stats);
 
@@ -1816,7 +1816,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev,
 	slave_disable_netpoll(new_slave);
 
 err_close:
-	slave_dev->priv_flags &= ~IFF_BONDING;
+	slave_dev->priv_flags &= ~IFF_BONDING_SLAVE;
 	dev_close(slave_dev);
 
 err_restore_mac:
@@ -2017,7 +2017,7 @@ static int __bond_release_one(struct net_device *bond_dev,
 	else
 		dev_set_mtu(slave_dev, slave->original_mtu);
 
-	slave_dev->priv_flags &= ~IFF_BONDING;
+	slave_dev->priv_flags &= ~IFF_BONDING_SLAVE;
 
 	bond_free_slave(slave);
 
@@ -3221,10 +3221,7 @@ static int bond_netdev_event(struct notifier_block *this,
 	netdev_dbg(event_dev, "%s received %s\n",
 		   __func__, netdev_cmd_to_name(event));
 
-	if (!(event_dev->priv_flags & IFF_BONDING))
-		return NOTIFY_DONE;
-
-	if (event_dev->flags & IFF_MASTER) {
+	if (netif_is_bond_master(event_dev)) {
 		int ret;
 
 		ret = bond_master_netdev_event(event, event_dev);
@@ -3232,7 +3229,7 @@ static int bond_netdev_event(struct notifier_block *this,
 			return ret;
 	}
 
-	if (event_dev->flags & IFF_SLAVE)
+	if (netif_is_bond_slave(event_dev))
 		return bond_slave_netdev_event(event, event_dev);
 
 	return NOTIFY_DONE;
diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c
index 58e2eaf77014..5e0389ba1f13 100644
--- a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c
+++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c
@@ -3340,7 +3340,7 @@ static void netxen_config_master(struct net_device *dev, unsigned long event)
 	 * released and is dev_close()ed in bond_release()
 	 * just before IFF_BONDING is stripped.
 	 */
-	if (!master && dev->priv_flags & IFF_BONDING)
+	if (!master && netif_is_bond_slave(dev))
 		netxen_free_ip_list(adapter, true);
 }
 
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index e8fce6d715ef..6831202d9bcb 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -2439,8 +2439,7 @@ static int netvsc_netdev_event(struct notifier_block *this,
 		return NOTIFY_DONE;
 
 	/* Avoid Bonding master dev with same MAC registering as VF */
-	if ((event_dev->priv_flags & IFF_BONDING) &&
-	    (event_dev->flags & IFF_MASTER))
+	if (netif_is_bond_master(event_dev))
 		return NOTIFY_DONE;
 
 	switch (event) {
diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c
index 00dd47bcbb1e..750a6540eb9d 100644
--- a/drivers/scsi/fcoe/fcoe.c
+++ b/drivers/scsi/fcoe/fcoe.c
@@ -307,7 +307,7 @@ static int fcoe_interface_setup(struct fcoe_interface *fcoe,
 	}
 
 	/* Do not support for bonding device */
-	if (netdev->priv_flags & IFF_BONDING && netdev->flags & IFF_MASTER) {
+	if (netif_is_bond_master(netdev)) {
 		FCOE_NETDEV_DBG(netdev, "Bonded interfaces not supported\n");
 		return -EOPNOTSUPP;
 	}
diff --git a/drivers/target/iscsi/cxgbit/cxgbit_cm.c b/drivers/target/iscsi/cxgbit/cxgbit_cm.c
index c70caf4ea490..16c8cae333b2 100644
--- a/drivers/target/iscsi/cxgbit/cxgbit_cm.c
+++ b/drivers/target/iscsi/cxgbit/cxgbit_cm.c
@@ -247,7 +247,7 @@ struct cxgbit_device *cxgbit_find_device(struct net_device *ndev, u8 *port_id)
 
 static struct net_device *cxgbit_get_real_dev(struct net_device *ndev)
 {
-	if (ndev->priv_flags & IFF_BONDING) {
+	if (netif_is_bond_master(ndev) || netif_is_bond_slave(ndev)) {
 		pr_err("Bond devices are not supported. Interface:%s\n",
 		       ndev->name);
 		return NULL;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5bb5756129af..a2c47f43e54b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1441,7 +1441,7 @@ struct net_device_ops {
  *
  * @IFF_802_1Q_VLAN: 802.1Q VLAN device
  * @IFF_EBRIDGE: Ethernet bridging device
- * @IFF_BONDING: bonding master or slave
+ * @IFF_BONDING: bonding master
  * @IFF_ISATAP: ISATAP interface (RFC4214)
  * @IFF_WAN_HDLC: WAN HDLC device
  * @IFF_XMIT_DST_RELEASE: dev_hard_start_xmit() is allowed to
@@ -1474,6 +1474,7 @@ struct net_device_ops {
  * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device
  * @IFF_L3MDEV_RX_HANDLER: only invoke the rx handler of L3 master device
  * @IFF_LIVE_RENAME_OK: rename is allowed while device is up and running
+ * @IFF_BONDING_SLAVE: bonding slave
  */
 enum netdev_priv_flags {
 	IFF_802_1Q_VLAN			= 1<<0,
@@ -1507,6 +1508,7 @@ enum netdev_priv_flags {
 	IFF_FAILOVER_SLAVE		= 1<<28,
 	IFF_L3MDEV_RX_HANDLER		= 1<<29,
 	IFF_LIVE_RENAME_OK		= 1<<30,
+	IFF_BONDING_SLAVE		= 1<<31,
 };
 
 #define IFF_802_1Q_VLAN			IFF_802_1Q_VLAN
@@ -1539,6 +1541,7 @@ enum netdev_priv_flags {
 #define IFF_FAILOVER_SLAVE		IFF_FAILOVER_SLAVE
 #define IFF_L3MDEV_RX_HANDLER		IFF_L3MDEV_RX_HANDLER
 #define IFF_LIVE_RENAME_OK		IFF_LIVE_RENAME_OK
+#define IFF_BONDING_SLAVE		IFF_BONDING_SLAVE
 
 /**
  *	struct net_device - The DEVICE structure.
@@ -4569,12 +4572,12 @@ static inline bool netif_is_macvlan_port(const struct net_device *dev)
 
 static inline bool netif_is_bond_master(const struct net_device *dev)
 {
-	return dev->flags & IFF_MASTER && dev->priv_flags & IFF_BONDING;
+	return dev->priv_flags & IFF_BONDING;
 }
 
 static inline bool netif_is_bond_slave(const struct net_device *dev)
 {
-	return dev->flags & IFF_SLAVE && dev->priv_flags & IFF_BONDING;
+	return dev->priv_flags & IFF_BONDING_SLAVE;
 }
 
 static inline bool netif_supports_nofcs(struct net_device *dev)
-- 
2.17.1


^ permalink raw reply related

* [PATCH net 02/11] vlan: use dynamic lockdep key instead of subclass
From: Taehee Yoo @ 2019-09-04 18:39 UTC (permalink / raw)
  To: davem, netdev, j.vosburgh, vfalico, andy, jiri, sd, roopa, saeedm,
	manishc, rahulv, kys, haiyangz, sthemmin, sashal, hare, varun,
	ubraun, kgraul
  Cc: ap420073

All VLAN device has same lockdep key and subclass is initialized with
nest_level.
But actual nest_level value can be changed when a lower device is attached.
And at this moment, the subclass should be updated but it seems to be
unsafe.
So this patch makes VLAN use dynamic lockdep key instead of the subclass.

Test commands:
   ip link add dummy0 type dummy
   ip link set dummy0 up
   ip link add bond0 type bond

   ip link add vlan_dummy1 link dummy0 type vlan id 1
   ip link add vlan_bond1 link bond0 type vlan id 2
   ip link set vlan_dummy1 master bond0

   ip link set bond0 up
   ip link set vlan_dummy1 up
   ip link set vlan_bond1 up

Both vlan_dummy1 and vlan_bond1 have the same subclass and it makes
unnecessary deadlock warning message.

Splat looks like:
[  149.244978] ============================================
[  149.244978] WARNING: possible recursive locking detected
[  149.244978] 5.3.0-rc7+ #322 Not tainted
[  149.244978] --------------------------------------------
[  149.244978] ip/1340 is trying to acquire lock:
[  149.244978] 000000001399b1a7 (&vlan_netdev_addr_lock_key/1){+...}, at: dev_uc_sync_multiple+0xfa/0x1a0
[  149.279600]
[  149.279600] but task is already holding lock:
[  149.279600] 00000000b963d9b4 (&vlan_netdev_addr_lock_key/1){+...}, at: dev_set_rx_mode+0x19/0x30
[  149.279600]
[  149.279600] other info that might help us debug this:
[  149.305981]  Possible unsafe locking scenario:
[  149.305981]
[  149.305981]        CPU0
[  149.305981]        ----
[  149.305981]   lock(&vlan_netdev_addr_lock_key/1);
[  149.305981]   lock(&vlan_netdev_addr_lock_key/1);
[  149.326258]
[  149.326258]  *** DEADLOCK ***
[  149.326258]
[  149.326258]  May be due to missing lock nesting notation
[  149.326258]
[  149.326258] 4 locks held by ip/1340:
[  149.326258]  #0: 00000000927f0698 (rtnl_mutex){+.+.}, at: rtnetlink_rcv_msg+0x466/0x8a0
[  149.326258]  #1: 00000000b963d9b4 (&vlan_netdev_addr_lock_key/1){+...}, at: dev_set_rx_mode+0x19/0x30
[  149.326258]  #2: 0000000027395445 (&dev_addr_list_lock_key/3){+...}, at: dev_mc_sync+0xfa/0x1a0
[  149.369961]  #3: 00000000ce334932 (rcu_read_lock){....}, at: bond_set_rx_mode+0x5/0x3c0 [bonding]
[  149.369961]
[  149.369961] stack backtrace:
[  149.369961] CPU: 1 PID: 1340 Comm: ip Not tainted 5.3.0-rc7+ #322
[  149.369961] Call Trace:
[  149.369961]  dump_stack+0x7c/0xbb
[  149.369961]  __lock_acquire+0x26a9/0x3de0
[  149.369961]  ? register_lock_class+0x14d0/0x14d0
[  149.369961]  ? register_lock_class+0x14d0/0x14d0
[  149.369961]  lock_acquire+0x164/0x3b0
[  149.433970]  ? dev_uc_sync_multiple+0xfa/0x1a0
[  149.433970]  _raw_spin_lock_nested+0x2e/0x60
[  149.433970]  ? dev_uc_sync_multiple+0xfa/0x1a0
[  149.433970]  dev_uc_sync_multiple+0xfa/0x1a0
[  149.433970]  bond_set_rx_mode+0x269/0x3c0 [bonding]
[  149.433970]  ? bond_init+0x6f0/0x6f0 [bonding]
[  149.433970]  dev_mc_sync+0x15a/0x1a0
[  149.433970]  vlan_dev_set_rx_mode+0x37/0x80 [8021q]
[  149.433970]  dev_set_rx_mode+0x21/0x30
[  149.433970]  __dev_open+0x202/0x310
[  149.433970]  ? dev_set_rx_mode+0x30/0x30
[  149.433970]  ? mark_held_locks+0xa5/0xe0
[  149.433970]  ? __local_bh_enable_ip+0xe9/0x1b0
[  149.433970]  __dev_change_flags+0x3c3/0x500
[  ... ]

Fixes: 0fe1e567d0b4 ("[VLAN]: nested VLAN: fix lockdep's recursive locking warning")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
---
 include/linux/if_vlan.h |  3 +++
 net/8021q/vlan_dev.c    | 28 +++++++++++++++-------------
 2 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 244278d5c222..1aed9f613e90 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -183,6 +183,9 @@ struct vlan_dev_priv {
 	struct netpoll				*netpoll;
 #endif
 	unsigned int				nest_level;
+
+	struct lock_class_key			xmit_lock_key;
+	struct lock_class_key			addr_lock_key;
 };
 
 static inline struct vlan_dev_priv *vlan_dev_priv(const struct net_device *dev)
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 93eadf179123..12bc80650087 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -494,24 +494,24 @@ static void vlan_dev_set_rx_mode(struct net_device *vlan_dev)
  * "super class" of normal network devices; split their locks off into a
  * separate class since they always nest.
  */
-static struct lock_class_key vlan_netdev_xmit_lock_key;
-static struct lock_class_key vlan_netdev_addr_lock_key;
-
 static void vlan_dev_set_lockdep_one(struct net_device *dev,
 				     struct netdev_queue *txq,
-				     void *_subclass)
+				     void *_unused)
 {
-	lockdep_set_class_and_subclass(&txq->_xmit_lock,
-				       &vlan_netdev_xmit_lock_key,
-				       *(int *)_subclass);
+	struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+
+	lockdep_set_class(&txq->_xmit_lock, &vlan->xmit_lock_key);
 }
 
-static void vlan_dev_set_lockdep_class(struct net_device *dev, int subclass)
+static void vlan_dev_set_lockdep_class(struct net_device *dev)
 {
-	lockdep_set_class_and_subclass(&dev->addr_list_lock,
-				       &vlan_netdev_addr_lock_key,
-				       subclass);
-	netdev_for_each_tx_queue(dev, vlan_dev_set_lockdep_one, &subclass);
+	struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+
+	lockdep_register_key(&vlan->addr_lock_key);
+	lockdep_set_class(&dev->addr_list_lock, &vlan->addr_lock_key);
+
+	lockdep_register_key(&vlan->xmit_lock_key);
+	netdev_for_each_tx_queue(dev, vlan_dev_set_lockdep_one, NULL);
 }
 
 static int vlan_dev_get_lock_subclass(struct net_device *dev)
@@ -609,7 +609,7 @@ static int vlan_dev_init(struct net_device *dev)
 
 	SET_NETDEV_DEVTYPE(dev, &vlan_type);
 
-	vlan_dev_set_lockdep_class(dev, vlan_dev_get_lock_subclass(dev));
+	vlan_dev_set_lockdep_class(dev);
 
 	vlan->vlan_pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats);
 	if (!vlan->vlan_pcpu_stats)
@@ -630,6 +630,8 @@ static void vlan_dev_uninit(struct net_device *dev)
 			kfree(pm);
 		}
 	}
+	lockdep_unregister_key(&vlan->addr_lock_key);
+	lockdep_unregister_key(&vlan->xmit_lock_key);
 }
 
 static netdev_features_t vlan_dev_fix_features(struct net_device *dev,
-- 
2.17.1


^ permalink raw reply related

* [PATCH net 01/11] net: core: limit nested device depth
From: Taehee Yoo @ 2019-09-04 18:38 UTC (permalink / raw)
  To: davem, netdev, j.vosburgh, vfalico, andy, jiri, sd, roopa, saeedm,
	manishc, rahulv, kys, haiyangz, sthemmin, sashal, hare, varun,
	ubraun, kgraul
  Cc: ap420073

Current code doesn't limit the number of nested devices.
Nested devices would be handled recursively and this needs huge stack
memory. So, unlimited nested devices could make stack overflow.

This patch adds upper_level and lower_leve, they are common variables
and represent maximum lower/upper depth.
When upper/lower device is attached or dettached,
{lower/upper}_level are updated. and if maximum depth is bigger than 8,
attach routine fails and returns -EMLINK.

Test commands:
    ip link add dummy0 type dummy
    ip link add link dummy0 name vlan1 type vlan id 1
    ip link set vlan1 up

    for i in {2..100}
    do
	    let A=$i-1

	    ip link add name vlan$i link vlan$A type vlan id $i
    done

Splat looks like:
[  140.483124] BUG: looking up invalid subclass: 8
[  140.483505] turning off the locking correctness validator.
[  140.483505] CPU: 0 PID: 1324 Comm: ip Not tainted 5.3.0-rc7+ #322
[  140.483505] Hardware name: To be filled by O.E.M. To be filled by O.E.M./Aptio CRB, BIOS 5.6.5 07/08/2015
[  140.483505] Call Trace:
[  140.483505]  dump_stack+0x7c/0xbb
[  140.483505]  register_lock_class+0x64d/0x14d0
[  140.483505]  ? is_dynamic_key+0x230/0x230
[  140.483505]  ? module_assert_mutex_or_preempt+0x41/0x70
[  140.483505]  ? __module_address+0x3f/0x3c0
[  140.483505]  lockdep_init_map+0x24e/0x630
[  140.483505]  vlan_dev_init+0x828/0xce0 [8021q]
[  140.483505]  register_netdevice+0x24f/0xd70
[  140.483505]  ? netdev_change_features+0xa0/0xa0
[  140.483505]  ? dev_get_nest_level+0xe1/0x170
[  140.483505]  register_vlan_dev+0x29b/0x710 [8021q]
[  140.483505]  __rtnl_newlink+0xb75/0x1180
[  ... ]

[  168.446539] WARNING: can't dereference registers at 00000000bef3d701 for ip apic_timer_interrupt+0xf/0x20
[  168.466843] ==================================================================
[  168.469452] BUG: KASAN: slab-out-of-bounds in __unwind_start+0x71/0x850
[  168.480707] Write of size 88 at addr ffff8880b8856d38 by task ip/1758
[  168.480707]
[  168.480707] CPU: 1 PID: 1758 Comm: ip Not tainted 5.3.0-rc7+ #322
[  ... ]
[  168.794493] Rebooting in 5 seconds..


Signed-off-by: Taehee Yoo <ap420073@gmail.com>
---
 include/linux/netdevice.h |   4 ++
 net/core/dev.c            | 106 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 110 insertions(+)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 88292953aa6f..5bb5756129af 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1624,6 +1624,8 @@ enum netdev_priv_flags {
  *	@type:		Interface hardware type
  *	@hard_header_len: Maximum hardware header length.
  *	@min_header_len:  Minimum hardware header length
+ *	@upper_level:	Maximum depth level of upper devices.
+ *	@lower_level:	Maximum depth level of lower devices.
  *
  *	@needed_headroom: Extra headroom the hardware may need, but not in all
  *			  cases can this be guaranteed
@@ -1854,6 +1856,8 @@ struct net_device {
 	unsigned short		type;
 	unsigned short		hard_header_len;
 	unsigned char		min_header_len;
+	unsigned char		upper_level;
+	unsigned char		lower_level;
 
 	unsigned short		needed_headroom;
 	unsigned short		needed_tailroom;
diff --git a/net/core/dev.c b/net/core/dev.c
index 0891f499c1bb..6a4b4ce62204 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -146,6 +146,7 @@
 #include "net-sysfs.h"
 
 #define MAX_GRO_SKBS 8
+#define MAX_NEST_DEV 8
 
 /* This should be increased if a protocol with a bigger head is added. */
 #define GRO_MAX_HEAD (MAX_HEADER + 128)
@@ -6602,6 +6603,21 @@ struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
 }
 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
 
+static struct net_device *netdev_next_upper_dev(struct net_device *dev,
+						struct list_head **iter)
+{
+	struct netdev_adjacent *upper;
+
+	upper = list_entry((*iter)->next, struct netdev_adjacent, list);
+
+	if (&upper->list == &dev->adj_list.upper)
+		return NULL;
+
+	*iter = &upper->list;
+
+	return upper->dev;
+}
+
 static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
 						    struct list_head **iter)
 {
@@ -6619,6 +6635,33 @@ static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
 	return upper->dev;
 }
 
+int netdev_walk_all_upper_dev(struct net_device *dev,
+			      int (*fn)(struct net_device *dev,
+					void *data),
+			      void *data)
+{
+	struct net_device *udev;
+	struct list_head *iter;
+	int ret;
+
+	for (iter = &dev->adj_list.upper,
+	     udev = netdev_next_upper_dev(dev, &iter);
+	     udev;
+	     udev = netdev_next_upper_dev(dev, &iter)) {
+		/* first is the upper device itself */
+		ret = fn(udev, data);
+		if (ret)
+			return ret;
+
+		/* then look at all of its upper devices */
+		ret = netdev_walk_all_upper_dev(udev, fn, data);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
 int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
 				  int (*fn)(struct net_device *dev,
 					    void *data),
@@ -6785,6 +6828,52 @@ static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
 	return lower->dev;
 }
 
+static u8 __netdev_upper_depth(struct net_device *dev)
+{
+	struct net_device *udev;
+	struct list_head *iter;
+	u8 max_depth = 0;
+
+	for (iter = &dev->adj_list.upper,
+	     udev = netdev_next_upper_dev(dev, &iter);
+	     udev;
+	     udev = netdev_next_upper_dev(dev, &iter)) {
+		if (max_depth < udev->upper_level)
+			max_depth = udev->upper_level;
+	}
+
+	return max_depth;
+}
+
+static u8 __netdev_lower_depth(struct net_device *dev)
+{
+	struct net_device *ldev;
+	struct list_head *iter;
+	u8 max_depth = 0;
+
+	for (iter = &dev->adj_list.lower,
+	     ldev = netdev_next_lower_dev(dev, &iter);
+	     ldev;
+	     ldev = netdev_next_lower_dev(dev, &iter)) {
+		if (max_depth < ldev->lower_level)
+			max_depth = ldev->lower_level;
+	}
+
+	return max_depth;
+}
+
+static int __netdev_update_upper_level(struct net_device *dev, void *data)
+{
+	dev->upper_level = __netdev_upper_depth(dev) + 1;
+	return 0;
+}
+
+static int __netdev_update_lower_level(struct net_device *dev, void *data)
+{
+	dev->lower_level = __netdev_lower_depth(dev) + 1;
+	return 0;
+}
+
 int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
 				  int (*fn)(struct net_device *dev,
 					    void *data),
@@ -7063,6 +7152,9 @@ static int __netdev_upper_dev_link(struct net_device *dev,
 	if (netdev_has_upper_dev(upper_dev, dev))
 		return -EBUSY;
 
+	if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
+		return -EMLINK;
+
 	if (!master) {
 		if (netdev_has_upper_dev(dev, upper_dev))
 			return -EEXIST;
@@ -7089,6 +7181,12 @@ static int __netdev_upper_dev_link(struct net_device *dev,
 	if (ret)
 		goto rollback;
 
+	__netdev_update_upper_level(dev, NULL);
+	netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
+
+	__netdev_update_lower_level(upper_dev, NULL);
+	netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, NULL);
+
 	return 0;
 
 rollback:
@@ -7171,6 +7269,12 @@ void netdev_upper_dev_unlink(struct net_device *dev,
 
 	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
 				      &changeupper_info.info);
+
+	__netdev_update_upper_level(dev, NULL);
+	netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
+
+	__netdev_update_lower_level(upper_dev, NULL);
+	netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, NULL);
 }
 EXPORT_SYMBOL(netdev_upper_dev_unlink);
 
@@ -9157,6 +9261,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 
 	dev->gso_max_size = GSO_MAX_SIZE;
 	dev->gso_max_segs = GSO_MAX_SEGS;
+	dev->upper_level = 1;
+	dev->lower_level = 1;
 
 	INIT_LIST_HEAD(&dev->napi_list);
 	INIT_LIST_HEAD(&dev->unreg_list);
-- 
2.17.1


^ permalink raw reply related

* [PATCH net 00/11] net: fix nested device bugs
From: Taehee Yoo @ 2019-09-04 18:38 UTC (permalink / raw)
  To: davem, netdev, j.vosburgh, vfalico, andy, jiri, sd, roopa, saeedm,
	manishc, rahulv, kys, haiyangz, sthemmin, sashal, hare, varun,
	ubraun, kgraul
  Cc: ap420073

This patchset fixes several bugs that are related to nesting
device infrastructure.
Current nesting infrastructure code doesn't limit the depth level of
devices. nested devices could be handled recursively. at that moment,
it needs huge memory and stack overflow could occur.
Below devices type have same bug.
VLAN, BONDING, TEAM, MACSEC, MACVLAN and VXLAN.

Test commands:
    ip link add dummy0 type dummy
    ip link add vlan1 link dummy0 type vlan id 1

    for i in {2..100}
    do
	    let A=$i-1
	    ip link add name vlan$i link vlan$A type vlan id $i
    done
    ip link del dummy0

1st patch actually fixes the root cause.
It adds new common variables {upper/lower}_level that represent
depth level. upper_level variable is depth of upper devices.
lower_level variable is depth of lower devices.

      [U][L]       [U][L]
vlan1  1  5  vlan4  1  4
vlan2  2  4  vlan5  2  3
vlan3  3  3    |
  |            |
  +------------+
  |
vlan6  4  2
dummy0 5  1

After this patch, the nesting infrastructure code uses this variable to
check the depth level.

2, 4, 5, 6, 7 patches fix lockdep related problem.
Before this patch, devices use static lockdep map.
So, if devices that are same type is nested, lockdep will warn about
recursive situation.
These patches make these devices use dynamic lockdep key instead of
static lock or subclass.

3rd patch splits IFF_BONDING flag into IFF_BONDING and IFF_BONDING_SLAVE.
Before this patch, there is only IFF_BONDING flags, which means
a bonding master or a bonding slave device.
But this single flag could be problem when bonding devices are set to
nested.

8th patch fixes a refcnt leak in the macsec module.

9th patch adds ignore flag to an adjacent structure.
In order to exchange an adjacent node safely, ignore flag is needed.

10th patch makes vxlan add an adjacent link to limit depth level.

11th patch removes unnecessary variables and callback.

Taehee Yoo (11):
  net: core: limit nested device depth
  vlan: use dynamic lockdep key instead of subclass
  bonding: split IFF_BONDING into IFF_BONDING and IFF_BONDING_SLAVE
  bonding: use dynamic lockdep key instead of subclass
  team: use dynamic lockdep key instead of static key
  macsec: use dynamic lockdep key instead of subclass
  macvlan: use dynamic lockdep key instead of subclass
  macsec: fix refcnt leak in module exit routine
  net: core: add ignore flag to netdev_adjacent structure
  vxlan: add adjacent link to limit depth level
  net: remove unnecessary variables and callback

 drivers/net/bonding/bond_alb.c                |   2 +-
 drivers/net/bonding/bond_main.c               |  87 ++++--
 .../net/ethernet/mellanox/mlx5/core/en_tc.c   |   2 +-
 .../ethernet/qlogic/netxen/netxen_nic_main.c  |   2 +-
 drivers/net/hyperv/netvsc_drv.c               |   3 +-
 drivers/net/macsec.c                          |  50 ++--
 drivers/net/macvlan.c                         |  36 ++-
 drivers/net/team/team.c                       |  61 ++++-
 drivers/net/vxlan.c                           |  71 ++++-
 drivers/scsi/fcoe/fcoe.c                      |   2 +-
 drivers/target/iscsi/cxgbit/cxgbit_cm.c       |   2 +-
 include/linux/if_macvlan.h                    |   3 +-
 include/linux/if_team.h                       |   5 +
 include/linux/if_vlan.h                       |  13 +-
 include/linux/netdevice.h                     |  29 +-
 include/net/bonding.h                         |   4 +-
 include/net/vxlan.h                           |   1 +
 net/8021q/vlan.c                              |   1 -
 net/8021q/vlan_dev.c                          |  32 +--
 net/core/dev.c                                | 252 ++++++++++++++++--
 net/core/dev_addr_lists.c                     |  12 +-
 net/smc/smc_core.c                            |   2 +-
 net/smc/smc_pnet.c                            |   2 +-
 23 files changed, 519 insertions(+), 155 deletions(-)

-- 
2.17.1


^ permalink raw reply

* Re: [PATCH][next] Bluetooth: mgmt: Use struct_size() helper
From: Marcel Holtmann @ 2019-09-04 18:08 UTC (permalink / raw)
  To: Gustavo A. R. Silva
  Cc: Johan Hedberg, David S. Miller, linux-bluetooth, netdev,
	linux-kernel
In-Reply-To: <20190830011211.GA26531@embeddedor>

Hi Gustavo,

> One of the more common cases of allocation size calculations is finding
> the size of a structure that has a zero-sized array at the end, along
> with memory for some number of elements for that array. For example:
> 
> struct mgmt_rp_get_connections {
> 	...
>        struct mgmt_addr_info addr[0];
> } __packed;
> 
> Make use of the struct_size() helper instead of an open-coded version
> in order to avoid any potential type mistakes.
> 
> So, replace the following form:
> 
> sizeof(*rp) + (i * sizeof(struct mgmt_addr_info));
> 
> with:
> 
> struct_size(rp, addr, i)
> 
> Also, notice that, in this case, variable rp_len is not necessary,
> hence it is removed.
> 
> This code was detected with the help of Coccinelle.
> 
> Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
> ---
> net/bluetooth/mgmt.c | 8 ++------
> 1 file changed, 2 insertions(+), 6 deletions(-)

patch has been applied to bluetooth-next tree.

Regards

Marcel


^ permalink raw reply

* [PATCH v1] pppoatm: use %*ph to print small buffer
From: Andy Shevchenko @ 2019-09-04 17:44 UTC (permalink / raw)
  To: Mitchell Blank Jr, David S. Miller, netdev; +Cc: Andy Shevchenko

Use %*ph format to print small buffer as hex string.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 net/atm/pppoatm.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/net/atm/pppoatm.c b/net/atm/pppoatm.c
index bd3da9af5ef6..45d8e1d5d033 100644
--- a/net/atm/pppoatm.c
+++ b/net/atm/pppoatm.c
@@ -216,9 +216,7 @@ static void pppoatm_push(struct atm_vcc *atmvcc, struct sk_buff *skb)
 			pvcc->chan.mtu += LLC_LEN;
 			break;
 		}
-		pr_debug("Couldn't autodetect yet (skb: %02X %02X %02X %02X %02X %02X)\n",
-			 skb->data[0], skb->data[1], skb->data[2],
-			 skb->data[3], skb->data[4], skb->data[5]);
+		pr_debug("Couldn't autodetect yet (skb: %6ph)\n", skb->data);
 		goto error;
 	case e_vc:
 		break;
-- 
2.23.0.rc1


^ permalink raw reply related

* RE: [PATCH V2 4/4] crypto: Add Xilinx AES driver
From: Kalyani Akula @ 2019-09-04 17:40 UTC (permalink / raw)
  To: Corentin Labbe
  Cc: herbert@gondor.apana.org.au, kstewart@linuxfoundation.org,
	gregkh@linuxfoundation.org, tglx@linutronix.de,
	pombredanne@nexb.com, linux-crypto@vger.kernel.org,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	Sarat Chand Savitala
In-Reply-To: <20190902065854.GA28750@Red>

Hi Corentin,

Thanks for the review comments.
Please find my response/queries inline.

> -----Original Message-----
> From: Corentin Labbe <clabbe.montjoie@gmail.com>
> Sent: Monday, September 2, 2019 12:29 PM
> To: Kalyani Akula <kalyania@xilinx.com>
> Cc: herbert@gondor.apana.org.au; kstewart@linuxfoundation.org;
> gregkh@linuxfoundation.org; tglx@linutronix.de; pombredanne@nexb.com;
> linux-crypto@vger.kernel.org; linux-kernel@vger.kernel.org;
> netdev@vger.kernel.org; Kalyani Akula <kalyania@xilinx.com>
> Subject: Re: [PATCH V2 4/4] crypto: Add Xilinx AES driver
> 
> On Sun, Sep 01, 2019 at 07:24:58PM +0530, Kalyani Akula wrote:
> > This patch adds AES driver support for the Xilinx ZynqMP SoC.
> >
> > Signed-off-by: Kalyani Akula <kalyani.akula@xilinx.com>
> > ---
> 
> Hello
> 
> I have some comment below
> 
> >  drivers/crypto/Kconfig          |  11 ++
> >  drivers/crypto/Makefile         |   1 +
> >  drivers/crypto/zynqmp-aes-gcm.c | 297
> > ++++++++++++++++++++++++++++++++++++++++
> >  3 files changed, 309 insertions(+)
> >  create mode 100644 drivers/crypto/zynqmp-aes-gcm.c
> >
> > diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig index
> > 603413f..a0d058a 100644
> > --- a/drivers/crypto/Kconfig
> > +++ b/drivers/crypto/Kconfig
> > @@ -677,6 +677,17 @@ config CRYPTO_DEV_ROCKCHIP
> >  	  This driver interfaces with the hardware crypto accelerator.
> >  	  Supporting cbc/ecb chainmode, and aes/des/des3_ede cipher mode.
> >
> > +config CRYPTO_DEV_ZYNQMP_AES
> > +	tristate "Support for Xilinx ZynqMP AES hw accelerator"
> > +	depends on ARCH_ZYNQMP || COMPILE_TEST
> > +	select CRYPTO_AES
> > +	select CRYPTO_SKCIPHER
> > +	help
> > +	  Xilinx ZynqMP has AES-GCM engine used for symmetric key
> > +	  encryption and decryption. This driver interfaces with AES hw
> > +	  accelerator. Select this if you want to use the ZynqMP module
> > +	  for AES algorithms.
> > +
> >  config CRYPTO_DEV_MEDIATEK
> >  	tristate "MediaTek's EIP97 Cryptographic Engine driver"
> >  	depends on (ARM && ARCH_MEDIATEK) || COMPILE_TEST diff --git
> > a/drivers/crypto/Makefile b/drivers/crypto/Makefile index
> > afc4753..c99663a 100644
> > --- a/drivers/crypto/Makefile
> > +++ b/drivers/crypto/Makefile
> > @@ -48,3 +48,4 @@ obj-$(CONFIG_CRYPTO_DEV_BCM_SPU) += bcm/
> >  obj-$(CONFIG_CRYPTO_DEV_SAFEXCEL) += inside-secure/
> >  obj-$(CONFIG_CRYPTO_DEV_ARTPEC6) += axis/  obj-y += hisilicon/
> > +obj-$(CONFIG_CRYPTO_DEV_ZYNQMP_AES) += zynqmp-aes-gcm.o
> > diff --git a/drivers/crypto/zynqmp-aes-gcm.c
> > b/drivers/crypto/zynqmp-aes-gcm.c new file mode 100644 index
> > 0000000..d65f038
> > --- /dev/null
> > +++ b/drivers/crypto/zynqmp-aes-gcm.c
> > @@ -0,0 +1,297 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * Xilinx ZynqMP AES Driver.
> > + * Copyright (c) 2019 Xilinx Inc.
> > + */
> > +
> > +#include <crypto/aes.h>
> > +#include <crypto/scatterwalk.h>
> > +#include <linux/kernel.h>
> > +#include <linux/module.h>
> > +#include <linux/of_device.h>
> > +#include <linux/scatterlist.h>
> > +#include <linux/firmware/xlnx-zynqmp.h>
> > +
> > +#define ZYNQMP_AES_IV_SIZE			12
> > +#define ZYNQMP_AES_GCM_SIZE			16
> > +#define ZYNQMP_AES_KEY_SIZE			32
> > +
> > +#define ZYNQMP_AES_DECRYPT			0
> > +#define ZYNQMP_AES_ENCRYPT			1
> > +
> > +#define ZYNQMP_AES_KUP_KEY			0
> > +#define ZYNQMP_AES_DEVICE_KEY			1
> > +#define ZYNQMP_AES_PUF_KEY			2
> > +
> > +#define ZYNQMP_AES_GCM_TAG_MISMATCH_ERR		0x01
> > +#define ZYNQMP_AES_SIZE_ERR			0x06
> > +#define ZYNQMP_AES_WRONG_KEY_SRC_ERR		0x13
> > +#define ZYNQMP_AES_PUF_NOT_PROGRAMMED		0xE300
> > +
> > +#define ZYNQMP_AES_BLOCKSIZE			0x04
> > +
> > +static const struct zynqmp_eemi_ops *eemi_ops; struct zynqmp_aes_dev
> > +*aes_dd;
> 
> I still think that using a global variable for storing device driver data is bad.

I think storing the list of dd's would solve up the issue with global variable, but there is only one AES instance here.
Please suggest

> 
> > +
> > +struct zynqmp_aes_dev {
> > +	struct device *dev;
> > +};
> > +
> > +struct zynqmp_aes_op {
> > +	struct zynqmp_aes_dev *dd;
> > +	void *src;
> > +	void *dst;
> > +	int len;
> > +	u8 key[ZYNQMP_AES_KEY_SIZE];
> > +	u8 *iv;
> > +	u32 keylen;
> > +	u32 keytype;
> > +};
> > +
> > +struct zynqmp_aes_data {
> > +	u64 src;
> > +	u64 iv;
> > +	u64 key;
> > +	u64 dst;
> > +	u64 size;
> > +	u64 optype;
> > +	u64 keysrc;
> > +};
> > +
> > +static int zynqmp_setkey_blk(struct crypto_tfm *tfm, const u8 *key,
> > +			     unsigned int len)
> > +{
> > +	struct zynqmp_aes_op *op = crypto_tfm_ctx(tfm);
> > +
> > +	if (((len != 1) && (len !=  ZYNQMP_AES_KEY_SIZE)) || (!key))
> 
> typo, two space

Will fix in the next version

> 
> > +		return -EINVAL;
> > +
> > +	if (len == 1) {
> > +		op->keytype = *key;
> > +
> > +		if ((op->keytype < ZYNQMP_AES_KUP_KEY) ||
> > +			(op->keytype > ZYNQMP_AES_PUF_KEY))
> > +			return -EINVAL;
> > +
> > +	} else if (len == ZYNQMP_AES_KEY_SIZE) {
> > +		op->keytype = ZYNQMP_AES_KUP_KEY;
> > +		op->keylen = len;
> > +		memcpy(op->key, key, len);
> > +	}
> > +
> > +	return 0;
> > +}
> 
> It seems your driver does not support AES keysize of 128/196, you need to
> fallback in that case.

[Kalyani] In case of 128/196 keysize, returning the error would suffice ?
Or still algorithm need to work ?
If error is enough, it is taken care by this condition 
if (((len != 1) && (len !=  ZYNQMP_AES_KEY_SIZE)) || (!key))


> You need to comment the keylen=1 usecase and use a define for this value.
> 

Will fix in next version

> > +
> > +static int zynqmp_aes_xcrypt(struct blkcipher_desc *desc,
> > +			     struct scatterlist *dst,
> > +			     struct scatterlist *src,
> > +			     unsigned int nbytes,
> > +			     unsigned int flags)
> > +{
> > +	struct zynqmp_aes_op *op = crypto_blkcipher_ctx(desc->tfm);
> > +	struct zynqmp_aes_dev *dd = aes_dd;
> > +	int err, ret, copy_bytes, src_data = 0, dst_data = 0;
> > +	dma_addr_t dma_addr, dma_addr_buf;
> > +	struct zynqmp_aes_data *abuf;
> > +	struct blkcipher_walk walk;
> > +	unsigned int data_size;
> > +	size_t dma_size;
> > +	char *kbuf;
> > +
> > +	if (!eemi_ops->aes)
> > +		return -ENOTSUPP;
> > +
> > +	if (op->keytype == ZYNQMP_AES_KUP_KEY)
> > +		dma_size = nbytes + ZYNQMP_AES_KEY_SIZE
> > +			+ ZYNQMP_AES_IV_SIZE;
> > +	else
> > +		dma_size = nbytes + ZYNQMP_AES_IV_SIZE;
> > +
> > +	kbuf = dma_alloc_coherent(dd->dev, dma_size, &dma_addr,
> GFP_KERNEL);
> > +	if (!kbuf)
> > +		return -ENOMEM;
> > +
> > +	abuf = dma_alloc_coherent(dd->dev, sizeof(struct zynqmp_aes_data),
> > +				  &dma_addr_buf, GFP_KERNEL);
> > +	if (!abuf) {
> > +		dma_free_coherent(dd->dev, dma_size, kbuf, dma_addr);
> > +		return -ENOMEM;
> > +	}
> > +
> > +	data_size = nbytes;
> > +	blkcipher_walk_init(&walk, dst, src, data_size);
> > +	err = blkcipher_walk_virt(desc, &walk);
> > +	op->iv = walk.iv;
> > +
> > +	while ((nbytes = walk.nbytes)) {
> > +		op->src = walk.src.virt.addr;
> > +		memcpy(kbuf + src_data, op->src, nbytes);
> > +		src_data = src_data + nbytes;
> > +		nbytes &= (ZYNQMP_AES_BLOCKSIZE - 1);
> > +		err = blkcipher_walk_done(desc, &walk, nbytes);
> > +	}
> > +	memcpy(kbuf + data_size, op->iv, ZYNQMP_AES_IV_SIZE);
> > +	abuf->src = dma_addr;
> > +	abuf->dst = dma_addr;
> > +	abuf->iv = abuf->src + data_size;
> > +	abuf->size = data_size - ZYNQMP_AES_GCM_SIZE;
> > +	abuf->optype = flags;
> > +	abuf->keysrc = op->keytype;
> > +
> > +	if (op->keytype == ZYNQMP_AES_KUP_KEY) {
> > +		memcpy(kbuf + data_size + ZYNQMP_AES_IV_SIZE,
> > +		       op->key, ZYNQMP_AES_KEY_SIZE);
> > +
> > +		abuf->key = abuf->src + data_size + ZYNQMP_AES_IV_SIZE;
> > +	} else {
> > +		abuf->key = 0;
> > +	}
> > +	eemi_ops->aes(dma_addr_buf, &ret);
> > +
> > +	if (ret != 0) {
> > +		switch (ret) {
> > +		case ZYNQMP_AES_GCM_TAG_MISMATCH_ERR:
> > +			dev_err(dd->dev, "ERROR: Gcm Tag mismatch\n\r");
> > +			break;
> > +		case ZYNQMP_AES_SIZE_ERR:
> > +			dev_err(dd->dev, "ERROR : Non word aligned
> data\n\r");
> > +			break;
> > +		case ZYNQMP_AES_WRONG_KEY_SRC_ERR:
> > +			dev_err(dd->dev, "ERROR: Wrong KeySrc, enable secure
> mode\n\r");
> > +			break;
> > +		case ZYNQMP_AES_PUF_NOT_PROGRAMMED:
> > +			dev_err(dd->dev, "ERROR: PUF is not registered\r\n");
> > +			break;
> > +		default:
> > +			dev_err(dd->dev, "ERROR: Invalid");
> > +			break;
> > +		}
> > +		goto END;
> > +	}
> > +	if (flags)
> > +		copy_bytes = data_size;
> > +	else
> > +		copy_bytes = data_size - ZYNQMP_AES_GCM_SIZE;
> > +
> > +	blkcipher_walk_init(&walk, dst, src, copy_bytes);
> > +	err = blkcipher_walk_virt(desc, &walk);
> > +
> > +	while ((nbytes = walk.nbytes)) {
> > +		memcpy(walk.dst.virt.addr, kbuf + dst_data, nbytes);
> > +		dst_data = dst_data + nbytes;
> > +		nbytes &= (ZYNQMP_AES_BLOCKSIZE - 1);
> > +		err = blkcipher_walk_done(desc, &walk, nbytes);
> > +	}
> > +END:
> > +	memset(kbuf, 0, dma_size);
> > +	memset(abuf, 0, sizeof(struct zynqmp_aes_data));
> > +	dma_free_coherent(dd->dev, dma_size, kbuf, dma_addr);
> > +	dma_free_coherent(dd->dev, sizeof(struct zynqmp_aes_data),
> > +			  abuf, dma_addr_buf);
> > +	return err;
> > +}
> > +
> > +static int zynqmp_aes_decrypt(struct blkcipher_desc *desc,
> > +			      struct scatterlist *dst,
> > +			      struct scatterlist *src,
> > +			      unsigned int nbytes)
> > +{
> > +	return zynqmp_aes_xcrypt(desc, dst, src, nbytes,
> > +ZYNQMP_AES_DECRYPT); }
> > +
> > +static int zynqmp_aes_encrypt(struct blkcipher_desc *desc,
> > +			      struct scatterlist *dst,
> > +			      struct scatterlist *src,
> > +			      unsigned int nbytes)
> > +{
> > +	return zynqmp_aes_xcrypt(desc, dst, src, nbytes,
> > +ZYNQMP_AES_ENCRYPT); }
> > +
> > +static struct crypto_alg zynqmp_alg = {
> > +	.cra_name		=	"xilinx-zynqmp-aes",
> > +	.cra_driver_name	=	"zynqmp-aes-gcm",
> > +	.cra_priority		=	400,
> > +	.cra_flags		=	CRYPTO_ALG_TYPE_BLKCIPHER |
> > +					CRYPTO_ALG_KERN_DRIVER_ONLY,
> > +	.cra_blocksize		=	ZYNQMP_AES_BLOCKSIZE,
> > +	.cra_ctxsize		=	sizeof(struct zynqmp_aes_op),
> > +	.cra_alignmask		=	15,
> > +	.cra_type		=	&crypto_blkcipher_type,
> > +	.cra_module		=	THIS_MODULE,
> > +	.cra_u			=	{
> > +	.blkcipher	=	{
> > +			.min_keysize	=	0,
> 
> Are you sure to accept this a keysize of 0 ?
> 

Will correct in next version 

Regards
kalyani
> Regards

^ permalink raw reply

* [PATCH iproute2] devlink: fix segfault on health command
From: Andrea Claudi @ 2019-09-04 17:26 UTC (permalink / raw)
  To: netdev; +Cc: stephen, dsahern

devlink segfaults when using grace_period without reporter

$ devlink health set pci/0000:00:09.0 grace_period 3500
Segmentation fault

devlink is instead supposed to gracefully fail printing a warning
message

$ devlink health set pci/0000:00:09.0 grace_period 3500
Reporter's name is expected.

This happens because DL_OPT_HEALTH_REPORTER_NAME and
DL_OPT_HEALTH_REPORTER_GRACEFUL_PERIOD are both defined as BIT(27).
When dl_opts_put() parse options and grace_period is set, it erroneously
tries to set reporter name to null.

This is fixed simply shifting by 1 bit enumeration starting with
DL_OPT_HEALTH_REPORTER_GRACEFUL_PERIOD.

Fixes: b18d89195b16 ("devlink: Add devlink health set command")
Signed-off-by: Andrea Claudi <aclaudi@redhat.com>
---
 devlink/devlink.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/devlink/devlink.c b/devlink/devlink.c
index 91c85dc1de730..0293373928f50 100644
--- a/devlink/devlink.c
+++ b/devlink/devlink.c
@@ -231,8 +231,8 @@ static void ifname_map_free(struct ifname_map *ifname_map)
 #define DL_OPT_FLASH_FILE_NAME	BIT(25)
 #define DL_OPT_FLASH_COMPONENT	BIT(26)
 #define DL_OPT_HEALTH_REPORTER_NAME	BIT(27)
-#define DL_OPT_HEALTH_REPORTER_GRACEFUL_PERIOD	BIT(27)
-#define DL_OPT_HEALTH_REPORTER_AUTO_RECOVER	BIT(28)
+#define DL_OPT_HEALTH_REPORTER_GRACEFUL_PERIOD	BIT(28)
+#define DL_OPT_HEALTH_REPORTER_AUTO_RECOVER	BIT(29)
 
 struct dl_opts {
 	uint32_t present; /* flags of present items */
-- 
2.21.0


^ permalink raw reply related

* Re: [PATCH v2 2/2] net: phy: gmii2rgmii: Dont use priv field in phy device
From: Florian Fainelli @ 2019-09-04 17:11 UTC (permalink / raw)
  To: Harini Katakam, andrew, hkallweit1, davem
  Cc: michal.simek, netdev, linux-arm-kernel, linux-kernel,
	harinikatakamlinux, radhey.shyam.pandey
In-Reply-To: <1567605621-6818-3-git-send-email-harini.katakam@xilinx.com>

On 9/4/19 7:00 AM, Harini Katakam wrote:
> Use set/get drv data in phydev's mdio device instead. Phy device priv
> field maybe used by the external phy driver and should not be
> overwritten.
> 
> Signed-off-by: Harini Katakam <harini.katakam@xilinx.com>

Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
-- 
Florian

^ permalink raw reply

* Re: [PATCH v2 1/2] include: mdio: Add driver data helpers
From: Florian Fainelli @ 2019-09-04 17:07 UTC (permalink / raw)
  To: Harini Katakam, andrew, hkallweit1, davem
  Cc: michal.simek, netdev, linux-arm-kernel, linux-kernel,
	harinikatakamlinux, radhey.shyam.pandey
In-Reply-To: <1567605621-6818-2-git-send-email-harini.katakam@xilinx.com>

On 9/4/19 7:00 AM, Harini Katakam wrote:
> Add set/get drv_data helpers for mdio device.
> 
> Signed-off-by: Harini Katakam <harini.katakam@xilinx.com>

Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
-- 
Florian

^ permalink raw reply

* Re: [PATCH v2 2/2] net: phy: gmii2rgmii: Dont use priv field in phy device
From: Andrew Lunn @ 2019-09-04 16:47 UTC (permalink / raw)
  To: Harini Katakam
  Cc: f.fainelli, hkallweit1, davem, michal.simek, netdev,
	linux-arm-kernel, linux-kernel, harinikatakamlinux,
	radhey.shyam.pandey
In-Reply-To: <1567605621-6818-3-git-send-email-harini.katakam@xilinx.com>

On Wed, Sep 04, 2019 at 07:30:21PM +0530, Harini Katakam wrote:
> Use set/get drv data in phydev's mdio device instead. Phy device priv
> field maybe used by the external phy driver and should not be
> overwritten.
> 
> Signed-off-by: Harini Katakam <harini.katakam@xilinx.com>

Reviewed-by: Andrew Lunn <andrew@lunn.ch>

    Andrew

^ permalink raw reply

* Re: [PATCH v2 1/2] include: mdio: Add driver data helpers
From: Andrew Lunn @ 2019-09-04 16:46 UTC (permalink / raw)
  To: Harini Katakam
  Cc: f.fainelli, hkallweit1, davem, michal.simek, netdev,
	linux-arm-kernel, linux-kernel, harinikatakamlinux,
	radhey.shyam.pandey
In-Reply-To: <1567605621-6818-2-git-send-email-harini.katakam@xilinx.com>

On Wed, Sep 04, 2019 at 07:30:20PM +0530, Harini Katakam wrote:
> Add set/get drv_data helpers for mdio device.
> 
> Signed-off-by: Harini Katakam <harini.katakam@xilinx.com>

Reviewed-by: Andrew Lunn <andrew@lunn.ch>

    Andrew

^ permalink raw reply

* Re: [PATCH bpf-next v3 3/4] xsk: use state member for socket synchronization
From: Jonathan Lemon @ 2019-09-04 16:40 UTC (permalink / raw)
  To: Björn Töpel
  Cc: ast, daniel, netdev, Björn Töpel, magnus.karlsson,
	magnus.karlsson, bpf, syzbot+c82697e3043781e08802, hdanton,
	i.maximets
In-Reply-To: <20190904114913.17217-4-bjorn.topel@gmail.com>



On 4 Sep 2019, at 4:49, Björn Töpel wrote:

> From: Björn Töpel <bjorn.topel@intel.com>
>
> Prior the state variable was introduced by Ilya, the dev member was
> used to determine whether the socket was bound or not. However, when
> dev was read, proper SMP barriers and READ_ONCE were missing. In order
> to address the missing barriers and READ_ONCE, we start using the
> state variable as a point of synchronization. The state member
> read/write is paired with proper SMP barriers, and from this follows
> that the members described above does not need READ_ONCE if used in
> conjunction with state check.
>
> In all syscalls and the xsk_rcv path we check if state is
> XSK_BOUND. If that is the case we do a SMP read barrier, and this
> implies that the dev, umem and all rings are correctly setup. Note
> that no READ_ONCE are needed for these variable if used when state is
> XSK_BOUND (plus the read barrier).
>
> To summarize: The members struct xdp_sock members dev, queue_id, umem,
> fq, cq, tx, rx, and state were read lock-less, with incorrect barriers
> and missing {READ, WRITE}_ONCE. Now, umem, fq, cq, tx, rx, and state
> are read lock-less. When these members are updated, WRITE_ONCE is
> used. When read, READ_ONCE are only used when read outside the control
> mutex (e.g. mmap) or, not synchronized with the state member
> (XSK_BOUND plus smp_rmb())
>
> Note that dev and queue_id do not need a WRITE_ONCE or READ_ONCE, due
> to the introduce state synchronization (XSK_BOUND plus smp_rmb()).
>
> Introducing the state check also fixes a race, found by syzcaller, in
> xsk_poll() where umem could be accessed when stale.
>
> Suggested-by: Hillf Danton <hdanton@sina.com>
> Reported-by: syzbot+c82697e3043781e08802@syzkaller.appspotmail.com
> Fixes: 77cd0d7b3f25 ("xsk: add support for need_wakeup flag in AF_XDP 
> rings")
> Signed-off-by: Björn Töpel <bjorn.topel@intel.com>

Acked-by: Jonathan Lemon <jonathan.lemon@gmail.com>

^ permalink raw reply

* Re: rtnl_lock() question
From: Jonathan Lemon @ 2019-09-04 16:38 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Netdev, Saeed Mahameed
In-Reply-To: <3164f8de-de20-44f7-03fb-8bc39ca8449e@gmail.com>

On 4 Sep 2019, at 0:39, Eric Dumazet wrote:

> On 9/3/19 11:55 PM, Jonathan Lemon wrote:
>> How appropriate is it to hold the rtnl_lock() across a sleepable
>> memory allocation?  On one hand it's just a mutex, but it would
>> seem like it could block quite a few things.
>>
>
> Sure, all GFP_KERNEL allocations can sleep for quite a while.
>
> On the other hand, we may want to delay stuff if memory is under 
> pressure,
> or complex operations like NEWLINK would fail.
>
> RTNL is mostly taken for control path operations, we prefer them to be
> mostly reliable, otherwise admins job would be a nightmare.
>
> In some cases, it is relatively easy to pre-allocate memory before 
> rtnl is taken,
> but that will only take care of some selected paths.

The particular code path that I'm looking at is mlx5e_tx_timeout_work().

This is called on TX timeout, and mlx5 wants to move an entire channel
and all the supporting structures elsewhere.  Under the rtnl_lock(), it
calls kvzmalloc() in order to grab a large chunk of contig memory, which
ends up stalling the system.

I suspect these large allocation should really be done outside the lock.
-- 
Jonathan

^ permalink raw reply

* Re: [PATCH iproute2-next] bpf: fix snprintf truncation warning
From: Andrea Claudi @ 2019-09-04 16:28 UTC (permalink / raw)
  To: linux-netdev; +Cc: Stephen Hemminger, David Ahern
In-Reply-To: <12a9cb8d91e41a08466141d4bb8ee659487d01df.1567611976.git.aclaudi@redhat.com>

On Wed, Sep 4, 2019 at 5:50 PM Andrea Claudi <aclaudi@redhat.com> wrote:
>
> gcc v9.2.1 produces the following warning compiling iproute2:
>
> bpf.c: In function ‘bpf_get_work_dir’:
> bpf.c:784:49: warning: ‘snprintf’ output may be truncated before the last format character [-Wformat-truncation=]
>   784 |  snprintf(bpf_wrk_dir, sizeof(bpf_wrk_dir), "%s/", mnt);
>       |                                                 ^
> bpf.c:784:2: note: ‘snprintf’ output between 2 and 4097 bytes into a destination of size 4096
>   784 |  snprintf(bpf_wrk_dir, sizeof(bpf_wrk_dir), "%s/", mnt);
>       |  ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>
> Fix it extending bpf_wrk_dir size by 1 byte for the extra "/" char.
>
> Signed-off-by: Andrea Claudi <aclaudi@redhat.com>
> ---
>  lib/bpf.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/lib/bpf.c b/lib/bpf.c
> index 7d2a322ffbaec..95de7894a93ce 100644
> --- a/lib/bpf.c
> +++ b/lib/bpf.c
> @@ -742,7 +742,7 @@ static int bpf_gen_hierarchy(const char *base)
>  static const char *bpf_get_work_dir(enum bpf_prog_type type)
>  {
>         static char bpf_tmp[PATH_MAX] = BPF_DIR_MNT;
> -       static char bpf_wrk_dir[PATH_MAX];
> +       static char bpf_wrk_dir[PATH_MAX + 1];
>         static const char *mnt;
>         static bool bpf_mnt_cached;
>         const char *mnt_env = getenv(BPF_ENV_MNT);
> --
> 2.21.0
>

Sorry, I forgot to add:

Fixes: e42256699cac ("bpf: make tc's bpf loader generic and move into lib")

^ permalink raw reply

* [PATCH bpf-next 6/6] selftests/bpf: test_progs: convert test_tcp_rtt
From: Stanislav Fomichev @ 2019-09-04 16:25 UTC (permalink / raw)
  To: netdev, bpf; +Cc: davem, ast, daniel, Stanislav Fomichev
In-Reply-To: <20190904162509.199561-1-sdf@google.com>

Move the files, adjust includes, remove entry from Makefile & .gitignore

Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 tools/testing/selftests/bpf/.gitignore        |  1 -
 tools/testing/selftests/bpf/Makefile          |  3 +-
 .../{test_tcp_rtt.c => prog_tests/tcp_rtt.c}  | 83 ++++++-------------
 3 files changed, 28 insertions(+), 59 deletions(-)
 rename tools/testing/selftests/bpf/{test_tcp_rtt.c => prog_tests/tcp_rtt.c} (76%)

diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index 5b06bb45b500..7470327edcfe 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -39,4 +39,3 @@ libbpf.so.*
 test_hashmap
 test_btf_dump
 xdping
-test_tcp_rtt
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index fe786df1174b..811f1b24d02b 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -28,7 +28,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test
 	test_sock test_btf test_sockmap get_cgroup_id_user test_socket_cookie \
 	test_cgroup_storage test_select_reuseport test_section_names \
 	test_netcnt test_tcpnotify_user test_sock_fields test_sysctl test_hashmap \
-	test_btf_dump test_cgroup_attach xdping test_tcp_rtt
+	test_btf_dump test_cgroup_attach xdping
 
 BPF_OBJ_FILES = $(patsubst %.c,%.o, $(notdir $(wildcard progs/*.c)))
 TEST_GEN_FILES = $(BPF_OBJ_FILES)
@@ -108,7 +108,6 @@ $(OUTPUT)/test_netcnt: cgroup_helpers.c
 $(OUTPUT)/test_sock_fields: cgroup_helpers.c
 $(OUTPUT)/test_sysctl: cgroup_helpers.c
 $(OUTPUT)/test_cgroup_attach: cgroup_helpers.c
-$(OUTPUT)/test_tcp_rtt: cgroup_helpers.c
 
 .PHONY: force
 
diff --git a/tools/testing/selftests/bpf/test_tcp_rtt.c b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c
similarity index 76%
rename from tools/testing/selftests/bpf/test_tcp_rtt.c
rename to tools/testing/selftests/bpf/prog_tests/tcp_rtt.c
index 93916a69823e..fdc0b3614a9e 100644
--- a/tools/testing/selftests/bpf/test_tcp_rtt.c
+++ b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c
@@ -1,24 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <error.h>
-#include <errno.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <sys/types.h>
-#include <sys/socket.h>
-#include <netinet/in.h>
-#include <netinet/tcp.h>
-#include <pthread.h>
-
-#include <linux/filter.h>
-#include <bpf/bpf.h>
-#include <bpf/libbpf.h>
-
-#include "bpf_rlimit.h"
-#include "bpf_util.h"
+#include <test_progs.h>
 #include "cgroup_helpers.h"
 
-#define CG_PATH                                "/tcp_rtt"
-
 struct tcp_rtt_storage {
 	__u32 invoked;
 	__u32 dsack_dups;
@@ -31,8 +14,8 @@ static void send_byte(int fd)
 {
 	char b = 0x55;
 
-	if (write(fd, &b, sizeof(b)) != 1)
-		error(1, errno, "Failed to send single byte");
+	if (CHECK_FAIL(write(fd, &b, sizeof(b)) != 1))
+		perror("Failed to send single byte");
 }
 
 static int wait_for_ack(int fd, int retries)
@@ -66,8 +49,10 @@ static int verify_sk(int map_fd, int client_fd, const char *msg, __u32 invoked,
 	int err = 0;
 	struct tcp_rtt_storage val;
 
-	if (bpf_map_lookup_elem(map_fd, &client_fd, &val) < 0)
-		error(1, errno, "Failed to read socket storage");
+	if (CHECK_FAIL(bpf_map_lookup_elem(map_fd, &client_fd, &val) < 0)) {
+		perror("Failed to read socket storage");
+		return -1;
+	}
 
 	if (val.invoked != invoked) {
 		log_err("%s: unexpected bpf_tcp_sock.invoked %d != %d",
@@ -225,61 +210,47 @@ static void *server_thread(void *arg)
 	int fd = *(int *)arg;
 	int client_fd;
 
-	if (listen(fd, 1) < 0)
-		error(1, errno, "Failed to listed on socket");
+	if (CHECK_FAIL(listen(fd, 1)) < 0) {
+		perror("Failed to listed on socket");
+		return NULL;
+	}
 
 	client_fd = accept(fd, (struct sockaddr *)&addr, &len);
-	if (client_fd < 0)
-		error(1, errno, "Failed to accept client");
+	if (CHECK_FAIL(client_fd < 0)) {
+		perror("Failed to accept client");
+		return NULL;
+	}
 
 	/* Wait for the next connection (that never arrives)
 	 * to keep this thread alive to prevent calling
 	 * close() on client_fd.
 	 */
-	if (accept(fd, (struct sockaddr *)&addr, &len) >= 0)
-		error(1, errno, "Unexpected success in second accept");
+	if (CHECK_FAIL(accept(fd, (struct sockaddr *)&addr, &len) >= 0)) {
+		perror("Unexpected success in second accept");
+		return NULL;
+	}
 
 	close(client_fd);
 
 	return NULL;
 }
 
-int main(int args, char **argv)
+void test_tcp_rtt(void)
 {
 	int server_fd, cgroup_fd;
-	int err = EXIT_SUCCESS;
 	pthread_t tid;
 
-	if (setup_cgroup_environment())
-		goto cleanup_obj;
-
-	cgroup_fd = create_and_get_cgroup(CG_PATH);
-	if (cgroup_fd < 0)
-		goto cleanup_cgroup_env;
-
-	if (join_cgroup(CG_PATH))
-		goto cleanup_cgroup;
+	cgroup_fd = test__join_cgroup("/tcp_rtt");
+	if (CHECK_FAIL(cgroup_fd < 0))
+		return;
 
 	server_fd = start_server();
-	if (server_fd < 0) {
-		err = EXIT_FAILURE;
-		goto cleanup_cgroup;
-	}
+	if (CHECK_FAIL(server_fd < 0))
+		goto close_cgroup_fd;
 
 	pthread_create(&tid, NULL, server_thread, (void *)&server_fd);
-
-	if (run_test(cgroup_fd, server_fd))
-		err = EXIT_FAILURE;
-
+	CHECK_FAIL(run_test(cgroup_fd, server_fd));
 	close(server_fd);
-
-	printf("test_sockopt_sk: %s\n",
-	       err == EXIT_SUCCESS ? "PASSED" : "FAILED");
-
-cleanup_cgroup:
+close_cgroup_fd:
 	close(cgroup_fd);
-cleanup_cgroup_env:
-	cleanup_cgroup_environment();
-cleanup_obj:
-	return err;
 }
-- 
2.23.0.187.g17f5b7556c-goog


^ permalink raw reply related

* [PATCH bpf-next 5/6] selftests/bpf: test_progs: convert test_sockopt_inherit
From: Stanislav Fomichev @ 2019-09-04 16:25 UTC (permalink / raw)
  To: netdev, bpf; +Cc: davem, ast, daniel, Stanislav Fomichev
In-Reply-To: <20190904162509.199561-1-sdf@google.com>

Move the files, adjust includes, remove entry from Makefile & .gitignore

I also added pthread_cond_wait for the server thread startup. We don't
want to connect to the server that's not yet up (for some reason
this existing race is now more prominent with test_progs).

Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 tools/testing/selftests/bpf/.gitignore        |   1 -
 tools/testing/selftests/bpf/Makefile          |   4 +-
 .../sockopt_inherit.c}                        | 102 ++++++++----------
 3 files changed, 43 insertions(+), 64 deletions(-)
 rename tools/testing/selftests/bpf/{test_sockopt_inherit.c => prog_tests/sockopt_inherit.c} (72%)

diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index 4143add5a11e..5b06bb45b500 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -39,5 +39,4 @@ libbpf.so.*
 test_hashmap
 test_btf_dump
 xdping
-test_sockopt_inherit
 test_tcp_rtt
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 271f8ce89c97..fe786df1174b 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -28,8 +28,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test
 	test_sock test_btf test_sockmap get_cgroup_id_user test_socket_cookie \
 	test_cgroup_storage test_select_reuseport test_section_names \
 	test_netcnt test_tcpnotify_user test_sock_fields test_sysctl test_hashmap \
-	test_btf_dump test_cgroup_attach xdping \
-	test_sockopt_inherit test_tcp_rtt
+	test_btf_dump test_cgroup_attach xdping test_tcp_rtt
 
 BPF_OBJ_FILES = $(patsubst %.c,%.o, $(notdir $(wildcard progs/*.c)))
 TEST_GEN_FILES = $(BPF_OBJ_FILES)
@@ -109,7 +108,6 @@ $(OUTPUT)/test_netcnt: cgroup_helpers.c
 $(OUTPUT)/test_sock_fields: cgroup_helpers.c
 $(OUTPUT)/test_sysctl: cgroup_helpers.c
 $(OUTPUT)/test_cgroup_attach: cgroup_helpers.c
-$(OUTPUT)/test_sockopt_inherit: cgroup_helpers.c
 $(OUTPUT)/test_tcp_rtt: cgroup_helpers.c
 
 .PHONY: force
diff --git a/tools/testing/selftests/bpf/test_sockopt_inherit.c b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c
similarity index 72%
rename from tools/testing/selftests/bpf/test_sockopt_inherit.c
rename to tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c
index 1bf699815b9b..6cbeea7b4bf1 100644
--- a/tools/testing/selftests/bpf/test_sockopt_inherit.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c
@@ -1,22 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <error.h>
-#include <errno.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <sys/types.h>
-#include <sys/socket.h>
-#include <netinet/in.h>
-#include <pthread.h>
-
-#include <linux/filter.h>
-#include <bpf/bpf.h>
-#include <bpf/libbpf.h>
-
-#include "bpf_rlimit.h"
-#include "bpf_util.h"
+#include <test_progs.h>
 #include "cgroup_helpers.h"
 
-#define CG_PATH				"/sockopt_inherit"
 #define SOL_CUSTOM			0xdeadbeef
 #define CUSTOM_INHERIT1			0
 #define CUSTOM_INHERIT2			1
@@ -74,6 +59,9 @@ static int verify_sockopt(int fd, int optname, const char *msg, char expected)
 	return 0;
 }
 
+static pthread_mutex_t server_started_mtx = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t server_started = PTHREAD_COND_INITIALIZER;
+
 static void *server_thread(void *arg)
 {
 	struct sockaddr_storage addr;
@@ -82,16 +70,26 @@ static void *server_thread(void *arg)
 	int client_fd;
 	int err = 0;
 
-	if (listen(fd, 1) < 0)
-		error(1, errno, "Failed to listed on socket");
+	err = listen(fd, 1);
+
+	pthread_mutex_lock(&server_started_mtx);
+	pthread_cond_signal(&server_started);
+	pthread_mutex_unlock(&server_started_mtx);
+
+	if (CHECK_FAIL(err < 0)) {
+		perror("Failed to listed on socket");
+		return NULL;
+	}
 
 	err += verify_sockopt(fd, CUSTOM_INHERIT1, "listen", 1);
 	err += verify_sockopt(fd, CUSTOM_INHERIT2, "listen", 1);
 	err += verify_sockopt(fd, CUSTOM_LISTENER, "listen", 1);
 
 	client_fd = accept(fd, (struct sockaddr *)&addr, &len);
-	if (client_fd < 0)
-		error(1, errno, "Failed to accept client");
+	if (CHECK_FAIL(client_fd < 0)) {
+		perror("Failed to accept client");
+		return NULL;
+	}
 
 	err += verify_sockopt(client_fd, CUSTOM_INHERIT1, "accept", 1);
 	err += verify_sockopt(client_fd, CUSTOM_INHERIT2, "accept", 1);
@@ -167,7 +165,7 @@ static int prog_attach(struct bpf_object *obj, int cgroup_fd, const char *title)
 	return 0;
 }
 
-static int run_test(int cgroup_fd)
+static void run_test(int cgroup_fd)
 {
 	struct bpf_prog_load_attr attr = {
 		.file = "./sockopt_inherit.o",
@@ -180,40 +178,41 @@ static int run_test(int cgroup_fd)
 	int err;
 
 	err = bpf_prog_load_xattr(&attr, &obj, &ignored);
-	if (err) {
-		log_err("Failed to load BPF object");
-		return -1;
-	}
+	if (CHECK_FAIL(err))
+		return;
 
 	err = prog_attach(obj, cgroup_fd, "cgroup/getsockopt");
-	if (err)
+	if (CHECK_FAIL(err))
 		goto close_bpf_object;
 
 	err = prog_attach(obj, cgroup_fd, "cgroup/setsockopt");
-	if (err)
+	if (CHECK_FAIL(err))
 		goto close_bpf_object;
 
 	server_fd = start_server();
-	if (server_fd < 0) {
-		err = -1;
+	if (CHECK_FAIL(server_fd < 0))
+		goto close_bpf_object;
+
+	if (CHECK_FAIL(pthread_create(&tid, NULL, server_thread,
+				      (void *)&server_fd)))
 		goto close_bpf_object;
-	}
 
-	pthread_create(&tid, NULL, server_thread, (void *)&server_fd);
+	pthread_mutex_lock(&server_started_mtx);
+	pthread_cond_wait(&server_started, &server_started_mtx);
+	pthread_mutex_unlock(&server_started_mtx);
 
 	client_fd = connect_to_server(server_fd);
-	if (client_fd < 0) {
-		err = -1;
+	if (CHECK_FAIL(client_fd < 0))
 		goto close_server_fd;
-	}
 
-	err += verify_sockopt(client_fd, CUSTOM_INHERIT1, "connect", 0);
-	err += verify_sockopt(client_fd, CUSTOM_INHERIT2, "connect", 0);
-	err += verify_sockopt(client_fd, CUSTOM_LISTENER, "connect", 0);
+	CHECK_FAIL(verify_sockopt(client_fd, CUSTOM_INHERIT1, "connect", 0));
+	CHECK_FAIL(verify_sockopt(client_fd, CUSTOM_INHERIT2, "connect", 0));
+	CHECK_FAIL(verify_sockopt(client_fd, CUSTOM_LISTENER, "connect", 0));
 
 	pthread_join(tid, &server_err);
 
-	err += (int)(long)server_err;
+	err = (int)(long)server_err;
+	CHECK_FAIL(err);
 
 	close(client_fd);
 
@@ -221,33 +220,16 @@ static int run_test(int cgroup_fd)
 	close(server_fd);
 close_bpf_object:
 	bpf_object__close(obj);
-	return err;
 }
 
-int main(int args, char **argv)
+void test_sockopt_inherit(void)
 {
 	int cgroup_fd;
-	int err = EXIT_SUCCESS;
-
-	if (setup_cgroup_environment())
-		return err;
-
-	cgroup_fd = create_and_get_cgroup(CG_PATH);
-	if (cgroup_fd < 0)
-		goto cleanup_cgroup_env;
-
-	if (join_cgroup(CG_PATH))
-		goto cleanup_cgroup;
-
-	if (run_test(cgroup_fd))
-		err = EXIT_FAILURE;
 
-	printf("test_sockopt_inherit: %s\n",
-	       err == EXIT_SUCCESS ? "PASSED" : "FAILED");
+	cgroup_fd = test__join_cgroup("/sockopt_inherit");
+	if (CHECK_FAIL(cgroup_fd < 0))
+		return;
 
-cleanup_cgroup:
+	run_test(cgroup_fd);
 	close(cgroup_fd);
-cleanup_cgroup_env:
-	cleanup_cgroup_environment();
-	return err;
 }
-- 
2.23.0.187.g17f5b7556c-goog


^ permalink raw reply related

* [PATCH bpf-next 4/6] selftests/bpf: test_progs: convert test_sockopt_multi
From: Stanislav Fomichev @ 2019-09-04 16:25 UTC (permalink / raw)
  To: netdev, bpf; +Cc: davem, ast, daniel, Stanislav Fomichev
In-Reply-To: <20190904162509.199561-1-sdf@google.com>

Move the files, adjust includes, remove entry from Makefile & .gitignore

Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 tools/testing/selftests/bpf/.gitignore        |  1 -
 tools/testing/selftests/bpf/Makefile          |  3 +-
 .../sockopt_multi.c}                          | 62 +++----------------
 3 files changed, 11 insertions(+), 55 deletions(-)
 rename tools/testing/selftests/bpf/{test_sockopt_multi.c => prog_tests/sockopt_multi.c} (83%)

diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index bc83c1a7ea1b..4143add5a11e 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -39,6 +39,5 @@ libbpf.so.*
 test_hashmap
 test_btf_dump
 xdping
-test_sockopt_multi
 test_sockopt_inherit
 test_tcp_rtt
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index ea790901297c..271f8ce89c97 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -29,7 +29,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test
 	test_cgroup_storage test_select_reuseport test_section_names \
 	test_netcnt test_tcpnotify_user test_sock_fields test_sysctl test_hashmap \
 	test_btf_dump test_cgroup_attach xdping \
-	test_sockopt_multi test_sockopt_inherit test_tcp_rtt
+	test_sockopt_inherit test_tcp_rtt
 
 BPF_OBJ_FILES = $(patsubst %.c,%.o, $(notdir $(wildcard progs/*.c)))
 TEST_GEN_FILES = $(BPF_OBJ_FILES)
@@ -109,7 +109,6 @@ $(OUTPUT)/test_netcnt: cgroup_helpers.c
 $(OUTPUT)/test_sock_fields: cgroup_helpers.c
 $(OUTPUT)/test_sysctl: cgroup_helpers.c
 $(OUTPUT)/test_cgroup_attach: cgroup_helpers.c
-$(OUTPUT)/test_sockopt_multi: cgroup_helpers.c
 $(OUTPUT)/test_sockopt_inherit: cgroup_helpers.c
 $(OUTPUT)/test_tcp_rtt: cgroup_helpers.c
 
diff --git a/tools/testing/selftests/bpf/test_sockopt_multi.c b/tools/testing/selftests/bpf/prog_tests/sockopt_multi.c
similarity index 83%
rename from tools/testing/selftests/bpf/test_sockopt_multi.c
rename to tools/testing/selftests/bpf/prog_tests/sockopt_multi.c
index 4be3441db867..29188d6f5c8d 100644
--- a/tools/testing/selftests/bpf/test_sockopt_multi.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockopt_multi.c
@@ -1,19 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-
-#include <error.h>
-#include <errno.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <sys/types.h>
-#include <sys/socket.h>
-#include <netinet/in.h>
-
-#include <linux/filter.h>
-#include <bpf/bpf.h>
-#include <bpf/libbpf.h>
-
-#include "bpf_rlimit.h"
-#include "bpf_util.h"
+#include <test_progs.h>
 #include "cgroup_helpers.h"
 
 static int prog_attach(struct bpf_object *obj, int cgroup_fd, const char *title)
@@ -308,7 +294,7 @@ static int run_setsockopt_test(struct bpf_object *obj, int cg_parent,
 	return err;
 }
 
-int main(int argc, char **argv)
+void test_sockopt_multi(void)
 {
 	struct bpf_prog_load_attr attr = {
 		.file = "./sockopt_multi.o",
@@ -319,56 +305,28 @@ int main(int argc, char **argv)
 	int err = -1;
 	int ignored;
 
-	if (setup_cgroup_environment()) {
-		log_err("Failed to setup cgroup environment\n");
-		goto out;
-	}
-
-	cg_parent = create_and_get_cgroup("/parent");
-	if (cg_parent < 0) {
-		log_err("Failed to create cgroup /parent\n");
-		goto out;
-	}
-
-	cg_child = create_and_get_cgroup("/parent/child");
-	if (cg_child < 0) {
-		log_err("Failed to create cgroup /parent/child\n");
+	cg_parent = test__join_cgroup("/parent");
+	if (CHECK_FAIL(cg_parent < 0))
 		goto out;
-	}
 
-	if (join_cgroup("/parent/child")) {
-		log_err("Failed to join cgroup /parent/child\n");
+	cg_child = test__join_cgroup("/parent/child");
+	if (CHECK_FAIL(cg_child < 0))
 		goto out;
-	}
 
 	err = bpf_prog_load_xattr(&attr, &obj, &ignored);
-	if (err) {
-		log_err("Failed to load BPF object");
+	if (CHECK_FAIL(err))
 		goto out;
-	}
 
 	sock_fd = socket(AF_INET, SOCK_STREAM, 0);
-	if (sock_fd < 0) {
-		log_err("Failed to create socket");
+	if (CHECK_FAIL(sock_fd < 0))
 		goto out;
-	}
 
-	if (run_getsockopt_test(obj, cg_parent, cg_child, sock_fd))
-		err = -1;
-	printf("test_sockopt_multi: getsockopt %s\n",
-	       err ? "FAILED" : "PASSED");
-
-	if (run_setsockopt_test(obj, cg_parent, cg_child, sock_fd))
-		err = -1;
-	printf("test_sockopt_multi: setsockopt %s\n",
-	       err ? "FAILED" : "PASSED");
+	CHECK_FAIL(run_getsockopt_test(obj, cg_parent, cg_child, sock_fd));
+	CHECK_FAIL(run_setsockopt_test(obj, cg_parent, cg_child, sock_fd));
 
 out:
 	close(sock_fd);
 	bpf_object__close(obj);
 	close(cg_child);
 	close(cg_parent);
-
-	printf("test_sockopt_multi: %s\n", err ? "FAILED" : "PASSED");
-	return err ? EXIT_FAILURE : EXIT_SUCCESS;
 }
-- 
2.23.0.187.g17f5b7556c-goog


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox