netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCHv3 net 0/3] bond: fix xfrm offload issues
@ 2025-02-27  8:37 Hangbin Liu
  2025-02-27  8:37 ` [PATCHv3 net 1/3] bonding: move IPsec deletion to bond_ipsec_free_sa Hangbin Liu
                   ` (2 more replies)
  0 siblings, 3 replies; 16+ messages in thread
From: Hangbin Liu @ 2025-02-27  8:37 UTC (permalink / raw)
  To: netdev
  Cc: Jay Vosburgh, Andrew Lunn, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Nikolay Aleksandrov, Simon Horman,
	Shuah Khan, Tariq Toukan, Jianbo Liu, Jarod Wilson,
	Steffen Klassert, Cosmin Ratiu, linux-kselftest, linux-kernel,
	Hangbin Liu

The first patch fixes the incorrect locks using in bond driver.
The second patch fixes the xfrm offload feature during setup active-backup
mode. The third patch add a ipsec offload testing.

v3: move the ipsec deletion to bond_ipsec_free_sa (Cosmin Ratiu)
v2: do not turn carrier on if bond change link failed (Nikolay Aleksandrov)
    move the mutex lock to a work queue (Cosmin Ratiu)

Hangbin Liu (3):
  bonding: move IPsec deletion to bond_ipsec_free_sa
  bonding: fix xfrm offload feature setup on active-backup mode
  selftests: bonding: add ipsec offload test

 drivers/net/bonding/bond_main.c               |  36 ++--
 drivers/net/bonding/bond_netlink.c            |  16 +-
 include/net/bonding.h                         |   1 +
 .../selftests/drivers/net/bonding/Makefile    |   3 +-
 .../drivers/net/bonding/bond_ipsec_offload.sh | 155 ++++++++++++++++++
 .../selftests/drivers/net/bonding/config      |   4 +
 6 files changed, 195 insertions(+), 20 deletions(-)
 create mode 100755 tools/testing/selftests/drivers/net/bonding/bond_ipsec_offload.sh

-- 
2.46.0


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCHv3 net 1/3] bonding: move IPsec deletion to bond_ipsec_free_sa
  2025-02-27  8:37 [PATCHv3 net 0/3] bond: fix xfrm offload issues Hangbin Liu
@ 2025-02-27  8:37 ` Hangbin Liu
  2025-02-27  8:50   ` Nikolay Aleksandrov
  2025-02-27  8:37 ` [PATCHv3 net 2/3] bonding: fix xfrm offload feature setup on active-backup mode Hangbin Liu
  2025-02-27  8:37 ` [PATCHv3 net 3/3] selftests: bonding: add ipsec offload test Hangbin Liu
  2 siblings, 1 reply; 16+ messages in thread
From: Hangbin Liu @ 2025-02-27  8:37 UTC (permalink / raw)
  To: netdev
  Cc: Jay Vosburgh, Andrew Lunn, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Nikolay Aleksandrov, Simon Horman,
	Shuah Khan, Tariq Toukan, Jianbo Liu, Jarod Wilson,
	Steffen Klassert, Cosmin Ratiu, linux-kselftest, linux-kernel,
	Hangbin Liu

The fixed commit placed mutex_lock() inside spin_lock_bh(), which triggers
a warning:

  BUG: sleeping function called from invalid context at...

Fix this by moving the IPsec deletion operation to bond_ipsec_free_sa,
which is not held by spin_lock_bh().

Additionally, delete the IPsec list in bond_ipsec_del_sa_all() when the
XFRM state is DEAD to prevent xdo_dev_state_free() from being triggered
again in bond_ipsec_free_sa().

For bond_ipsec_free_sa(), there are now three conditions:

  1. if (!slave): When no active device exists.
  2. if (!xs->xso.real_dev): When xdo_dev_state_add() fails.
  3. if (xs->xso.real_dev != real_dev): When an xs has already been freed
     by bond_ipsec_del_sa_all() due to migration, and the active slave has
     changed to a new device. At the same time, the xs is marked as DEAD
     due to the XFRM entry is removed, triggering xfrm_state_gc_task() and
     bond_ipsec_free_sa().

In all three cases, xdo_dev_state_free() should not be called, only xs
should be removed from bond->ipsec list.

Fixes: 2aeeef906d5a ("bonding: change ipsec_lock from spin lock to mutex")
Reported-by: Jakub Kicinski <kuba@kernel.org>
Closes: https://lore.kernel.org/netdev/20241212062734.182a0164@kernel.org
Suggested-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
---
 drivers/net/bonding/bond_main.c | 34 ++++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index e45bba240cbc..683bf1221caf 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -537,6 +537,10 @@ static void bond_ipsec_add_sa_all(struct bonding *bond)
 	}
 
 	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+		/* Skip dead xfrm states, they'll be freed later. */
+		if (ipsec->xs->km.state == XFRM_STATE_DEAD)
+			continue;
+
 		/* If new state is added before ipsec_lock acquired */
 		if (ipsec->xs->xso.real_dev == real_dev)
 			continue;
@@ -560,7 +564,6 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs)
 	struct net_device *bond_dev = xs->xso.dev;
 	struct net_device *real_dev;
 	netdevice_tracker tracker;
-	struct bond_ipsec *ipsec;
 	struct bonding *bond;
 	struct slave *slave;
 
@@ -592,15 +595,6 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs)
 	real_dev->xfrmdev_ops->xdo_dev_state_delete(xs);
 out:
 	netdev_put(real_dev, &tracker);
-	mutex_lock(&bond->ipsec_lock);
-	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
-		if (ipsec->xs == xs) {
-			list_del(&ipsec->list);
-			kfree(ipsec);
-			break;
-		}
-	}
-	mutex_unlock(&bond->ipsec_lock);
 }
 
 static void bond_ipsec_del_sa_all(struct bonding *bond)
@@ -617,6 +611,12 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)
 
 	mutex_lock(&bond->ipsec_lock);
 	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+		if (ipsec->xs->km.state == XFRM_STATE_DEAD) {
+			list_del(&ipsec->list);
+			kfree(ipsec);
+			continue;
+		}
+
 		if (!ipsec->xs->xso.real_dev)
 			continue;
 
@@ -640,6 +640,7 @@ static void bond_ipsec_free_sa(struct xfrm_state *xs)
 	struct net_device *bond_dev = xs->xso.dev;
 	struct net_device *real_dev;
 	netdevice_tracker tracker;
+	struct bond_ipsec *ipsec;
 	struct bonding *bond;
 	struct slave *slave;
 
@@ -659,13 +660,24 @@ static void bond_ipsec_free_sa(struct xfrm_state *xs)
 	if (!xs->xso.real_dev)
 		goto out;
 
-	WARN_ON(xs->xso.real_dev != real_dev);
+	if (xs->xso.real_dev != real_dev)
+		goto out;
 
 	if (real_dev && real_dev->xfrmdev_ops &&
 	    real_dev->xfrmdev_ops->xdo_dev_state_free)
 		real_dev->xfrmdev_ops->xdo_dev_state_free(xs);
 out:
 	netdev_put(real_dev, &tracker);
+
+	mutex_lock(&bond->ipsec_lock);
+	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+		if (ipsec->xs == xs) {
+			list_del(&ipsec->list);
+			kfree(ipsec);
+			break;
+		}
+	}
+	mutex_unlock(&bond->ipsec_lock);
 }
 
 /**
-- 
2.46.0


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCHv3 net 2/3] bonding: fix xfrm offload feature setup on active-backup mode
  2025-02-27  8:37 [PATCHv3 net 0/3] bond: fix xfrm offload issues Hangbin Liu
  2025-02-27  8:37 ` [PATCHv3 net 1/3] bonding: move IPsec deletion to bond_ipsec_free_sa Hangbin Liu
@ 2025-02-27  8:37 ` Hangbin Liu
  2025-02-27  8:37 ` [PATCHv3 net 3/3] selftests: bonding: add ipsec offload test Hangbin Liu
  2 siblings, 0 replies; 16+ messages in thread
From: Hangbin Liu @ 2025-02-27  8:37 UTC (permalink / raw)
  To: netdev
  Cc: Jay Vosburgh, Andrew Lunn, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Nikolay Aleksandrov, Simon Horman,
	Shuah Khan, Tariq Toukan, Jianbo Liu, Jarod Wilson,
	Steffen Klassert, Cosmin Ratiu, linux-kselftest, linux-kernel,
	Hangbin Liu

The active-backup bonding mode supports XFRM ESP offload. However, when
a bond is added using command like `ip link add bond0 type bond mode 1
miimon 100`, the `ethtool -k` command shows that the XFRM ESP offload is
disabled. This occurs because, in bond_newlink(), we change bond link
first and register bond device later. So the XFRM feature update in
bond_option_mode_set() is not called as the bond device is not yet
registered, leading to the offload feature not being set successfully.

To resolve this issue, we can modify the code order in bond_newlink() to
ensure that the bond device is registered first before changing the bond
link parameters. This change will allow the XFRM ESP offload feature to be
correctly enabled.

Fixes: 007ab5345545 ("bonding: fix feature flag setting at init time")
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
---
 drivers/net/bonding/bond_main.c    |  2 +-
 drivers/net/bonding/bond_netlink.c | 16 +++++++++-------
 include/net/bonding.h              |  1 +
 3 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 683bf1221caf..65e4b5d599e6 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4401,7 +4401,7 @@ void bond_work_init_all(struct bonding *bond)
 	INIT_DELAYED_WORK(&bond->slave_arr_work, bond_slave_arr_handler);
 }
 
-static void bond_work_cancel_all(struct bonding *bond)
+void bond_work_cancel_all(struct bonding *bond)
 {
 	cancel_delayed_work_sync(&bond->mii_work);
 	cancel_delayed_work_sync(&bond->arp_work);
diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c
index 2a6a424806aa..ed16af6db557 100644
--- a/drivers/net/bonding/bond_netlink.c
+++ b/drivers/net/bonding/bond_netlink.c
@@ -568,18 +568,20 @@ static int bond_newlink(struct net *src_net, struct net_device *bond_dev,
 			struct nlattr *tb[], struct nlattr *data[],
 			struct netlink_ext_ack *extack)
 {
+	struct bonding *bond = netdev_priv(bond_dev);
 	int err;
 
-	err = bond_changelink(bond_dev, tb, data, extack);
-	if (err < 0)
+	err = register_netdevice(bond_dev);
+	if (err)
 		return err;
 
-	err = register_netdevice(bond_dev);
-	if (!err) {
-		struct bonding *bond = netdev_priv(bond_dev);
+	netif_carrier_off(bond_dev);
+	bond_work_init_all(bond);
 
-		netif_carrier_off(bond_dev);
-		bond_work_init_all(bond);
+	err = bond_changelink(bond_dev, tb, data, extack);
+	if (err) {
+		bond_work_cancel_all(bond);
+		unregister_netdevice(bond_dev);
 	}
 
 	return err;
diff --git a/include/net/bonding.h b/include/net/bonding.h
index 8bb5f016969f..e5e005cd2e17 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -707,6 +707,7 @@ struct bond_vlan_tag *bond_verify_device_path(struct net_device *start_dev,
 int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave);
 void bond_slave_arr_work_rearm(struct bonding *bond, unsigned long delay);
 void bond_work_init_all(struct bonding *bond);
+void bond_work_cancel_all(struct bonding *bond);
 
 #ifdef CONFIG_PROC_FS
 void bond_create_proc_entry(struct bonding *bond);
-- 
2.46.0


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCHv3 net 3/3] selftests: bonding: add ipsec offload test
  2025-02-27  8:37 [PATCHv3 net 0/3] bond: fix xfrm offload issues Hangbin Liu
  2025-02-27  8:37 ` [PATCHv3 net 1/3] bonding: move IPsec deletion to bond_ipsec_free_sa Hangbin Liu
  2025-02-27  8:37 ` [PATCHv3 net 2/3] bonding: fix xfrm offload feature setup on active-backup mode Hangbin Liu
@ 2025-02-27  8:37 ` Hangbin Liu
  2025-02-27 13:59   ` Petr Machata
  2 siblings, 1 reply; 16+ messages in thread
From: Hangbin Liu @ 2025-02-27  8:37 UTC (permalink / raw)
  To: netdev
  Cc: Jay Vosburgh, Andrew Lunn, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Nikolay Aleksandrov, Simon Horman,
	Shuah Khan, Tariq Toukan, Jianbo Liu, Jarod Wilson,
	Steffen Klassert, Cosmin Ratiu, linux-kselftest, linux-kernel,
	Hangbin Liu

This introduces a test for IPSec offload over bonding, utilizing netdevsim
for the testing process, as veth interfaces do not support IPSec offload.
The test will ensure that the IPSec offload functionality remains operational
even after a failover event occurs in the bonding configuration.

Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
---
 .../selftests/drivers/net/bonding/Makefile    |   3 +-
 .../drivers/net/bonding/bond_ipsec_offload.sh | 155 ++++++++++++++++++
 .../selftests/drivers/net/bonding/config      |   4 +
 3 files changed, 161 insertions(+), 1 deletion(-)
 create mode 100755 tools/testing/selftests/drivers/net/bonding/bond_ipsec_offload.sh

diff --git a/tools/testing/selftests/drivers/net/bonding/Makefile b/tools/testing/selftests/drivers/net/bonding/Makefile
index 2b10854e4b1e..d5a7de16d33a 100644
--- a/tools/testing/selftests/drivers/net/bonding/Makefile
+++ b/tools/testing/selftests/drivers/net/bonding/Makefile
@@ -10,7 +10,8 @@ TEST_PROGS := \
 	mode-2-recovery-updelay.sh \
 	bond_options.sh \
 	bond-eth-type-change.sh \
-	bond_macvlan_ipvlan.sh
+	bond_macvlan_ipvlan.sh \
+	bond_ipsec_offload.sh
 
 TEST_FILES := \
 	lag_lib.sh \
diff --git a/tools/testing/selftests/drivers/net/bonding/bond_ipsec_offload.sh b/tools/testing/selftests/drivers/net/bonding/bond_ipsec_offload.sh
new file mode 100755
index 000000000000..169866b47a67
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/bonding/bond_ipsec_offload.sh
@@ -0,0 +1,155 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# IPsec over bonding offload test:
+#
+#  +----------------+
+#  |     bond0      |
+#  |       |        |
+#  |  eth0    eth1  |
+#  +---+-------+----+
+#
+# We use netdevsim instead of physical interfaces
+#-------------------------------------------------------------------
+# Example commands
+#   ip x s add proto esp src 192.0.2.1 dst 192.0.2.2 \
+#            spi 0x07 mode transport reqid 0x07 replay-window 32 \
+#            aead 'rfc4106(gcm(aes))' 1234567890123456dcba 128 \
+#            sel src 192.0.2.1/24 dst 192.0.2.2/24
+#            offload dev bond0 dir out
+#   ip x p add dir out src 192.0.2.1/24 dst 192.0.2.2/24 \
+#            tmpl proto esp src 192.0.2.1 dst 192.0.2.2 \
+#            spi 0x07 mode transport reqid 0x07
+#
+#-------------------------------------------------------------------
+
+lib_dir=$(dirname "$0")
+source "$lib_dir"/../../../net/lib.sh
+algo="aead rfc4106(gcm(aes)) 0x3132333435363738393031323334353664636261 128"
+srcip=192.0.2.1
+dstip=192.0.2.2
+ipsec0=/sys/kernel/debug/netdevsim/netdevsim0/ports/0/ipsec
+ipsec1=/sys/kernel/debug/netdevsim/netdevsim0/ports/1/ipsec
+ret=0
+
+cleanup()
+{
+	modprobe -r netdevsim
+	cleanup_ns $ns
+}
+
+active_slave_changed()
+{
+        local old_active_slave=$1
+        local new_active_slave=$(ip -n ${ns} -d -j link show bond0 | \
+				 jq -r ".[].linkinfo.info_data.active_slave")
+        [ "$new_active_slave" != "$old_active_slave" -a "$new_active_slave" != "null" ]
+}
+
+test_offload()
+{
+	# use ping to exercise the Tx path
+	ip netns exec $ns ping -I bond0 -c 3 -W 1 -i 0 $dstip >/dev/null
+
+	active_slave=$(ip -n ${ns} -d -j link show bond0 | \
+		       jq -r ".[].linkinfo.info_data.active_slave")
+
+	if [ $active_slave = $nic0 ]; then
+		sysfs=$ipsec0
+	elif [ $active_slave = $nic1 ]; then
+		sysfs=$ipsec1
+	else
+		echo "FAIL: bond_ipsec_offload invalid active_slave $active_slave"
+		ret=1
+	fi
+
+	# The tx/rx order in sysfs may changed after failover
+	if grep -q "SA count=2 tx=3" $sysfs && grep -q "tx ipaddr=$dstip" $sysfs; then
+		echo "PASS: bond_ipsec_offload has correct tx count with link ${active_slave}"
+	else
+		echo "FAIL: bond_ipsec_offload incorrect tx count with link ${active_slave}"
+		ret=1
+	fi
+}
+
+if ! mount | grep -q debugfs; then
+	mount -t debugfs none /sys/kernel/debug/ &> /dev/null
+fi
+
+# setup netdevsim since dummy/veth dev doesn't have offload support
+if [ ! -w /sys/bus/netdevsim/new_device ] ; then
+	modprobe -q netdevsim
+	if [ $? -ne 0 ]; then
+		echo "SKIP: can't load netdevsim for ipsec offload"
+		exit $ksft_skip
+	fi
+fi
+
+trap cleanup EXIT
+
+setup_ns ns
+ip -n $ns link add bond0 type bond mode active-backup miimon 100
+ip -n $ns addr add $srcip/24 dev bond0
+ip -n $ns link set bond0 up
+
+ifaces=$(ip netns exec $ns bash -c '
+	sysfsnet=/sys/bus/netdevsim/devices/netdevsim0/net/
+	echo "0 2" > /sys/bus/netdevsim/new_device
+	while [ ! -d $sysfsnet ] ; do :; done
+	udevadm settle
+	ls $sysfsnet
+')
+nic0=$(echo $ifaces | cut -f1 -d ' ')
+nic1=$(echo $ifaces | cut -f2 -d ' ')
+ip -n $ns link set $nic0 master bond0
+ip -n $ns link set $nic1 master bond0
+
+# create offloaded SAs, both in and out
+ip -n $ns x p add dir out src $srcip/24 dst $dstip/24 \
+    tmpl proto esp src $srcip dst $dstip spi 9 \
+    mode transport reqid 42
+
+ip -n $ns x p add dir in src $dstip/24 dst $srcip/24 \
+    tmpl proto esp src $dstip dst $srcip spi 9 \
+    mode transport reqid 42
+
+ip -n $ns x s add proto esp src $srcip dst $dstip spi 9 \
+    mode transport reqid 42 $algo sel src $srcip/24 dst $dstip/24 \
+    offload dev bond0 dir out
+
+ip -n $ns x s add proto esp src $dstip dst $srcip spi 9 \
+    mode transport reqid 42 $algo sel src $dstip/24 dst $srcip/24 \
+    offload dev bond0 dir in
+
+# does offload show up in ip output
+lines=`ip -n $ns x s list | grep -c "crypto offload parameters: dev bond0 dir"`
+if [ $lines -ne 2 ] ; then
+	echo "FAIL: bond_ipsec_offload SA offload missing from list output"
+	ret=1
+fi
+
+# we didn't create a peer, make sure we can Tx by adding a permanent neighbour
+# this need to be added after enslave
+ip -n $ns neigh add $dstip dev bond0 lladdr 00:11:22:33:44:55
+
+# start Offload testing
+test_offload
+
+# do failover
+ip -n $ns link set $active_slave down
+slowwait 5 active_slave_changed $active_slave
+test_offload
+
+# make sure offload get removed from driver
+ip -n $ns x s flush
+ip -n $ns x p flush
+line0=$(grep -c "SA count=0" $ipsec0)
+line1=$(grep -c "SA count=0" $ipsec1)
+if [ $line0 -ne 1 -o $line1 -ne 1 ]  ; then
+	echo "FAIL: bond_ipsec_offload SA not removed from driver"
+	ret=1
+else
+	echo "PASS: bond_ipsec_offload SA removed from driver"
+fi
+
+exit $ret
diff --git a/tools/testing/selftests/drivers/net/bonding/config b/tools/testing/selftests/drivers/net/bonding/config
index dad4e5fda4db..054fb772846f 100644
--- a/tools/testing/selftests/drivers/net/bonding/config
+++ b/tools/testing/selftests/drivers/net/bonding/config
@@ -9,3 +9,7 @@ CONFIG_NET_CLS_FLOWER=y
 CONFIG_NET_SCH_INGRESS=y
 CONFIG_NLMON=y
 CONFIG_VETH=y
+CONFIG_INET_ESP=y
+CONFIG_INET_ESP_OFFLOAD=y
+CONFIG_XFRM_USER=m
+CONFIG_NETDEVSIM=m
-- 
2.46.0


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCHv3 net 1/3] bonding: move IPsec deletion to bond_ipsec_free_sa
  2025-02-27  8:37 ` [PATCHv3 net 1/3] bonding: move IPsec deletion to bond_ipsec_free_sa Hangbin Liu
@ 2025-02-27  8:50   ` Nikolay Aleksandrov
  2025-02-27  9:21     ` Nikolay Aleksandrov
  0 siblings, 1 reply; 16+ messages in thread
From: Nikolay Aleksandrov @ 2025-02-27  8:50 UTC (permalink / raw)
  To: Hangbin Liu, netdev
  Cc: Jay Vosburgh, Andrew Lunn, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman, Shuah Khan,
	Tariq Toukan, Jianbo Liu, Jarod Wilson, Steffen Klassert,
	Cosmin Ratiu, linux-kselftest, linux-kernel

On 2/27/25 10:37, Hangbin Liu wrote:
> The fixed commit placed mutex_lock() inside spin_lock_bh(), which triggers
> a warning:
> 
>   BUG: sleeping function called from invalid context at...
> 
> Fix this by moving the IPsec deletion operation to bond_ipsec_free_sa,
> which is not held by spin_lock_bh().
> 
> Additionally, delete the IPsec list in bond_ipsec_del_sa_all() when the
> XFRM state is DEAD to prevent xdo_dev_state_free() from being triggered
> again in bond_ipsec_free_sa().
> 
> For bond_ipsec_free_sa(), there are now three conditions:
> 
>   1. if (!slave): When no active device exists.
>   2. if (!xs->xso.real_dev): When xdo_dev_state_add() fails.
>   3. if (xs->xso.real_dev != real_dev): When an xs has already been freed
>      by bond_ipsec_del_sa_all() due to migration, and the active slave has
>      changed to a new device. At the same time, the xs is marked as DEAD
>      due to the XFRM entry is removed, triggering xfrm_state_gc_task() and
>      bond_ipsec_free_sa().
> 
> In all three cases, xdo_dev_state_free() should not be called, only xs
> should be removed from bond->ipsec list.
> 
> Fixes: 2aeeef906d5a ("bonding: change ipsec_lock from spin lock to mutex")
> Reported-by: Jakub Kicinski <kuba@kernel.org>
> Closes: https://lore.kernel.org/netdev/20241212062734.182a0164@kernel.org
> Suggested-by: Cosmin Ratiu <cratiu@nvidia.com>
> Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
> ---
>  drivers/net/bonding/bond_main.c | 34 ++++++++++++++++++++++-----------
>  1 file changed, 23 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
> index e45bba240cbc..683bf1221caf 100644
> --- a/drivers/net/bonding/bond_main.c
> +++ b/drivers/net/bonding/bond_main.c
> @@ -537,6 +537,10 @@ static void bond_ipsec_add_sa_all(struct bonding *bond)
>  	}
>  
>  	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> +		/* Skip dead xfrm states, they'll be freed later. */
> +		if (ipsec->xs->km.state == XFRM_STATE_DEAD)
> +			continue;
> +
>  		/* If new state is added before ipsec_lock acquired */
>  		if (ipsec->xs->xso.real_dev == real_dev)
>  			continue;
> @@ -560,7 +564,6 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs)
>  	struct net_device *bond_dev = xs->xso.dev;
>  	struct net_device *real_dev;
>  	netdevice_tracker tracker;
> -	struct bond_ipsec *ipsec;
>  	struct bonding *bond;
>  	struct slave *slave;
>  
> @@ -592,15 +595,6 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs)
>  	real_dev->xfrmdev_ops->xdo_dev_state_delete(xs);
>  out:
>  	netdev_put(real_dev, &tracker);
> -	mutex_lock(&bond->ipsec_lock);
> -	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> -		if (ipsec->xs == xs) {
> -			list_del(&ipsec->list);
> -			kfree(ipsec);
> -			break;
> -		}
> -	}
> -	mutex_unlock(&bond->ipsec_lock);
>  }
>  
>  static void bond_ipsec_del_sa_all(struct bonding *bond)
> @@ -617,6 +611,12 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)
>  
>  	mutex_lock(&bond->ipsec_lock);
>  	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> +		if (ipsec->xs->km.state == XFRM_STATE_DEAD) {
> +			list_del(&ipsec->list);

To be able to do this here, you'll have to use list_for_each_entry_safe().

> +			kfree(ipsec);
> +			continue;
> +		}
> +
>  		if (!ipsec->xs->xso.real_dev)
>  			continue;
>  
> @@ -640,6 +640,7 @@ static void bond_ipsec_free_sa(struct xfrm_state *xs)
>  	struct net_device *bond_dev = xs->xso.dev;
>  	struct net_device *real_dev;
>  	netdevice_tracker tracker;
> +	struct bond_ipsec *ipsec;
>  	struct bonding *bond;
>  	struct slave *slave;
>  
> @@ -659,13 +660,24 @@ static void bond_ipsec_free_sa(struct xfrm_state *xs)
>  	if (!xs->xso.real_dev)
>  		goto out;
>  
> -	WARN_ON(xs->xso.real_dev != real_dev);
> +	if (xs->xso.real_dev != real_dev)
> +		goto out;
>  
>  	if (real_dev && real_dev->xfrmdev_ops &&
>  	    real_dev->xfrmdev_ops->xdo_dev_state_free)
>  		real_dev->xfrmdev_ops->xdo_dev_state_free(xs);
>  out:
>  	netdev_put(real_dev, &tracker);
> +
> +	mutex_lock(&bond->ipsec_lock);
> +	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> +		if (ipsec->xs == xs) {
> +			list_del(&ipsec->list);
> +			kfree(ipsec);
> +			break;
> +		}
> +	}
> +	mutex_unlock(&bond->ipsec_lock);
>  }
>  
>  /**


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCHv3 net 1/3] bonding: move IPsec deletion to bond_ipsec_free_sa
  2025-02-27  8:50   ` Nikolay Aleksandrov
@ 2025-02-27  9:21     ` Nikolay Aleksandrov
  2025-02-27 13:21       ` Hangbin Liu
  0 siblings, 1 reply; 16+ messages in thread
From: Nikolay Aleksandrov @ 2025-02-27  9:21 UTC (permalink / raw)
  To: Hangbin Liu, netdev
  Cc: Jay Vosburgh, Andrew Lunn, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman, Shuah Khan,
	Tariq Toukan, Jianbo Liu, Jarod Wilson, Steffen Klassert,
	Cosmin Ratiu, linux-kselftest, linux-kernel

On 2/27/25 10:50, Nikolay Aleksandrov wrote:
> On 2/27/25 10:37, Hangbin Liu wrote:
>> The fixed commit placed mutex_lock() inside spin_lock_bh(), which triggers
>> a warning:
>>
>>   BUG: sleeping function called from invalid context at...
>>
>> Fix this by moving the IPsec deletion operation to bond_ipsec_free_sa,
>> which is not held by spin_lock_bh().
>>
>> Additionally, delete the IPsec list in bond_ipsec_del_sa_all() when the
>> XFRM state is DEAD to prevent xdo_dev_state_free() from being triggered
>> again in bond_ipsec_free_sa().
>>
>> For bond_ipsec_free_sa(), there are now three conditions:
>>
>>   1. if (!slave): When no active device exists.
>>   2. if (!xs->xso.real_dev): When xdo_dev_state_add() fails.
>>   3. if (xs->xso.real_dev != real_dev): When an xs has already been freed
>>      by bond_ipsec_del_sa_all() due to migration, and the active slave has
>>      changed to a new device. At the same time, the xs is marked as DEAD
>>      due to the XFRM entry is removed, triggering xfrm_state_gc_task() and
>>      bond_ipsec_free_sa().
>>
>> In all three cases, xdo_dev_state_free() should not be called, only xs
>> should be removed from bond->ipsec list.
>>
>> Fixes: 2aeeef906d5a ("bonding: change ipsec_lock from spin lock to mutex")
>> Reported-by: Jakub Kicinski <kuba@kernel.org>
>> Closes: https://lore.kernel.org/netdev/20241212062734.182a0164@kernel.org
>> Suggested-by: Cosmin Ratiu <cratiu@nvidia.com>
>> Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
>> ---
>>  drivers/net/bonding/bond_main.c | 34 ++++++++++++++++++++++-----------
>>  1 file changed, 23 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
>> index e45bba240cbc..683bf1221caf 100644
>> --- a/drivers/net/bonding/bond_main.c
>> +++ b/drivers/net/bonding/bond_main.c
>> @@ -537,6 +537,10 @@ static void bond_ipsec_add_sa_all(struct bonding *bond)
>>  	}
>>  
>>  	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
>> +		/* Skip dead xfrm states, they'll be freed later. */
>> +		if (ipsec->xs->km.state == XFRM_STATE_DEAD)
>> +			continue;
>> +
>>  		/* If new state is added before ipsec_lock acquired */
>>  		if (ipsec->xs->xso.real_dev == real_dev)
>>  			continue;
>> @@ -560,7 +564,6 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs)
>>  	struct net_device *bond_dev = xs->xso.dev;
>>  	struct net_device *real_dev;
>>  	netdevice_tracker tracker;
>> -	struct bond_ipsec *ipsec;
>>  	struct bonding *bond;
>>  	struct slave *slave;
>>  
>> @@ -592,15 +595,6 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs)
>>  	real_dev->xfrmdev_ops->xdo_dev_state_delete(xs);
>>  out:
>>  	netdev_put(real_dev, &tracker);
>> -	mutex_lock(&bond->ipsec_lock);
>> -	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
>> -		if (ipsec->xs == xs) {
>> -			list_del(&ipsec->list);
>> -			kfree(ipsec);
>> -			break;
>> -		}
>> -	}
>> -	mutex_unlock(&bond->ipsec_lock);
>>  }
>>  
>>  static void bond_ipsec_del_sa_all(struct bonding *bond)
>> @@ -617,6 +611,12 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)
>>  
>>  	mutex_lock(&bond->ipsec_lock);
>>  	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
>> +		if (ipsec->xs->km.state == XFRM_STATE_DEAD) {
>> +			list_del(&ipsec->list);
> 
> To be able to do this here, you'll have to use list_for_each_entry_safe().
> 

One more thing - note I'm not an xfrm expert by far but it seems to me here you have
to also call  xdo_dev_state_free() with the old active slave dev otherwise that will
never get called with the original real_dev after the switch to a new
active slave (or more accurately it might if the GC runs between the switching
but it is a race), care must be taken wrt sequence of events because the XFRM
GC may be running in parallel which probably means that in bond_ipsec_free_sa()
you'll have to take the mutex before calling xdo_dev_state_free() and check
if the entry is still linked in the bond's ipsec list before calling the free_sa
callback, if it isn't then del_sa_all got to it before the GC and there's nothing
to do if it also called the dev's free_sa callback. The check for real_dev doesn't
seem enough to protect against this race.

Cheers,
 Nik

>> +			kfree(ipsec);
>> +			continue;
>> +		}
>> +
>>  		if (!ipsec->xs->xso.real_dev)
>>  			continue;
>>  
>> @@ -640,6 +640,7 @@ static void bond_ipsec_free_sa(struct xfrm_state *xs)
>>  	struct net_device *bond_dev = xs->xso.dev;
>>  	struct net_device *real_dev;
>>  	netdevice_tracker tracker;
>> +	struct bond_ipsec *ipsec;
>>  	struct bonding *bond;
>>  	struct slave *slave;
>>  
>> @@ -659,13 +660,24 @@ static void bond_ipsec_free_sa(struct xfrm_state *xs)
>>  	if (!xs->xso.real_dev)
>>  		goto out;
>>  
>> -	WARN_ON(xs->xso.real_dev != real_dev);
>> +	if (xs->xso.real_dev != real_dev)
>> +		goto out;
>>  
>>  	if (real_dev && real_dev->xfrmdev_ops &&
>>  	    real_dev->xfrmdev_ops->xdo_dev_state_free)
>>  		real_dev->xfrmdev_ops->xdo_dev_state_free(xs);
>>  out:
>>  	netdev_put(real_dev, &tracker);
>> +
>> +	mutex_lock(&bond->ipsec_lock);
>> +	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
>> +		if (ipsec->xs == xs) {
>> +			list_del(&ipsec->list);
>> +			kfree(ipsec);
>> +			break;
>> +		}
>> +	}
>> +	mutex_unlock(&bond->ipsec_lock);
>>  }
>>  
>>  /**
> 


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCHv3 net 1/3] bonding: move IPsec deletion to bond_ipsec_free_sa
  2025-02-27  9:21     ` Nikolay Aleksandrov
@ 2025-02-27 13:21       ` Hangbin Liu
  2025-02-27 13:31         ` Nikolay Aleksandrov
  0 siblings, 1 reply; 16+ messages in thread
From: Hangbin Liu @ 2025-02-27 13:21 UTC (permalink / raw)
  To: Nikolay Aleksandrov
  Cc: netdev, Jay Vosburgh, Andrew Lunn, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman, Shuah Khan,
	Tariq Toukan, Jianbo Liu, Jarod Wilson, Steffen Klassert,
	Cosmin Ratiu, linux-kselftest, linux-kernel

On Thu, Feb 27, 2025 at 11:21:51AM +0200, Nikolay Aleksandrov wrote:
> >> @@ -617,6 +611,12 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)
> >>  
> >>  	mutex_lock(&bond->ipsec_lock);
> >>  	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> >> +		if (ipsec->xs->km.state == XFRM_STATE_DEAD) {
> >> +			list_del(&ipsec->list);
> > 
> > To be able to do this here, you'll have to use list_for_each_entry_safe().
> > 
> 
> One more thing - note I'm not an xfrm expert by far but it seems to me here you have
> to also call  xdo_dev_state_free() with the old active slave dev otherwise that will
> never get called with the original real_dev after the switch to a new
> active slave (or more accurately it might if the GC runs between the switching
> but it is a race), care must be taken wrt sequence of events because the XFRM

Can we just call xs->xso.real_dev->xfrmdev_ops->xdo_dev_state_free(xs)
no matter xs->xso.real_dev == real_dev or not? I'm afraid calling
xdo_dev_state_free() every where may make us lot more easily.

> GC may be running in parallel which probably means that in bond_ipsec_free_sa()
> you'll have to take the mutex before calling xdo_dev_state_free() and check
> if the entry is still linked in the bond's ipsec list before calling the free_sa
> callback, if it isn't then del_sa_all got to it before the GC and there's nothing
> to do if it also called the dev's free_sa callback. The check for real_dev doesn't
> seem enough to protect against this race.

I agree that we need to take the mutex before calling xdo_dev_state_free()
in bond_ipsec_free_sa(). Do you think if this is enough? I'm a bit lot here.

Thanks
Hangbin

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCHv3 net 1/3] bonding: move IPsec deletion to bond_ipsec_free_sa
  2025-02-27 13:21       ` Hangbin Liu
@ 2025-02-27 13:31         ` Nikolay Aleksandrov
  2025-02-28  2:20           ` Hangbin Liu
  0 siblings, 1 reply; 16+ messages in thread
From: Nikolay Aleksandrov @ 2025-02-27 13:31 UTC (permalink / raw)
  To: Hangbin Liu
  Cc: netdev, Jay Vosburgh, Andrew Lunn, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman, Shuah Khan,
	Tariq Toukan, Jianbo Liu, Jarod Wilson, Steffen Klassert,
	Cosmin Ratiu, linux-kselftest, linux-kernel

On 2/27/25 15:21, Hangbin Liu wrote:
> On Thu, Feb 27, 2025 at 11:21:51AM +0200, Nikolay Aleksandrov wrote:
>>>> @@ -617,6 +611,12 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)
>>>>  
>>>>  	mutex_lock(&bond->ipsec_lock);
>>>>  	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
>>>> +		if (ipsec->xs->km.state == XFRM_STATE_DEAD) {
>>>> +			list_del(&ipsec->list);
>>>
>>> To be able to do this here, you'll have to use list_for_each_entry_safe().
>>>
>>
>> One more thing - note I'm not an xfrm expert by far but it seems to me here you have
>> to also call  xdo_dev_state_free() with the old active slave dev otherwise that will
>> never get called with the original real_dev after the switch to a new
>> active slave (or more accurately it might if the GC runs between the switching
>> but it is a race), care must be taken wrt sequence of events because the XFRM
> 
> Can we just call xs->xso.real_dev->xfrmdev_ops->xdo_dev_state_free(xs)
> no matter xs->xso.real_dev == real_dev or not? I'm afraid calling
> xdo_dev_state_free() every where may make us lot more easily.
> 

You'd have to check all drivers that implement the callback to answer that and even then
I'd stick to the canonical way of how it's done in xfrm and make the bond just passthrough.
Any other games become dangerous and new code will have to be carefully reviewed every
time, calling another device's free_sa when it wasn't added before doesn't sound good.

>> GC may be running in parallel which probably means that in bond_ipsec_free_sa()
>> you'll have to take the mutex before calling xdo_dev_state_free() and check
>> if the entry is still linked in the bond's ipsec list before calling the free_sa
>> callback, if it isn't then del_sa_all got to it before the GC and there's nothing
>> to do if it also called the dev's free_sa callback. The check for real_dev doesn't
>> seem enough to protect against this race.
> 
> I agree that we need to take the mutex before calling xdo_dev_state_free()
> in bond_ipsec_free_sa(). Do you think if this is enough? I'm a bit lot here.
> 
> Thanks
> Hangbin

Well, the race is between the xfrm GC and del_sa_all, in bond's free_sa if you
walk the list under the mutex before calling real_dev's free callback and
don't find the current element that's being freed in free_sa then it was
cleaned up by del_sa_all, otherwise del_sa_all is waiting to walk that
list and clean the entries. I think it should be fine as long as free_sa
was called once with the proper device.




^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCHv3 net 3/3] selftests: bonding: add ipsec offload test
  2025-02-27  8:37 ` [PATCHv3 net 3/3] selftests: bonding: add ipsec offload test Hangbin Liu
@ 2025-02-27 13:59   ` Petr Machata
  0 siblings, 0 replies; 16+ messages in thread
From: Petr Machata @ 2025-02-27 13:59 UTC (permalink / raw)
  To: Hangbin Liu
  Cc: netdev, Jay Vosburgh, Andrew Lunn, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Nikolay Aleksandrov, Simon Horman,
	Shuah Khan, Tariq Toukan, Jianbo Liu, Jarod Wilson,
	Steffen Klassert, Cosmin Ratiu, linux-kselftest, linux-kernel


Hangbin Liu <liuhangbin@gmail.com> writes:

> This introduces a test for IPSec offload over bonding, utilizing netdevsim
> for the testing process, as veth interfaces do not support IPSec offload.
> The test will ensure that the IPSec offload functionality remains operational
> even after a failover event occurs in the bonding configuration.
>
> Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
> ---
>  .../selftests/drivers/net/bonding/Makefile    |   3 +-
>  .../drivers/net/bonding/bond_ipsec_offload.sh | 155 ++++++++++++++++++
>  .../selftests/drivers/net/bonding/config      |   4 +
>  3 files changed, 161 insertions(+), 1 deletion(-)
>  create mode 100755 tools/testing/selftests/drivers/net/bonding/bond_ipsec_offload.sh
>
> diff --git a/tools/testing/selftests/drivers/net/bonding/Makefile b/tools/testing/selftests/drivers/net/bonding/Makefile
> index 2b10854e4b1e..d5a7de16d33a 100644
> --- a/tools/testing/selftests/drivers/net/bonding/Makefile
> +++ b/tools/testing/selftests/drivers/net/bonding/Makefile
> @@ -10,7 +10,8 @@ TEST_PROGS := \
>  	mode-2-recovery-updelay.sh \
>  	bond_options.sh \
>  	bond-eth-type-change.sh \
> -	bond_macvlan_ipvlan.sh
> +	bond_macvlan_ipvlan.sh \
> +	bond_ipsec_offload.sh
>  
>  TEST_FILES := \
>  	lag_lib.sh \
> diff --git a/tools/testing/selftests/drivers/net/bonding/bond_ipsec_offload.sh b/tools/testing/selftests/drivers/net/bonding/bond_ipsec_offload.sh
> new file mode 100755
> index 000000000000..169866b47a67
> --- /dev/null
> +++ b/tools/testing/selftests/drivers/net/bonding/bond_ipsec_offload.sh
> @@ -0,0 +1,155 @@
> +#!/bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +
> +# IPsec over bonding offload test:
> +#
> +#  +----------------+
> +#  |     bond0      |
> +#  |       |        |
> +#  |  eth0    eth1  |
> +#  +---+-------+----+
> +#
> +# We use netdevsim instead of physical interfaces
> +#-------------------------------------------------------------------
> +# Example commands
> +#   ip x s add proto esp src 192.0.2.1 dst 192.0.2.2 \
> +#            spi 0x07 mode transport reqid 0x07 replay-window 32 \
> +#            aead 'rfc4106(gcm(aes))' 1234567890123456dcba 128 \
> +#            sel src 192.0.2.1/24 dst 192.0.2.2/24
> +#            offload dev bond0 dir out
> +#   ip x p add dir out src 192.0.2.1/24 dst 192.0.2.2/24 \
> +#            tmpl proto esp src 192.0.2.1 dst 192.0.2.2 \
> +#            spi 0x07 mode transport reqid 0x07
> +#
> +#-------------------------------------------------------------------
> +
> +lib_dir=$(dirname "$0")
> +source "$lib_dir"/../../../net/lib.sh
> +algo="aead rfc4106(gcm(aes)) 0x3132333435363738393031323334353664636261 128"
> +srcip=192.0.2.1
> +dstip=192.0.2.2
> +ipsec0=/sys/kernel/debug/netdevsim/netdevsim0/ports/0/ipsec
> +ipsec1=/sys/kernel/debug/netdevsim/netdevsim0/ports/1/ipsec
> +ret=0
> +
> +cleanup()
> +{
> +	modprobe -r netdevsim
> +	cleanup_ns $ns
> +}
> +
> +active_slave_changed()
> +{
> +        local old_active_slave=$1
> +        local new_active_slave=$(ip -n ${ns} -d -j link show bond0 | \
> +				 jq -r ".[].linkinfo.info_data.active_slave")
> +        [ "$new_active_slave" != "$old_active_slave" -a "$new_active_slave" != "null" ]
> +}
> +
> +test_offload()
> +{
> +	# use ping to exercise the Tx path
> +	ip netns exec $ns ping -I bond0 -c 3 -W 1 -i 0 $dstip >/dev/null
> +
> +	active_slave=$(ip -n ${ns} -d -j link show bond0 | \
> +		       jq -r ".[].linkinfo.info_data.active_slave")
> +
> +	if [ $active_slave = $nic0 ]; then
> +		sysfs=$ipsec0
> +	elif [ $active_slave = $nic1 ]; then
> +		sysfs=$ipsec1
> +	else
> +		echo "FAIL: bond_ipsec_offload invalid active_slave $active_slave"
> +		ret=1
> +	fi
> +
> +	# The tx/rx order in sysfs may changed after failover
> +	if grep -q "SA count=2 tx=3" $sysfs && grep -q "tx ipaddr=$dstip" $sysfs; then
> +		echo "PASS: bond_ipsec_offload has correct tx count with link ${active_slave}"
> +	else
> +		echo "FAIL: bond_ipsec_offload incorrect tx count with link ${active_slave}"
> +		ret=1
> +	fi

lib.sh got all sorts of logging and checking helpers that were
previously in forwarding/, I think it makes sense to use them. Would the
following make sense to you?

test_offload()
{
	# use ping to exercise the Tx path
	ip netns exec $ns ping -I bond0 -c 3 -W 1 -i 0 $dstip >/dev/null

	active_slave=$(ip -n ${ns} -d -j link show bond0 | \
		       jq -r ".[].linkinfo.info_data.active_slave")

	RET=0

	if [ $active_slave = $nic0 ]; then
		sysfs=$ipsec0
	elif [ $active_slave = $nic1 ]; then
		sysfs=$ipsec1
	else
		check_err 1 "bond_ipsec_offload invalid active_slave $active_slave"
	fi

	# The tx/rx order in sysfs may changed after failover
	grep -q "SA count=2 tx=3" $sysfs && grep -q "tx ipaddr=$dstip" $sysfs
 	check_err $? "incorrect tx count with link ${active_slave}"

	log_test bond_ipsec_offload
}

... etc. below.

> +}
> +
> +if ! mount | grep -q debugfs; then
> +	mount -t debugfs none /sys/kernel/debug/ &> /dev/null

Clean this up at exit?

	defer umount /sys/kernel/debug/

(But then the cleanup trap needs to be registered sooner, and cleanup()
needs to invoke defer_scopes_cleanup.)

> +fi
> +
> +# setup netdevsim since dummy/veth dev doesn't have offload support
> +if [ ! -w /sys/bus/netdevsim/new_device ] ; then
> +	modprobe -q netdevsim
> +	if [ $? -ne 0 ]; then
> +		echo "SKIP: can't load netdevsim for ipsec offload"
> +		exit $ksft_skip
> +	fi

And here you can just schedule a cleanup, as above.

	defer modprobe -r netdevsim

> +fi
> +
> +trap cleanup EXIT
> +
> +setup_ns ns

defer cleanup_ns $ns

And at that point you can drop cleanup altogether, and just have:

trap defer_scopes_cleanup EXIT

> +ip -n $ns link add bond0 type bond mode active-backup miimon 100
> +ip -n $ns addr add $srcip/24 dev bond0
> +ip -n $ns link set bond0 up
> +
> +ifaces=$(ip netns exec $ns bash -c '
> +	sysfsnet=/sys/bus/netdevsim/devices/netdevsim0/net/
> +	echo "0 2" > /sys/bus/netdevsim/new_device
> +	while [ ! -d $sysfsnet ] ; do :; done
> +	udevadm settle
> +	ls $sysfsnet
> +')
> +nic0=$(echo $ifaces | cut -f1 -d ' ')
> +nic1=$(echo $ifaces | cut -f2 -d ' ')
> +ip -n $ns link set $nic0 master bond0
> +ip -n $ns link set $nic1 master bond0
> +
> +# create offloaded SAs, both in and out
> +ip -n $ns x p add dir out src $srcip/24 dst $dstip/24 \
> +    tmpl proto esp src $srcip dst $dstip spi 9 \
> +    mode transport reqid 42
> +
> +ip -n $ns x p add dir in src $dstip/24 dst $srcip/24 \
> +    tmpl proto esp src $dstip dst $srcip spi 9 \
> +    mode transport reqid 42
> +
> +ip -n $ns x s add proto esp src $srcip dst $dstip spi 9 \
> +    mode transport reqid 42 $algo sel src $srcip/24 dst $dstip/24 \
> +    offload dev bond0 dir out
> +
> +ip -n $ns x s add proto esp src $dstip dst $srcip spi 9 \
> +    mode transport reqid 42 $algo sel src $dstip/24 dst $srcip/24 \
> +    offload dev bond0 dir in
> +
> +# does offload show up in ip output
> +lines=`ip -n $ns x s list | grep -c "crypto offload parameters: dev bond0 dir"`
> +if [ $lines -ne 2 ] ; then
> +	echo "FAIL: bond_ipsec_offload SA offload missing from list output"
> +	ret=1
> +fi
> +
> +# we didn't create a peer, make sure we can Tx by adding a permanent neighbour
> +# this need to be added after enslave
> +ip -n $ns neigh add $dstip dev bond0 lladdr 00:11:22:33:44:55
> +
> +# start Offload testing
> +test_offload
> +
> +# do failover
> +ip -n $ns link set $active_slave down
> +slowwait 5 active_slave_changed $active_slave
> +test_offload

Hm, active_slave being overriden in the function is a bit sneaky. But
shifting the assignment out of the function is not great, because then
it would just needs to be done twice. Ho hum. This might just be the
least annoying way to write it after all.

> +
> +# make sure offload get removed from driver
> +ip -n $ns x s flush
> +ip -n $ns x p flush
> +line0=$(grep -c "SA count=0" $ipsec0)
> +line1=$(grep -c "SA count=0" $ipsec1)
> +if [ $line0 -ne 1 -o $line1 -ne 1 ]  ; then
> +	echo "FAIL: bond_ipsec_offload SA not removed from driver"
> +	ret=1
> +else
> +	echo "PASS: bond_ipsec_offload SA removed from driver"
> +fi
> +
> +exit $ret

With log_test this would be. It merges results from individual tests to
get the right exit status.

exit $EXIT_STATUS

> diff --git a/tools/testing/selftests/drivers/net/bonding/config b/tools/testing/selftests/drivers/net/bonding/config
> index dad4e5fda4db..054fb772846f 100644
> --- a/tools/testing/selftests/drivers/net/bonding/config
> +++ b/tools/testing/selftests/drivers/net/bonding/config
> @@ -9,3 +9,7 @@ CONFIG_NET_CLS_FLOWER=y
>  CONFIG_NET_SCH_INGRESS=y
>  CONFIG_NLMON=y
>  CONFIG_VETH=y
> +CONFIG_INET_ESP=y
> +CONFIG_INET_ESP_OFFLOAD=y
> +CONFIG_XFRM_USER=m
> +CONFIG_NETDEVSIM=m


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCHv3 net 1/3] bonding: move IPsec deletion to bond_ipsec_free_sa
  2025-02-27 13:31         ` Nikolay Aleksandrov
@ 2025-02-28  2:20           ` Hangbin Liu
  2025-02-28 10:31             ` Cosmin Ratiu
  0 siblings, 1 reply; 16+ messages in thread
From: Hangbin Liu @ 2025-02-28  2:20 UTC (permalink / raw)
  To: Nikolay Aleksandrov
  Cc: netdev, Jay Vosburgh, Andrew Lunn, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman, Shuah Khan,
	Tariq Toukan, Jianbo Liu, Jarod Wilson, Steffen Klassert,
	Cosmin Ratiu, linux-kselftest, linux-kernel

On Thu, Feb 27, 2025 at 03:31:01PM +0200, Nikolay Aleksandrov wrote:
> >> One more thing - note I'm not an xfrm expert by far but it seems to me here you have
> >> to also call  xdo_dev_state_free() with the old active slave dev otherwise that will
> >> never get called with the original real_dev after the switch to a new
> >> active slave (or more accurately it might if the GC runs between the switching
> >> but it is a race), care must be taken wrt sequence of events because the XFRM
> > 
> > Can we just call xs->xso.real_dev->xfrmdev_ops->xdo_dev_state_free(xs)
> > no matter xs->xso.real_dev == real_dev or not? I'm afraid calling
> > xdo_dev_state_free() every where may make us lot more easily.
> > 
> 
> You'd have to check all drivers that implement the callback to answer that and even then
> I'd stick to the canonical way of how it's done in xfrm and make the bond just passthrough.
> Any other games become dangerous and new code will have to be carefully reviewed every
> time, calling another device's free_sa when it wasn't added before doesn't sound good.
> 
> >> GC may be running in parallel which probably means that in bond_ipsec_free_sa()
> >> you'll have to take the mutex before calling xdo_dev_state_free() and check
> >> if the entry is still linked in the bond's ipsec list before calling the free_sa
> >> callback, if it isn't then del_sa_all got to it before the GC and there's nothing
> >> to do if it also called the dev's free_sa callback. The check for real_dev doesn't
> >> seem enough to protect against this race.
> > 
> > I agree that we need to take the mutex before calling xdo_dev_state_free()
> > in bond_ipsec_free_sa(). Do you think if this is enough? I'm a bit lot here.
> > 
> > Thanks
> > Hangbin
> 
> Well, the race is between the xfrm GC and del_sa_all, in bond's free_sa if you
> walk the list under the mutex before calling real_dev's free callback and
> don't find the current element that's being freed in free_sa then it was
> cleaned up by del_sa_all, otherwise del_sa_all is waiting to walk that
> list and clean the entries. I think it should be fine as long as free_sa
> was called once with the proper device.

OK, so the free will be called either in del_sa_all() or free_sa().
Something like this?

 static void bond_ipsec_del_sa_all(struct bonding *bond)
@@ -620,6 +614,16 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)
 		if (!ipsec->xs->xso.real_dev)
 			continue;
 
+		if (ipsec->xs->km.state == XFRM_STATE_DEAD) {
+			/* already dead no need to delete again */
+			if (real_dev->xfrmdev_ops->xdo_dev_state_free)
+				real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs);
+			list_del(&ipsec->list);
+			kfree(ipsec);
+			continue;
+		}
+
 		if (!real_dev->xfrmdev_ops ||
 		    !real_dev->xfrmdev_ops->xdo_dev_state_delete ||
 		    netif_is_bond_master(real_dev)) {
 
@@ -659,11 +664,22 @@ static void bond_ipsec_free_sa(struct xfrm_state *xs)
 	if (!xs->xso.real_dev)
 		goto out;
 
-	WARN_ON(xs->xso.real_dev != real_dev);
+	mutex_lock(&bond->ipsec_lock);
+	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+		if (ipsec->xs == xs) {
+			if (real_dev && xs->xso.real_dev == real_dev &&

                           ^^ looks we don't need this xs->xso.real_dev == real_dev
			   checking if there is no race, do we? Or just keep
			   the WARN_ON() in case of any race.

+			    real_dev->xfrmdev_ops &&
+			    real_dev->xfrmdev_ops->xdo_dev_state_free)
+				real_dev->xfrmdev_ops->xdo_dev_state_free(xs);
+			list_del(&ipsec->list);
+			kfree(ipsec);
+			break;
+		}
+	}
+	mutex_unlock(&bond->ipsec_lock);
 
-	if (real_dev && real_dev->xfrmdev_ops &&
-	    real_dev->xfrmdev_ops->xdo_dev_state_free)
-		real_dev->xfrmdev_ops->xdo_dev_state_free(xs);
 out:
 	netdev_put(real_dev, &tracker);
 }
-- 
2.39.5 (Apple Git-154)


Thanks
Hangbin

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCHv3 net 1/3] bonding: move IPsec deletion to bond_ipsec_free_sa
  2025-02-28  2:20           ` Hangbin Liu
@ 2025-02-28 10:31             ` Cosmin Ratiu
  2025-02-28 11:07               ` Nikolay Aleksandrov
                                 ` (2 more replies)
  0 siblings, 3 replies; 16+ messages in thread
From: Cosmin Ratiu @ 2025-02-28 10:31 UTC (permalink / raw)
  To: razor@blackwall.org, liuhangbin@gmail.com
  Cc: andrew+netdev@lunn.ch, jarod@redhat.com, davem@davemloft.net,
	Tariq Toukan, linux-kselftest@vger.kernel.org, shuah@kernel.org,
	steffen.klassert@secunet.com, jv@jvosburgh.net, kuba@kernel.org,
	horms@kernel.org, edumazet@google.com, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, Jianbo Liu, pabeni@redhat.com

On Fri, 2025-02-28 at 02:20 +0000, Hangbin Liu wrote:
> On Thu, Feb 27, 2025 at 03:31:01PM +0200, Nikolay Aleksandrov wrote:
> > > > One more thing - note I'm not an xfrm expert by far but it
> > > > seems to me here you have
> > > > to also call  xdo_dev_state_free() with the old active slave
> > > > dev otherwise that will
> > > > never get called with the original real_dev after the switch to
> > > > a new
> > > > active slave (or more accurately it might if the GC runs
> > > > between the switching
> > > > but it is a race), care must be taken wrt sequence of events
> > > > because the XFRM
> > > 
> > > Can we just call xs->xso.real_dev->xfrmdev_ops-
> > > >xdo_dev_state_free(xs)
> > > no matter xs->xso.real_dev == real_dev or not? I'm afraid calling
> > > xdo_dev_state_free() every where may make us lot more easily.
> > > 
> > 
> > You'd have to check all drivers that implement the callback to
> > answer that and even then
> > I'd stick to the canonical way of how it's done in xfrm and make
> > the bond just passthrough.
> > Any other games become dangerous and new code will have to be
> > carefully reviewed every
> > time, calling another device's free_sa when it wasn't added before
> > doesn't sound good.
> > 
> > > > GC may be running in parallel which probably means that in
> > > > bond_ipsec_free_sa()
> > > > you'll have to take the mutex before calling
> > > > xdo_dev_state_free() and check
> > > > if the entry is still linked in the bond's ipsec list before
> > > > calling the free_sa
> > > > callback, if it isn't then del_sa_all got to it before the GC
> > > > and there's nothing
> > > > to do if it also called the dev's free_sa callback. The check
> > > > for real_dev doesn't
> > > > seem enough to protect against this race.
> > > 
> > > I agree that we need to take the mutex before calling
> > > xdo_dev_state_free()
> > > in bond_ipsec_free_sa(). Do you think if this is enough? I'm a
> > > bit lot here.
> > > 
> > > Thanks
> > > Hangbin
> > 
> > Well, the race is between the xfrm GC and del_sa_all, in bond's
> > free_sa if you
> > walk the list under the mutex before calling real_dev's free
> > callback and
> > don't find the current element that's being freed in free_sa then
> > it was
> > cleaned up by del_sa_all, otherwise del_sa_all is waiting to walk
> > that
> > list and clean the entries. I think it should be fine as long as
> > free_sa
> > was called once with the proper device.
> 
> OK, so the free will be called either in del_sa_all() or free_sa().
> Something like this?
> 
[...]

Unfortunately, after applying these changes and reasoning about them
for a bit, I don't think this will work. There are still races left.
For example:
1. An xs is marked DEAD (in __xfrm_state_delete, with x->lock held) and
before .xdo_dev_state_delete() is called on it, bond_ipsec_del_sa_all
is called in parallel, doesn't call delete on xs (because it's dead),
then calls free (incorrect without delete first), then removes the list
entry. Later, xdo_dev_state_delete( == bond_ipsec_del_sa) is called,
and calls delete (incorrect, out of order with free). Finally,
bond_ipsec_free_sa is called, which fortunately doesn't do anything
silly in the new proposed form because xs is no longer in the list.

2. A more sinister form of the above race can happen when 
bond_ipsec_del_sa_all() calls delete on real_dev, then in parallel and
immediately after __xfrm_state_delete marks xs as DEAD and calls
bond_ipsec_del_sa() which happily calls delete on real_dev again.

In order to fix these races (and others like it), I think
bond_ipsec_del_sa_all and bond_ipsec_add_sa_all *need* to acquire x-
>lock for each xs being processed. This would prevent xfrm from
concurrently initiating add/delete operations on the managed states.

Cosmin.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCHv3 net 1/3] bonding: move IPsec deletion to bond_ipsec_free_sa
  2025-02-28 10:31             ` Cosmin Ratiu
@ 2025-02-28 11:07               ` Nikolay Aleksandrov
  2025-02-28 11:10                 ` Nikolay Aleksandrov
  2025-02-28 12:59               ` Hangbin Liu
  2025-03-04  9:18               ` Hangbin Liu
  2 siblings, 1 reply; 16+ messages in thread
From: Nikolay Aleksandrov @ 2025-02-28 11:07 UTC (permalink / raw)
  To: Cosmin Ratiu, liuhangbin@gmail.com
  Cc: andrew+netdev@lunn.ch, jarod@redhat.com, davem@davemloft.net,
	Tariq Toukan, linux-kselftest@vger.kernel.org, shuah@kernel.org,
	steffen.klassert@secunet.com, jv@jvosburgh.net, kuba@kernel.org,
	horms@kernel.org, edumazet@google.com, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, Jianbo Liu, pabeni@redhat.com

On 2/28/25 12:31, Cosmin Ratiu wrote:
> On Fri, 2025-02-28 at 02:20 +0000, Hangbin Liu wrote:
>> On Thu, Feb 27, 2025 at 03:31:01PM +0200, Nikolay Aleksandrov wrote:
>>>>> One more thing - note I'm not an xfrm expert by far but it
>>>>> seems to me here you have
>>>>> to also call  xdo_dev_state_free() with the old active slave
>>>>> dev otherwise that will
>>>>> never get called with the original real_dev after the switch to
>>>>> a new
>>>>> active slave (or more accurately it might if the GC runs
>>>>> between the switching
>>>>> but it is a race), care must be taken wrt sequence of events
>>>>> because the XFRM
>>>>
>>>> Can we just call xs->xso.real_dev->xfrmdev_ops-
>>>>> xdo_dev_state_free(xs)
>>>> no matter xs->xso.real_dev == real_dev or not? I'm afraid calling
>>>> xdo_dev_state_free() every where may make us lot more easily.
>>>>
>>>
>>> You'd have to check all drivers that implement the callback to
>>> answer that and even then
>>> I'd stick to the canonical way of how it's done in xfrm and make
>>> the bond just passthrough.
>>> Any other games become dangerous and new code will have to be
>>> carefully reviewed every
>>> time, calling another device's free_sa when it wasn't added before
>>> doesn't sound good.
>>>
>>>>> GC may be running in parallel which probably means that in
>>>>> bond_ipsec_free_sa()
>>>>> you'll have to take the mutex before calling
>>>>> xdo_dev_state_free() and check
>>>>> if the entry is still linked in the bond's ipsec list before
>>>>> calling the free_sa
>>>>> callback, if it isn't then del_sa_all got to it before the GC
>>>>> and there's nothing
>>>>> to do if it also called the dev's free_sa callback. The check
>>>>> for real_dev doesn't
>>>>> seem enough to protect against this race.
>>>>
>>>> I agree that we need to take the mutex before calling
>>>> xdo_dev_state_free()
>>>> in bond_ipsec_free_sa(). Do you think if this is enough? I'm a
>>>> bit lot here.
>>>>
>>>> Thanks
>>>> Hangbin
>>>
>>> Well, the race is between the xfrm GC and del_sa_all, in bond's
>>> free_sa if you
>>> walk the list under the mutex before calling real_dev's free
>>> callback and
>>> don't find the current element that's being freed in free_sa then
>>> it was
>>> cleaned up by del_sa_all, otherwise del_sa_all is waiting to walk
>>> that
>>> list and clean the entries. I think it should be fine as long as
>>> free_sa
>>> was called once with the proper device.
>>
>> OK, so the free will be called either in del_sa_all() or free_sa().
>> Something like this?
>>
> [...]
> 
> Unfortunately, after applying these changes and reasoning about them
> for a bit, I don't think this will work. There are still races left.
> For example:
> 1. An xs is marked DEAD (in __xfrm_state_delete, with x->lock held) and
> before .xdo_dev_state_delete() is called on it, bond_ipsec_del_sa_all
> is called in parallel, doesn't call delete on xs (because it's dead),
> then calls free (incorrect without delete first), then removes the list
> entry. Later, xdo_dev_state_delete( == bond_ipsec_del_sa) is called,
> and calls delete (incorrect, out of order with free). Finally,
> bond_ipsec_free_sa is called, which fortunately doesn't do anything
> silly in the new proposed form because xs is no longer in the list.
> 
> 2. A more sinister form of the above race can happen when 
> bond_ipsec_del_sa_all() calls delete on real_dev, then in parallel and
> immediately after __xfrm_state_delete marks xs as DEAD and calls
> bond_ipsec_del_sa() which happily calls delete on real_dev again.
> 
> In order to fix these races (and others like it), I think
> bond_ipsec_del_sa_all and bond_ipsec_add_sa_all *need* to acquire x-
>> lock for each xs being processed. This would prevent xfrm from
> concurrently initiating add/delete operations on the managed states.
> 
> Cosmin.

Duh, right you are. The state is protected by x->lock and cannot be trusted
outside of it. If you take x->lock inside the list walk with the mutex held
you can deadlock.

Cheers,
 Nik



^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCHv3 net 1/3] bonding: move IPsec deletion to bond_ipsec_free_sa
  2025-02-28 11:07               ` Nikolay Aleksandrov
@ 2025-02-28 11:10                 ` Nikolay Aleksandrov
  0 siblings, 0 replies; 16+ messages in thread
From: Nikolay Aleksandrov @ 2025-02-28 11:10 UTC (permalink / raw)
  To: Cosmin Ratiu, liuhangbin@gmail.com
  Cc: andrew+netdev@lunn.ch, jarod@redhat.com, davem@davemloft.net,
	Tariq Toukan, linux-kselftest@vger.kernel.org, shuah@kernel.org,
	steffen.klassert@secunet.com, jv@jvosburgh.net, kuba@kernel.org,
	horms@kernel.org, edumazet@google.com, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, Jianbo Liu, pabeni@redhat.com

On 2/28/25 13:07, Nikolay Aleksandrov wrote:
> On 2/28/25 12:31, Cosmin Ratiu wrote:
>> On Fri, 2025-02-28 at 02:20 +0000, Hangbin Liu wrote:
>>> On Thu, Feb 27, 2025 at 03:31:01PM +0200, Nikolay Aleksandrov wrote:
>>>>>> One more thing - note I'm not an xfrm expert by far but it
>>>>>> seems to me here you have
>>>>>> to also call  xdo_dev_state_free() with the old active slave
>>>>>> dev otherwise that will
>>>>>> never get called with the original real_dev after the switch to
>>>>>> a new
>>>>>> active slave (or more accurately it might if the GC runs
>>>>>> between the switching
>>>>>> but it is a race), care must be taken wrt sequence of events
>>>>>> because the XFRM
>>>>>
>>>>> Can we just call xs->xso.real_dev->xfrmdev_ops-
>>>>>> xdo_dev_state_free(xs)
>>>>> no matter xs->xso.real_dev == real_dev or not? I'm afraid calling
>>>>> xdo_dev_state_free() every where may make us lot more easily.
>>>>>
>>>>
>>>> You'd have to check all drivers that implement the callback to
>>>> answer that and even then
>>>> I'd stick to the canonical way of how it's done in xfrm and make
>>>> the bond just passthrough.
>>>> Any other games become dangerous and new code will have to be
>>>> carefully reviewed every
>>>> time, calling another device's free_sa when it wasn't added before
>>>> doesn't sound good.
>>>>
>>>>>> GC may be running in parallel which probably means that in
>>>>>> bond_ipsec_free_sa()
>>>>>> you'll have to take the mutex before calling
>>>>>> xdo_dev_state_free() and check
>>>>>> if the entry is still linked in the bond's ipsec list before
>>>>>> calling the free_sa
>>>>>> callback, if it isn't then del_sa_all got to it before the GC
>>>>>> and there's nothing
>>>>>> to do if it also called the dev's free_sa callback. The check
>>>>>> for real_dev doesn't
>>>>>> seem enough to protect against this race.
>>>>>
>>>>> I agree that we need to take the mutex before calling
>>>>> xdo_dev_state_free()
>>>>> in bond_ipsec_free_sa(). Do you think if this is enough? I'm a
>>>>> bit lot here.
>>>>>
>>>>> Thanks
>>>>> Hangbin
>>>>
>>>> Well, the race is between the xfrm GC and del_sa_all, in bond's
>>>> free_sa if you
>>>> walk the list under the mutex before calling real_dev's free
>>>> callback and
>>>> don't find the current element that's being freed in free_sa then
>>>> it was
>>>> cleaned up by del_sa_all, otherwise del_sa_all is waiting to walk
>>>> that
>>>> list and clean the entries. I think it should be fine as long as
>>>> free_sa
>>>> was called once with the proper device.
>>>
>>> OK, so the free will be called either in del_sa_all() or free_sa().
>>> Something like this?
>>>
>> [...]
>>
>> Unfortunately, after applying these changes and reasoning about them
>> for a bit, I don't think this will work. There are still races left.
>> For example:
>> 1. An xs is marked DEAD (in __xfrm_state_delete, with x->lock held) and
>> before .xdo_dev_state_delete() is called on it, bond_ipsec_del_sa_all
>> is called in parallel, doesn't call delete on xs (because it's dead),
>> then calls free (incorrect without delete first), then removes the list
>> entry. Later, xdo_dev_state_delete( == bond_ipsec_del_sa) is called,
>> and calls delete (incorrect, out of order with free). Finally,
>> bond_ipsec_free_sa is called, which fortunately doesn't do anything
>> silly in the new proposed form because xs is no longer in the list.
>>
>> 2. A more sinister form of the above race can happen when 
>> bond_ipsec_del_sa_all() calls delete on real_dev, then in parallel and
>> immediately after __xfrm_state_delete marks xs as DEAD and calls
>> bond_ipsec_del_sa() which happily calls delete on real_dev again.
>>
>> In order to fix these races (and others like it), I think
>> bond_ipsec_del_sa_all and bond_ipsec_add_sa_all *need* to acquire x-
>>> lock for each xs being processed. This would prevent xfrm from
>> concurrently initiating add/delete operations on the managed states.
>>
>> Cosmin.
> 
> Duh, right you are. The state is protected by x->lock and cannot be trusted
> outside of it. If you take x->lock inside the list walk with the mutex held
> you can deadlock.
> 
> Cheers,
>  Nik
> 

Correction - actually took a closer look at the xfrm code and it should be fine.
The x->lock is taken only in the delete path and if the mutex is not acquired by
bond's del_sa callback it should be ok. Though this must be very well documented.



^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCHv3 net 1/3] bonding: move IPsec deletion to bond_ipsec_free_sa
  2025-02-28 10:31             ` Cosmin Ratiu
  2025-02-28 11:07               ` Nikolay Aleksandrov
@ 2025-02-28 12:59               ` Hangbin Liu
  2025-03-04  9:18               ` Hangbin Liu
  2 siblings, 0 replies; 16+ messages in thread
From: Hangbin Liu @ 2025-02-28 12:59 UTC (permalink / raw)
  To: Cosmin Ratiu
  Cc: razor@blackwall.org, andrew+netdev@lunn.ch, jarod@redhat.com,
	davem@davemloft.net, Tariq Toukan,
	linux-kselftest@vger.kernel.org, shuah@kernel.org,
	steffen.klassert@secunet.com, jv@jvosburgh.net, kuba@kernel.org,
	horms@kernel.org, edumazet@google.com, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, Jianbo Liu, pabeni@redhat.com

Hi Cosmin,
On Fri, Feb 28, 2025 at 10:31:58AM +0000, Cosmin Ratiu wrote:
> On Fri, 2025-02-28 at 02:20 +0000, Hangbin Liu wrote:
> > On Thu, Feb 27, 2025 at 03:31:01PM +0200, Nikolay Aleksandrov wrote:
> > > > > One more thing - note I'm not an xfrm expert by far but it
> > > > > seems to me here you have
> > > > > to also call  xdo_dev_state_free() with the old active slave
> > > > > dev otherwise that will
> > > > > never get called with the original real_dev after the switch to
> > > > > a new
> > > > > active slave (or more accurately it might if the GC runs
> > > > > between the switching
> > > > > but it is a race), care must be taken wrt sequence of events
> > > > > because the XFRM
> > > > 
> > > > Can we just call xs->xso.real_dev->xfrmdev_ops-
> > > > >xdo_dev_state_free(xs)
> > > > no matter xs->xso.real_dev == real_dev or not? I'm afraid calling
> > > > xdo_dev_state_free() every where may make us lot more easily.
> > > > 
> > > 
> > > You'd have to check all drivers that implement the callback to
> > > answer that and even then
> > > I'd stick to the canonical way of how it's done in xfrm and make
> > > the bond just passthrough.
> > > Any other games become dangerous and new code will have to be
> > > carefully reviewed every
> > > time, calling another device's free_sa when it wasn't added before
> > > doesn't sound good.
> > > 
> > > > > GC may be running in parallel which probably means that in
> > > > > bond_ipsec_free_sa()
> > > > > you'll have to take the mutex before calling
> > > > > xdo_dev_state_free() and check
> > > > > if the entry is still linked in the bond's ipsec list before
> > > > > calling the free_sa
> > > > > callback, if it isn't then del_sa_all got to it before the GC
> > > > > and there's nothing
> > > > > to do if it also called the dev's free_sa callback. The check
> > > > > for real_dev doesn't
> > > > > seem enough to protect against this race.
> > > > 
> > > > I agree that we need to take the mutex before calling
> > > > xdo_dev_state_free()
> > > > in bond_ipsec_free_sa(). Do you think if this is enough? I'm a
> > > > bit lot here.
> > > > 
> > > > Thanks
> > > > Hangbin
> > > 
> > > Well, the race is between the xfrm GC and del_sa_all, in bond's
> > > free_sa if you
> > > walk the list under the mutex before calling real_dev's free
> > > callback and
> > > don't find the current element that's being freed in free_sa then
> > > it was
> > > cleaned up by del_sa_all, otherwise del_sa_all is waiting to walk
> > > that
> > > list and clean the entries. I think it should be fine as long as
> > > free_sa
> > > was called once with the proper device.
> > 
> > OK, so the free will be called either in del_sa_all() or free_sa().
> > Something like this?
> > 
> [...]
> 
> Unfortunately, after applying these changes and reasoning about them
> for a bit, I don't think this will work. There are still races left.
> For example:
> 1. An xs is marked DEAD (in __xfrm_state_delete, with x->lock held) and
> before .xdo_dev_state_delete() is called on it, bond_ipsec_del_sa_all
> is called in parallel, doesn't call delete on xs (because it's dead),
> then calls free (incorrect without delete first), then removes the list
> entry. Later, xdo_dev_state_delete( == bond_ipsec_del_sa) is called,
> and calls delete (incorrect, out of order with free). Finally,
> bond_ipsec_free_sa is called, which fortunately doesn't do anything
> silly in the new proposed form because xs is no longer in the list.
> 
> 2. A more sinister form of the above race can happen when 
> bond_ipsec_del_sa_all() calls delete on real_dev, then in parallel and
> immediately after __xfrm_state_delete marks xs as DEAD and calls
> bond_ipsec_del_sa() which happily calls delete on real_dev again.
> 
> In order to fix these races (and others like it), I think
> bond_ipsec_del_sa_all and bond_ipsec_add_sa_all *need* to acquire x-
> >lock for each xs being processed. This would prevent xfrm from
> concurrently initiating add/delete operations on the managed states.
> 

Thanks a lot for the careful checking. I will add the x->lock
in del/add_sa_all.

Regards
Hangbin

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCHv3 net 1/3] bonding: move IPsec deletion to bond_ipsec_free_sa
  2025-02-28 10:31             ` Cosmin Ratiu
  2025-02-28 11:07               ` Nikolay Aleksandrov
  2025-02-28 12:59               ` Hangbin Liu
@ 2025-03-04  9:18               ` Hangbin Liu
  2025-03-04 10:25                 ` Cosmin Ratiu
  2 siblings, 1 reply; 16+ messages in thread
From: Hangbin Liu @ 2025-03-04  9:18 UTC (permalink / raw)
  To: Cosmin Ratiu
  Cc: razor@blackwall.org, andrew+netdev@lunn.ch, jarod@redhat.com,
	davem@davemloft.net, Tariq Toukan,
	linux-kselftest@vger.kernel.org, shuah@kernel.org,
	steffen.klassert@secunet.com, jv@jvosburgh.net, kuba@kernel.org,
	horms@kernel.org, edumazet@google.com, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, Jianbo Liu, pabeni@redhat.com

Hi Cosmin,
On Fri, Feb 28, 2025 at 10:31:58AM +0000, Cosmin Ratiu wrote:
> On Fri, 2025-02-28 at 02:20 +0000, Hangbin Liu wrote:
> > On Thu, Feb 27, 2025 at 03:31:01PM +0200, Nikolay Aleksandrov wrote:
> > > > > One more thing - note I'm not an xfrm expert by far but it
> > > > > seems to me here you have
> > > > > to also call  xdo_dev_state_free() with the old active slave
> > > > > dev otherwise that will
> > > > > never get called with the original real_dev after the switch to
> > > > > a new
> > > > > active slave (or more accurately it might if the GC runs
> > > > > between the switching
> > > > > but it is a race), care must be taken wrt sequence of events
> > > > > because the XFRM
> > > > 
> > > > Can we just call xs->xso.real_dev->xfrmdev_ops-
> > > > >xdo_dev_state_free(xs)
> > > > no matter xs->xso.real_dev == real_dev or not? I'm afraid calling
> > > > xdo_dev_state_free() every where may make us lot more easily.
> > > > 
> > > 
> > > You'd have to check all drivers that implement the callback to
> > > answer that and even then
> > > I'd stick to the canonical way of how it's done in xfrm and make
> > > the bond just passthrough.
> > > Any other games become dangerous and new code will have to be
> > > carefully reviewed every
> > > time, calling another device's free_sa when it wasn't added before
> > > doesn't sound good.
> > > 
> > > > > GC may be running in parallel which probably means that in
> > > > > bond_ipsec_free_sa()
> > > > > you'll have to take the mutex before calling
> > > > > xdo_dev_state_free() and check
> > > > > if the entry is still linked in the bond's ipsec list before
> > > > > calling the free_sa
> > > > > callback, if it isn't then del_sa_all got to it before the GC
> > > > > and there's nothing
> > > > > to do if it also called the dev's free_sa callback. The check
> > > > > for real_dev doesn't
> > > > > seem enough to protect against this race.
> > > > 
> > > > I agree that we need to take the mutex before calling
> > > > xdo_dev_state_free()
> > > > in bond_ipsec_free_sa(). Do you think if this is enough? I'm a
> > > > bit lot here.
> > > > 
> > > > Thanks
> > > > Hangbin
> > > 
> > > Well, the race is between the xfrm GC and del_sa_all, in bond's
> > > free_sa if you
> > > walk the list under the mutex before calling real_dev's free
> > > callback and
> > > don't find the current element that's being freed in free_sa then
> > > it was
> > > cleaned up by del_sa_all, otherwise del_sa_all is waiting to walk
> > > that
> > > list and clean the entries. I think it should be fine as long as
> > > free_sa
> > > was called once with the proper device.
> > 
> > OK, so the free will be called either in del_sa_all() or free_sa().
> > Something like this?
> > 
> [...]
> 
> Unfortunately, after applying these changes and reasoning about them
> for a bit, I don't think this will work. There are still races left.
> For example:
> 1. An xs is marked DEAD (in __xfrm_state_delete, with x->lock held) and
> before .xdo_dev_state_delete() is called on it, bond_ipsec_del_sa_all
> is called in parallel, doesn't call delete on xs (because it's dead),
> then calls free (incorrect without delete first), then removes the list
> entry. Later, xdo_dev_state_delete( == bond_ipsec_del_sa) is called,
> and calls delete (incorrect, out of order with free). Finally,
> bond_ipsec_free_sa is called, which fortunately doesn't do anything
> silly in the new proposed form because xs is no longer in the list.
> 
> 2. A more sinister form of the above race can happen when 
> bond_ipsec_del_sa_all() calls delete on real_dev, then in parallel and
> immediately after __xfrm_state_delete marks xs as DEAD and calls
> bond_ipsec_del_sa() which happily calls delete on real_dev again.
> 
> In order to fix these races (and others like it), I think
> bond_ipsec_del_sa_all and bond_ipsec_add_sa_all *need* to acquire x-
> >lock for each xs being processed. This would prevent xfrm from
> concurrently initiating add/delete operations on the managed states.
> 

Just to make sure I added the lock in correct place, would you please help
confirm.

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index e85878b12376..c59ad3a5cf43 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -537,19 +537,25 @@ static void bond_ipsec_add_sa_all(struct bonding *bond)
 	}
 
 	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+		spin_lock_bh(&ipsec->xs->lock);
 		/* Skip dead xfrm states, they'll be freed later. */
-		if (ipsec->xs->km.state == XFRM_STATE_DEAD)
+		if (ipsec->xs->km.state == XFRM_STATE_DEAD) {
+			spin_unlock_bh(&ipsec->xs->lock);
 			continue;
+		}
 
 		/* If new state is added before ipsec_lock acquired */
-		if (ipsec->xs->xso.real_dev == real_dev)
+		if (ipsec->xs->xso.real_dev == real_dev) {
+			spin_unlock_bh(&ipsec->xs->lock);
 			continue;
+		}
 
 		ipsec->xs->xso.real_dev = real_dev;
 		if (real_dev->xfrmdev_ops->xdo_dev_state_add(ipsec->xs, NULL)) {
 			slave_warn(bond_dev, real_dev, "%s: failed to add SA\n", __func__);
 			ipsec->xs->xso.real_dev = NULL;
 		}
+		spin_unlock_bh(&ipsec->xs->lock);
 	}
 out:
 	mutex_unlock(&bond->ipsec_lock);
@@ -614,6 +620,7 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)
 		if (!ipsec->xs->xso.real_dev)
 			continue;
 
+		spin_lock_bh(&ipsec->xs->lock);
 		if (ipsec->xs->km.state == XFRM_STATE_DEAD) {
 			/* already dead no need to delete again */
 			if (ipsec->xs->xso.real_dev == real_dev &&
@@ -621,6 +628,7 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)
 				real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs);
 			list_del(&ipsec->list);
 			kfree(ipsec);
+			spin_unlock_bh(&ipsec->xs->lock);
 			continue;
 		}
 
@@ -635,6 +643,7 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)
 			if (real_dev->xfrmdev_ops->xdo_dev_state_free)
 				real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs);
 		}
+		spin_unlock_bh(&ipsec->xs->lock);
 	}
 	mutex_unlock(&bond->ipsec_lock);
 }

Thanks
Hangbin

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCHv3 net 1/3] bonding: move IPsec deletion to bond_ipsec_free_sa
  2025-03-04  9:18               ` Hangbin Liu
@ 2025-03-04 10:25                 ` Cosmin Ratiu
  0 siblings, 0 replies; 16+ messages in thread
From: Cosmin Ratiu @ 2025-03-04 10:25 UTC (permalink / raw)
  To: liuhangbin@gmail.com
  Cc: shuah@kernel.org, andrew+netdev@lunn.ch, davem@davemloft.net,
	jv@jvosburgh.net, jarod@redhat.com, razor@blackwall.org,
	linux-kernel@vger.kernel.org, edumazet@google.com, Jianbo Liu,
	pabeni@redhat.com, horms@kernel.org, kuba@kernel.org,
	Tariq Toukan, netdev@vger.kernel.org,
	steffen.klassert@secunet.com, linux-kselftest@vger.kernel.org

On Tue, 2025-03-04 at 09:18 +0000, Hangbin Liu wrote:
> 
> Just to make sure I added the lock in correct place, would you please
> help
> confirm.
> 
> diff --git a/drivers/net/bonding/bond_main.c
> b/drivers/net/bonding/bond_main.c
> index e85878b12376..c59ad3a5cf43 100644
> --- a/drivers/net/bonding/bond_main.c
> +++ b/drivers/net/bonding/bond_main.c
> @@ -537,19 +537,25 @@ static void bond_ipsec_add_sa_all(struct
> bonding *bond)
>  	}
>  
>  	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> +		spin_lock_bh(&ipsec->xs->lock);
>  		/* Skip dead xfrm states, they'll be freed later. */
> -		if (ipsec->xs->km.state == XFRM_STATE_DEAD)
> +		if (ipsec->xs->km.state == XFRM_STATE_DEAD) {
> +			spin_unlock_bh(&ipsec->xs->lock);

Instead of unlocking on every branch, I recommend adding a "next:" tag
before the unlock at the end of the loop and switching the "continue"
statements with "goto next".

>  			continue;
> +		}
>  
>  		/* If new state is added before ipsec_lock acquired
> */
> -		if (ipsec->xs->xso.real_dev == real_dev)
> +		if (ipsec->xs->xso.real_dev == real_dev) {
> +			spin_unlock_bh(&ipsec->xs->lock);
>  			continue;
> +		}
>  
>  		ipsec->xs->xso.real_dev = real_dev;
>  		if (real_dev->xfrmdev_ops->xdo_dev_state_add(ipsec-
> >xs, NULL)) {
>  			slave_warn(bond_dev, real_dev, "%s: failed
> to add SA\n", __func__);
>  			ipsec->xs->xso.real_dev = NULL;
>  		}

Add the "next:" tag here.

> +		spin_unlock_bh(&ipsec->xs->lock);
>  	}
>  out:
>  	mutex_unlock(&bond->ipsec_lock);
> @@ -614,6 +620,7 @@ static void bond_ipsec_del_sa_all(struct bonding
> *bond)
>  		if (!ipsec->xs->xso.real_dev)
>  			continue;

The above if should be in the critical section as well.

>  
> +		spin_lock_bh(&ipsec->xs->lock);
>  		if (ipsec->xs->km.state == XFRM_STATE_DEAD) {
>  			/* already dead no need to delete again */
>  			if (ipsec->xs->xso.real_dev == real_dev &&
> @@ -621,6 +628,7 @@ static void bond_ipsec_del_sa_all(struct bonding
> *bond)
>  				real_dev->xfrmdev_ops-
> >xdo_dev_state_free(ipsec->xs);
>  			list_del(&ipsec->list);
>  			kfree(ipsec);
> +			spin_unlock_bh(&ipsec->xs->lock);

And I recommend the same thing with "goto next" here, jumping at the
end of the loop, before the unlock.

>  			continue;
>  		}
>  
> @@ -635,6 +643,7 @@ static void bond_ipsec_del_sa_all(struct bonding
> *bond)
>  			if (real_dev->xfrmdev_ops-
> >xdo_dev_state_free)
>  				real_dev->xfrmdev_ops-
> >xdo_dev_state_free(ipsec->xs);
>  		}
> +		spin_unlock_bh(&ipsec->xs->lock);
>  	}
>  	mutex_unlock(&bond->ipsec_lock);
>  }
> 
> Thanks
> Hangbin


^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2025-03-04 10:25 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-02-27  8:37 [PATCHv3 net 0/3] bond: fix xfrm offload issues Hangbin Liu
2025-02-27  8:37 ` [PATCHv3 net 1/3] bonding: move IPsec deletion to bond_ipsec_free_sa Hangbin Liu
2025-02-27  8:50   ` Nikolay Aleksandrov
2025-02-27  9:21     ` Nikolay Aleksandrov
2025-02-27 13:21       ` Hangbin Liu
2025-02-27 13:31         ` Nikolay Aleksandrov
2025-02-28  2:20           ` Hangbin Liu
2025-02-28 10:31             ` Cosmin Ratiu
2025-02-28 11:07               ` Nikolay Aleksandrov
2025-02-28 11:10                 ` Nikolay Aleksandrov
2025-02-28 12:59               ` Hangbin Liu
2025-03-04  9:18               ` Hangbin Liu
2025-03-04 10:25                 ` Cosmin Ratiu
2025-02-27  8:37 ` [PATCHv3 net 2/3] bonding: fix xfrm offload feature setup on active-backup mode Hangbin Liu
2025-02-27  8:37 ` [PATCHv3 net 3/3] selftests: bonding: add ipsec offload test Hangbin Liu
2025-02-27 13:59   ` Petr Machata

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).