* [PATCHv2 net 1/3] bonding: move mutex lock to a work queue for XFRM GC tasks
2025-02-25 9:40 [PATCHv2 net 0/3] bond: fix xfrm offload issues Hangbin Liu
@ 2025-02-25 9:40 ` Hangbin Liu
2025-02-25 11:05 ` Nikolay Aleksandrov
2025-02-25 14:00 ` Cosmin Ratiu
2025-02-25 9:40 ` [PATCHv2 net 2/3] bonding: fix xfrm offload feature setup on active-backup mode Hangbin Liu
2025-02-25 9:40 ` [PATCHv2 net 3/3] selftests: bonding: add ipsec offload test Hangbin Liu
2 siblings, 2 replies; 13+ messages in thread
From: Hangbin Liu @ 2025-02-25 9:40 UTC (permalink / raw)
To: netdev
Cc: Jay Vosburgh, Andrew Lunn, David S. Miller, Eric Dumazet,
Jakub Kicinski, Paolo Abeni, Nikolay Aleksandrov, Simon Horman,
Shuah Khan, Tariq Toukan, Jianbo Liu, Jarod Wilson,
Steffen Klassert, Cosmin Ratiu, linux-kselftest, linux-kernel,
Hangbin Liu
The fixed commit placed mutex_lock() inside spin_lock_bh(), which triggers
a warning like:
BUG: sleeping function called from invalid context at...
Fix this by moving the mutex_lock() operation to a work queue.
Fixes: 2aeeef906d5a ("bonding: change ipsec_lock from spin lock to mutex")
Reported-by: Jakub Kicinski <kuba@kernel.org>
Closes: https://lore.kernel.org/netdev/20241212062734.182a0164@kernel.org
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
---
drivers/net/bonding/bond_main.c | 41 +++++++++++++++++++++++++--------
include/net/bonding.h | 6 +++++
2 files changed, 37 insertions(+), 10 deletions(-)
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index e45bba240cbc..cc7064aa4b35 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -551,6 +551,25 @@ static void bond_ipsec_add_sa_all(struct bonding *bond)
mutex_unlock(&bond->ipsec_lock);
}
+static void bond_xfrm_state_gc_work(struct work_struct *work)
+{
+ struct bond_xfrm_work *xfrm_work = container_of(work, struct bond_xfrm_work, work);
+ struct bonding *bond = xfrm_work->bond;
+ struct xfrm_state *xs = xfrm_work->xs;
+ struct bond_ipsec *ipsec;
+
+ mutex_lock(&bond->ipsec_lock);
+ list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+ if (ipsec->xs == xs) {
+ list_del(&ipsec->list);
+ kfree(ipsec);
+ xfrm_state_put(xs);
+ break;
+ }
+ }
+ mutex_unlock(&bond->ipsec_lock);
+}
+
/**
* bond_ipsec_del_sa - clear out this specific SA
* @xs: pointer to transformer state struct
@@ -558,9 +577,9 @@ static void bond_ipsec_add_sa_all(struct bonding *bond)
static void bond_ipsec_del_sa(struct xfrm_state *xs)
{
struct net_device *bond_dev = xs->xso.dev;
+ struct bond_xfrm_work *xfrm_work;
struct net_device *real_dev;
netdevice_tracker tracker;
- struct bond_ipsec *ipsec;
struct bonding *bond;
struct slave *slave;
@@ -592,15 +611,17 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs)
real_dev->xfrmdev_ops->xdo_dev_state_delete(xs);
out:
netdev_put(real_dev, &tracker);
- mutex_lock(&bond->ipsec_lock);
- list_for_each_entry(ipsec, &bond->ipsec_list, list) {
- if (ipsec->xs == xs) {
- list_del(&ipsec->list);
- kfree(ipsec);
- break;
- }
- }
- mutex_unlock(&bond->ipsec_lock);
+
+ xfrm_work = kmalloc(sizeof(*xfrm_work), GFP_ATOMIC);
+ if (!xfrm_work)
+ return;
+
+ INIT_WORK(&xfrm_work->work, bond_xfrm_state_gc_work);
+ xfrm_work->bond = bond;
+ xfrm_work->xs = xs;
+ xfrm_state_hold(xs);
+
+ queue_work(bond->wq, &xfrm_work->work);
}
static void bond_ipsec_del_sa_all(struct bonding *bond)
diff --git a/include/net/bonding.h b/include/net/bonding.h
index 8bb5f016969f..d54ba5e3affb 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -209,6 +209,12 @@ struct bond_ipsec {
struct xfrm_state *xs;
};
+struct bond_xfrm_work {
+ struct work_struct work;
+ struct bonding *bond;
+ struct xfrm_state *xs;
+};
+
/*
* Here are the locking policies for the two bonding locks:
* Get rcu_read_lock when reading or RTNL when writing slave list.
--
2.46.0
^ permalink raw reply related [flat|nested] 13+ messages in thread* Re: [PATCHv2 net 1/3] bonding: move mutex lock to a work queue for XFRM GC tasks
2025-02-25 9:40 ` [PATCHv2 net 1/3] bonding: move mutex lock to a work queue for XFRM GC tasks Hangbin Liu
@ 2025-02-25 11:05 ` Nikolay Aleksandrov
2025-02-25 13:13 ` Hangbin Liu
2025-02-25 14:00 ` Cosmin Ratiu
1 sibling, 1 reply; 13+ messages in thread
From: Nikolay Aleksandrov @ 2025-02-25 11:05 UTC (permalink / raw)
To: Hangbin Liu, netdev
Cc: Jay Vosburgh, Andrew Lunn, David S. Miller, Eric Dumazet,
Jakub Kicinski, Paolo Abeni, Simon Horman, Shuah Khan,
Tariq Toukan, Jianbo Liu, Jarod Wilson, Steffen Klassert,
Cosmin Ratiu, linux-kselftest, linux-kernel
On 2/25/25 11:40, Hangbin Liu wrote:
> The fixed commit placed mutex_lock() inside spin_lock_bh(), which triggers
> a warning like:
>
> BUG: sleeping function called from invalid context at...
>
> Fix this by moving the mutex_lock() operation to a work queue.
>
> Fixes: 2aeeef906d5a ("bonding: change ipsec_lock from spin lock to mutex")
> Reported-by: Jakub Kicinski <kuba@kernel.org>
> Closes: https://lore.kernel.org/netdev/20241212062734.182a0164@kernel.org
> Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
> ---
> drivers/net/bonding/bond_main.c | 41 +++++++++++++++++++++++++--------
> include/net/bonding.h | 6 +++++
> 2 files changed, 37 insertions(+), 10 deletions(-)
>
Hi,
I think there are a few issues with this solution, comments below.
> diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
> index e45bba240cbc..cc7064aa4b35 100644
> --- a/drivers/net/bonding/bond_main.c
> +++ b/drivers/net/bonding/bond_main.c
> @@ -551,6 +551,25 @@ static void bond_ipsec_add_sa_all(struct bonding *bond)
> mutex_unlock(&bond->ipsec_lock);
> }
>
> +static void bond_xfrm_state_gc_work(struct work_struct *work)
> +{
> + struct bond_xfrm_work *xfrm_work = container_of(work, struct bond_xfrm_work, work);
> + struct bonding *bond = xfrm_work->bond;
> + struct xfrm_state *xs = xfrm_work->xs;
> + struct bond_ipsec *ipsec;
> +
> + mutex_lock(&bond->ipsec_lock);
> + list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> + if (ipsec->xs == xs) {
> + list_del(&ipsec->list);
> + kfree(ipsec);
> + xfrm_state_put(xs);
> + break;
> + }
> + }
> + mutex_unlock(&bond->ipsec_lock);
> +}
> +
> /**
> * bond_ipsec_del_sa - clear out this specific SA
> * @xs: pointer to transformer state struct
> @@ -558,9 +577,9 @@ static void bond_ipsec_add_sa_all(struct bonding *bond)
> static void bond_ipsec_del_sa(struct xfrm_state *xs)
> {
> struct net_device *bond_dev = xs->xso.dev;
> + struct bond_xfrm_work *xfrm_work;
> struct net_device *real_dev;
> netdevice_tracker tracker;
> - struct bond_ipsec *ipsec;
> struct bonding *bond;
> struct slave *slave;
>
> @@ -592,15 +611,17 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs)
> real_dev->xfrmdev_ops->xdo_dev_state_delete(xs);
> out:
> netdev_put(real_dev, &tracker);
> - mutex_lock(&bond->ipsec_lock);
> - list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> - if (ipsec->xs == xs) {
> - list_del(&ipsec->list);
> - kfree(ipsec);
> - break;
> - }
> - }
> - mutex_unlock(&bond->ipsec_lock);
> +
> + xfrm_work = kmalloc(sizeof(*xfrm_work), GFP_ATOMIC);
> + if (!xfrm_work)
> + return;
> +
What happens if this allocation fails? I think you'll leak memory and
potentially call the xdo_dev callbacks for this xs again because it's
still in the list. Also this xfrm_work memory doesn't get freed anywhere, so
you're leaking it as well.
Perhaps you can do this allocation in add_sa, it seems you can sleep
there and potentially return an error if it fails, so this can never
fail later. You'll have to be careful with the freeing dance though.
Alternatively, make the work a part of struct bond so it doesn't need
memory management, but then you need a mechanism to queue these items (e.g.
a separate list with a spinlock) and would have more complexity with freeing
in parallel.
> + INIT_WORK(&xfrm_work->work, bond_xfrm_state_gc_work);
> + xfrm_work->bond = bond;
> + xfrm_work->xs = xs;
> + xfrm_state_hold(xs);
> +
> + queue_work(bond->wq, &xfrm_work->work);
Note that nothing waits for this work anywhere and .ndo_uninit runs before
bond's .priv_destructor which means ipsec_lock will be destroyed and will be
used afterwards when destroying bond->wq from the destructor if there were
any queued works.
[snip]
Cheers,
Nik
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [PATCHv2 net 1/3] bonding: move mutex lock to a work queue for XFRM GC tasks
2025-02-25 11:05 ` Nikolay Aleksandrov
@ 2025-02-25 13:13 ` Hangbin Liu
2025-02-25 13:30 ` Nikolay Aleksandrov
0 siblings, 1 reply; 13+ messages in thread
From: Hangbin Liu @ 2025-02-25 13:13 UTC (permalink / raw)
To: Nikolay Aleksandrov
Cc: netdev, Jay Vosburgh, Andrew Lunn, David S. Miller, Eric Dumazet,
Jakub Kicinski, Paolo Abeni, Simon Horman, Shuah Khan,
Tariq Toukan, Jianbo Liu, Jarod Wilson, Steffen Klassert,
Cosmin Ratiu, linux-kselftest, linux-kernel
On Tue, Feb 25, 2025 at 01:05:24PM +0200, Nikolay Aleksandrov wrote:
> > @@ -592,15 +611,17 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs)
> > real_dev->xfrmdev_ops->xdo_dev_state_delete(xs);
> > out:
> > netdev_put(real_dev, &tracker);
> > - mutex_lock(&bond->ipsec_lock);
> > - list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> > - if (ipsec->xs == xs) {
> > - list_del(&ipsec->list);
> > - kfree(ipsec);
> > - break;
> > - }
> > - }
> > - mutex_unlock(&bond->ipsec_lock);
> > +
> > + xfrm_work = kmalloc(sizeof(*xfrm_work), GFP_ATOMIC);
> > + if (!xfrm_work)
> > + return;
> > +
>
> What happens if this allocation fails? I think you'll leak memory and
> potentially call the xdo_dev callbacks for this xs again because it's
> still in the list. Also this xfrm_work memory doesn't get freed anywhere, so
> you're leaking it as well.
Yes, I thought about this too simplistically and forgot to free the memory.
>
> Perhaps you can do this allocation in add_sa, it seems you can sleep
> there and potentially return an error if it fails, so this can never
> fail later. You'll have to be careful with the freeing dance though.
Hmm, if we allocate this in add_sa, how do we get the xfrm_work
in del_sa? Adding the xfrm_work to another list means we would need to
sleep again to find it in del_sa.
> Alternatively, make the work a part of struct bond so it doesn't need
> memory management, but then you need a mechanism to queue these items (e.g.
> a separate list with a spinlock) and would have more complexity with freeing
> in parallel.
I used a delayed work queue in bond for my draft patch. As you said,
it needs another list to queue the xs. And during the gc work, we need
to use a spinlock again to get the xs out...
>
> > + INIT_WORK(&xfrm_work->work, bond_xfrm_state_gc_work);
> > + xfrm_work->bond = bond;
> > + xfrm_work->xs = xs;
> > + xfrm_state_hold(xs);
> > +
> > + queue_work(bond->wq, &xfrm_work->work);
>
> Note that nothing waits for this work anywhere and .ndo_uninit runs before
> bond's .priv_destructor which means ipsec_lock will be destroyed and will be
> used afterwards when destroying bond->wq from the destructor if there were
> any queued works.
Do you mean we need to register the work queue in bond_init and cancel
it in bond_work_cancel_all()?
Thanks
Hangbin
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [PATCHv2 net 1/3] bonding: move mutex lock to a work queue for XFRM GC tasks
2025-02-25 13:13 ` Hangbin Liu
@ 2025-02-25 13:30 ` Nikolay Aleksandrov
0 siblings, 0 replies; 13+ messages in thread
From: Nikolay Aleksandrov @ 2025-02-25 13:30 UTC (permalink / raw)
To: Hangbin Liu
Cc: netdev, Jay Vosburgh, Andrew Lunn, David S. Miller, Eric Dumazet,
Jakub Kicinski, Paolo Abeni, Simon Horman, Shuah Khan,
Tariq Toukan, Jianbo Liu, Jarod Wilson, Steffen Klassert,
Cosmin Ratiu, linux-kselftest, linux-kernel
On 2/25/25 15:13, Hangbin Liu wrote:
> On Tue, Feb 25, 2025 at 01:05:24PM +0200, Nikolay Aleksandrov wrote:
>>> @@ -592,15 +611,17 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs)
>>> real_dev->xfrmdev_ops->xdo_dev_state_delete(xs);
>>> out:
>>> netdev_put(real_dev, &tracker);
>>> - mutex_lock(&bond->ipsec_lock);
>>> - list_for_each_entry(ipsec, &bond->ipsec_list, list) {
>>> - if (ipsec->xs == xs) {
>>> - list_del(&ipsec->list);
>>> - kfree(ipsec);
>>> - break;
>>> - }
>>> - }
>>> - mutex_unlock(&bond->ipsec_lock);
>>> +
>>> + xfrm_work = kmalloc(sizeof(*xfrm_work), GFP_ATOMIC);
>>> + if (!xfrm_work)
>>> + return;
>>> +
>>
>> What happens if this allocation fails? I think you'll leak memory and
>> potentially call the xdo_dev callbacks for this xs again because it's
>> still in the list. Also this xfrm_work memory doesn't get freed anywhere, so
>> you're leaking it as well.
>
> Yes, I thought this too simply and forgot free the memory.
>>
>> Perhaps you can do this allocation in add_sa, it seems you can sleep
>> there and potentially return an error if it fails, so this can never
>> fail later. You'll have to be careful with the freeing dance though.
>
> Hmm, if we allocation this in add_sa, how to we get the xfrm_work
> in del_sa? Add the xfrm_work to another list will need to sleep again
> to find it out in del_sa.
>
Well, you have struct bond_ipsec and it is tied with the work's lifetime
so you can stick it there. :)
I haven't looked closely how feasible it is.
>> Alternatively, make the work a part of struct bond so it doesn't need
>> memory management, but then you need a mechanism to queue these items (e.g.
>> a separate list with a spinlock) and would have more complexity with freeing
>> in parallel.
>
> I used a dealy work queue in bond for my draft patch. As you said,
> it need another list to queue the xs. And during the gc works, we need
> to use spinlock again to get the xs out...
>
Correct, it's a different kind of mess. :)
>>
>>> + INIT_WORK(&xfrm_work->work, bond_xfrm_state_gc_work);
>>> + xfrm_work->bond = bond;
>>> + xfrm_work->xs = xs;
>>> + xfrm_state_hold(xs);
>>> +
>>> + queue_work(bond->wq, &xfrm_work->work);
>>
>> Note that nothing waits for this work anywhere and .ndo_uninit runs before
>> bond's .priv_destructor which means ipsec_lock will be destroyed and will be
>> used afterwards when destroying bond->wq from the destructor if there were
>> any queued works.
>
> Do you mean we need to register the work queue in bond_init and cancel
> it in bond_work_cancel_all()?
>
> Thanks
> Hangbin
That is one way, the other is if you have access to the work queue items then
you can cancel them which should be easier (i.e. cancel_delayed_work_sync).
Regardless of which way you choose to solve this (gc or work in bond_ipsec), there will
be some dance to be done for the sequence of events that will not be straight-forward.
Cheers,
Nik
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCHv2 net 1/3] bonding: move mutex lock to a work queue for XFRM GC tasks
2025-02-25 9:40 ` [PATCHv2 net 1/3] bonding: move mutex lock to a work queue for XFRM GC tasks Hangbin Liu
2025-02-25 11:05 ` Nikolay Aleksandrov
@ 2025-02-25 14:00 ` Cosmin Ratiu
2025-02-25 14:27 ` Nikolay Aleksandrov
2025-02-26 9:48 ` Hangbin Liu
1 sibling, 2 replies; 13+ messages in thread
From: Cosmin Ratiu @ 2025-02-25 14:00 UTC (permalink / raw)
To: netdev@vger.kernel.org, liuhangbin@gmail.com
Cc: andrew+netdev@lunn.ch, jarod@redhat.com, razor@blackwall.org,
davem@davemloft.net, Tariq Toukan, linux-kernel@vger.kernel.org,
shuah@kernel.org, steffen.klassert@secunet.com, jv@jvosburgh.net,
pabeni@redhat.com, horms@kernel.org, edumazet@google.com,
kuba@kernel.org, linux-kselftest@vger.kernel.org, Jianbo Liu
On Tue, 2025-02-25 at 09:40 +0000, Hangbin Liu wrote:
> The fixed commit placed mutex_lock() inside spin_lock_bh(), which
> triggers
> a warning like:
>
> BUG: sleeping function called from invalid context at...
>
> Fix this by moving the mutex_lock() operation to a work queue.
>
> Fixes: 2aeeef906d5a ("bonding: change ipsec_lock from spin lock to
> mutex")
> Reported-by: Jakub Kicinski <kuba@kernel.org>
> Closes:
> https://lore.kernel.org/netdev/20241212062734.182a0164@kernel.org
> Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
> ---
> drivers/net/bonding/bond_main.c | 41 +++++++++++++++++++++++++------
> --
> include/net/bonding.h | 6 +++++
> 2 files changed, 37 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/net/bonding/bond_main.c
> b/drivers/net/bonding/bond_main.c
> index e45bba240cbc..cc7064aa4b35 100644
> --- a/drivers/net/bonding/bond_main.c
> +++ b/drivers/net/bonding/bond_main.c
> @@ -551,6 +551,25 @@ static void bond_ipsec_add_sa_all(struct bonding
> *bond)
> mutex_unlock(&bond->ipsec_lock);
> }
>
> +static void bond_xfrm_state_gc_work(struct work_struct *work)
> +{
> + struct bond_xfrm_work *xfrm_work = container_of(work, struct
> bond_xfrm_work, work);
> + struct bonding *bond = xfrm_work->bond;
> + struct xfrm_state *xs = xfrm_work->xs;
> + struct bond_ipsec *ipsec;
> +
> + mutex_lock(&bond->ipsec_lock);
> + list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> + if (ipsec->xs == xs) {
> + list_del(&ipsec->list);
> + kfree(ipsec);
> + xfrm_state_put(xs);
I would expect xfrm_state_put to be called from outside the loop,
regardless of whether an entry is found in the list or not, because it
was unconditionally referenced when the work was created.
> + break;
> + }
> + }
> + mutex_unlock(&bond->ipsec_lock);
> +}
> +
> /**
> * bond_ipsec_del_sa - clear out this specific SA
> * @xs: pointer to transformer state struct
> @@ -558,9 +577,9 @@ static void bond_ipsec_add_sa_all(struct bonding
> *bond)
> static void bond_ipsec_del_sa(struct xfrm_state *xs)
> {
> struct net_device *bond_dev = xs->xso.dev;
> + struct bond_xfrm_work *xfrm_work;
> struct net_device *real_dev;
> netdevice_tracker tracker;
> - struct bond_ipsec *ipsec;
> struct bonding *bond;
> struct slave *slave;
>
> @@ -592,15 +611,17 @@ static void bond_ipsec_del_sa(struct xfrm_state
> *xs)
> real_dev->xfrmdev_ops->xdo_dev_state_delete(xs);
> out:
> netdev_put(real_dev, &tracker);
> - mutex_lock(&bond->ipsec_lock);
> - list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> - if (ipsec->xs == xs) {
> - list_del(&ipsec->list);
> - kfree(ipsec);
> - break;
> - }
> - }
> - mutex_unlock(&bond->ipsec_lock);
> +
> + xfrm_work = kmalloc(sizeof(*xfrm_work), GFP_ATOMIC);
> + if (!xfrm_work)
> + return;
> +
> + INIT_WORK(&xfrm_work->work, bond_xfrm_state_gc_work);
> + xfrm_work->bond = bond;
> + xfrm_work->xs = xs;
> + xfrm_state_hold(xs);
> +
> + queue_work(bond->wq, &xfrm_work->work);
> }
>
> static void bond_ipsec_del_sa_all(struct bonding *bond)
> diff --git a/include/net/bonding.h b/include/net/bonding.h
> index 8bb5f016969f..d54ba5e3affb 100644
> --- a/include/net/bonding.h
> +++ b/include/net/bonding.h
> @@ -209,6 +209,12 @@ struct bond_ipsec {
> struct xfrm_state *xs;
> };
>
> +struct bond_xfrm_work {
> + struct work_struct work;
> + struct bonding *bond;
> + struct xfrm_state *xs;
> +};
Also, like Nikolay said, something needs to wait on all in-flight work
items.
This got me to stare at the code again. What if we move the removal of
the xs from bond->ipsec from bond_ipsec_del_sa to bond_ipsec_free_sa?
bond_ipsec_free_sa, unlike bond_ipsec_del_sa, is not called with
x->lock held. It is called from the xfrm gc task or directly via
xfrm_state_put_sync and therefore wouldn't suffer from the locking
issue.
The tricky part is to make sure that inactive bond->ipsec entries
(after bond_ipsec_del_sa calls) do not cause issues if there's a
migration (bond_ipsec_del_sa_all is called) happening before
bond_ipsec_free_sa. Perhaps filtering by x->km.state != XFRM_STATE_DEAD
in bond_ipsec_del_sa_all.
What do you think about this idea?
Cosmin.
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [PATCHv2 net 1/3] bonding: move mutex lock to a work queue for XFRM GC tasks
2025-02-25 14:00 ` Cosmin Ratiu
@ 2025-02-25 14:27 ` Nikolay Aleksandrov
2025-02-26 9:48 ` Hangbin Liu
1 sibling, 0 replies; 13+ messages in thread
From: Nikolay Aleksandrov @ 2025-02-25 14:27 UTC (permalink / raw)
To: Cosmin Ratiu, netdev@vger.kernel.org, liuhangbin@gmail.com
Cc: andrew+netdev@lunn.ch, jarod@redhat.com, davem@davemloft.net,
Tariq Toukan, linux-kernel@vger.kernel.org, shuah@kernel.org,
steffen.klassert@secunet.com, jv@jvosburgh.net, pabeni@redhat.com,
horms@kernel.org, edumazet@google.com, kuba@kernel.org,
linux-kselftest@vger.kernel.org, Jianbo Liu
On 2/25/25 16:00, Cosmin Ratiu wrote:
> On Tue, 2025-02-25 at 09:40 +0000, Hangbin Liu wrote:
>> The fixed commit placed mutex_lock() inside spin_lock_bh(), which
>> triggers
>> a warning like:
>>
>> BUG: sleeping function called from invalid context at...
>>
>> Fix this by moving the mutex_lock() operation to a work queue.
>>
>> Fixes: 2aeeef906d5a ("bonding: change ipsec_lock from spin lock to
>> mutex")
>> Reported-by: Jakub Kicinski <kuba@kernel.org>
>> Closes:
>> https://lore.kernel.org/netdev/20241212062734.182a0164@kernel.org
>> Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
>> ---
>> drivers/net/bonding/bond_main.c | 41 +++++++++++++++++++++++++------
>> --
>> include/net/bonding.h | 6 +++++
>> 2 files changed, 37 insertions(+), 10 deletions(-)
>>
>> diff --git a/drivers/net/bonding/bond_main.c
>> b/drivers/net/bonding/bond_main.c
>> index e45bba240cbc..cc7064aa4b35 100644
>> --- a/drivers/net/bonding/bond_main.c
>> +++ b/drivers/net/bonding/bond_main.c
>> @@ -551,6 +551,25 @@ static void bond_ipsec_add_sa_all(struct bonding
>> *bond)
>> mutex_unlock(&bond->ipsec_lock);
>> }
>>
>> +static void bond_xfrm_state_gc_work(struct work_struct *work)
>> +{
>> + struct bond_xfrm_work *xfrm_work = container_of(work, struct
>> bond_xfrm_work, work);
>> + struct bonding *bond = xfrm_work->bond;
>> + struct xfrm_state *xs = xfrm_work->xs;
>> + struct bond_ipsec *ipsec;
>> +
>> + mutex_lock(&bond->ipsec_lock);
>> + list_for_each_entry(ipsec, &bond->ipsec_list, list) {
>> + if (ipsec->xs == xs) {
>> + list_del(&ipsec->list);
>> + kfree(ipsec);
>> + xfrm_state_put(xs);
>
> I would expect xfrm_state_put to be called from outside the loop,
> regardless of whether an entry is found in the list or not, because it
> was unconditionally referenced when the work was created.
>
>> + break;
>> + }
>> + }
>> + mutex_unlock(&bond->ipsec_lock);
>> +}
>> +
>> /**
>> * bond_ipsec_del_sa - clear out this specific SA
>> * @xs: pointer to transformer state struct
>> @@ -558,9 +577,9 @@ static void bond_ipsec_add_sa_all(struct bonding
>> *bond)
>> static void bond_ipsec_del_sa(struct xfrm_state *xs)
>> {
>> struct net_device *bond_dev = xs->xso.dev;
>> + struct bond_xfrm_work *xfrm_work;
>> struct net_device *real_dev;
>> netdevice_tracker tracker;
>> - struct bond_ipsec *ipsec;
>> struct bonding *bond;
>> struct slave *slave;
>>
>> @@ -592,15 +611,17 @@ static void bond_ipsec_del_sa(struct xfrm_state
>> *xs)
>> real_dev->xfrmdev_ops->xdo_dev_state_delete(xs);
>> out:
>> netdev_put(real_dev, &tracker);
>> - mutex_lock(&bond->ipsec_lock);
>> - list_for_each_entry(ipsec, &bond->ipsec_list, list) {
>> - if (ipsec->xs == xs) {
>> - list_del(&ipsec->list);
>> - kfree(ipsec);
>> - break;
>> - }
>> - }
>> - mutex_unlock(&bond->ipsec_lock);
>> +
>> + xfrm_work = kmalloc(sizeof(*xfrm_work), GFP_ATOMIC);
>> + if (!xfrm_work)
>> + return;
>> +
>> + INIT_WORK(&xfrm_work->work, bond_xfrm_state_gc_work);
>> + xfrm_work->bond = bond;
>> + xfrm_work->xs = xs;
>> + xfrm_state_hold(xs);
>> +
>> + queue_work(bond->wq, &xfrm_work->work);
>> }
>>
>> static void bond_ipsec_del_sa_all(struct bonding *bond)
>> diff --git a/include/net/bonding.h b/include/net/bonding.h
>> index 8bb5f016969f..d54ba5e3affb 100644
>> --- a/include/net/bonding.h
>> +++ b/include/net/bonding.h
>> @@ -209,6 +209,12 @@ struct bond_ipsec {
>> struct xfrm_state *xs;
>> };
>>
>> +struct bond_xfrm_work {
>> + struct work_struct work;
>> + struct bonding *bond;
>> + struct xfrm_state *xs;
>> +};
>
> Also, like Nikolai said, something needs to wait on all in-flight work
> items.
>
> This got me to stare at the code again. What if we move the removal of
> the xs from bond->ipsec from bond_ipsec_del_sa to bond_ipsec_free_sa?
> bond_ipsec_free_sa, unlike bond_ipsec_del_sa, is not called with x-
>> lock held. It is called from the xfrm gc task or directly via
> xfrm_state_put_sync and therefore wouldn't suffer from the locking
> issue.
>
> The tricky part is to make sure that inactive bond->ipsec entries
> (after bond_ipsec_del_sa calls) do not cause issues if there's a
> migration (bond_ipsec_del_sa_all is called) happening before
> bond_ipsec_free_sa. Perhaps filtering by x->km.state != XFRM_STATE_DEAD
> in bond_ipsec_del_sa_all.
>
> What do you think about this idea?
>
> Cosmin.
I know the question was for Hangbin, but I do like this solution. I missed
the xdo_dev_state_free callback, it could lead to a much simpler solution
with some care.
Cheers,
Nik
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [PATCHv2 net 1/3] bonding: move mutex lock to a work queue for XFRM GC tasks
2025-02-25 14:00 ` Cosmin Ratiu
2025-02-25 14:27 ` Nikolay Aleksandrov
@ 2025-02-26 9:48 ` Hangbin Liu
2025-02-26 11:05 ` Cosmin Ratiu
1 sibling, 1 reply; 13+ messages in thread
From: Hangbin Liu @ 2025-02-26 9:48 UTC (permalink / raw)
To: Cosmin Ratiu
Cc: netdev@vger.kernel.org, andrew+netdev@lunn.ch, jarod@redhat.com,
razor@blackwall.org, davem@davemloft.net, Tariq Toukan,
linux-kernel@vger.kernel.org, shuah@kernel.org,
steffen.klassert@secunet.com, jv@jvosburgh.net, pabeni@redhat.com,
horms@kernel.org, edumazet@google.com, kuba@kernel.org,
linux-kselftest@vger.kernel.org, Jianbo Liu
Hi Cosmin,
On Tue, Feb 25, 2025 at 02:00:05PM +0000, Cosmin Ratiu wrote:
> This got me to stare at the code again. What if we move the removal of
> the xs from bond->ipsec from bond_ipsec_del_sa to bond_ipsec_free_sa?
> bond_ipsec_free_sa, unlike bond_ipsec_del_sa, is not called with x-
> >lock held. It is called from the xfrm gc task or directly via
> xfrm_state_put_sync and therefore wouldn't suffer from the locking
> issue.
>
> The tricky part is to make sure that inactive bond->ipsec entries
> (after bond_ipsec_del_sa calls) do not cause issues if there's a
> migration (bond_ipsec_del_sa_all is called) happening before
> bond_ipsec_free_sa. Perhaps filtering by x->km.state != XFRM_STATE_DEAD
> in bond_ipsec_del_sa_all.
>
> What do you think about this idea?
Thanks a lot for the comments. I also skipped the DEAD xs in add_sa_all.
What about a patch like this:
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index e45bba240cbc..0e4db43a833a 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -537,6 +537,12 @@ static void bond_ipsec_add_sa_all(struct bonding *bond)
}
list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+ /* No need to handle DEAD XFRM, as it has already been
+ * deleted and will be freed later.
+ */
+ if (ipsec->xs->km.state == XFRM_STATE_DEAD)
+ continue;
+
/* If new state is added before ipsec_lock acquired */
if (ipsec->xs->xso.real_dev == real_dev)
continue;
@@ -592,15 +598,6 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs)
real_dev->xfrmdev_ops->xdo_dev_state_delete(xs);
out:
netdev_put(real_dev, &tracker);
- mutex_lock(&bond->ipsec_lock);
- list_for_each_entry(ipsec, &bond->ipsec_list, list) {
- if (ipsec->xs == xs) {
- list_del(&ipsec->list);
- kfree(ipsec);
- break;
- }
- }
- mutex_unlock(&bond->ipsec_lock);
}
static void bond_ipsec_del_sa_all(struct bonding *bond)
@@ -617,6 +614,12 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)
mutex_lock(&bond->ipsec_lock);
list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+ /* No need to handle DEAD XFRM, as it has already been
+ * deleted and will be freed later.
+ */
+ if (ipsec->xs->km.state == XFRM_STATE_DEAD)
+ continue;
+
if (!ipsec->xs->xso.real_dev)
continue;
@@ -666,6 +669,16 @@ static void bond_ipsec_free_sa(struct xfrm_state *xs)
real_dev->xfrmdev_ops->xdo_dev_state_free(xs);
out:
netdev_put(real_dev, &tracker);
+
+ mutex_lock(&bond->ipsec_lock);
+ list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+ if (ipsec->xs == xs) {
+ list_del(&ipsec->list);
+ kfree(ipsec);
+ break;
+ }
+ }
+ mutex_unlock(&bond->ipsec_lock);
}
/**
--
2.46.0
^ permalink raw reply related [flat|nested] 13+ messages in thread* Re: [PATCHv2 net 1/3] bonding: move mutex lock to a work queue for XFRM GC tasks
2025-02-26 9:48 ` Hangbin Liu
@ 2025-02-26 11:05 ` Cosmin Ratiu
2025-02-26 12:07 ` Hangbin Liu
0 siblings, 1 reply; 13+ messages in thread
From: Cosmin Ratiu @ 2025-02-26 11:05 UTC (permalink / raw)
To: liuhangbin@gmail.com
Cc: shuah@kernel.org, andrew+netdev@lunn.ch, davem@davemloft.net,
jv@jvosburgh.net, jarod@redhat.com, razor@blackwall.org,
linux-kernel@vger.kernel.org, pabeni@redhat.com, Jianbo Liu,
edumazet@google.com, horms@kernel.org, kuba@kernel.org,
Tariq Toukan, netdev@vger.kernel.org,
steffen.klassert@secunet.com, linux-kselftest@vger.kernel.org
On Wed, 2025-02-26 at 09:48 +0000, Hangbin Liu wrote:
> Hi Cosmin,
> On Tue, Feb 25, 2025 at 02:00:05PM +0000, Cosmin Ratiu wrote:
> > This got me to stare at the code again. What if we move the removal
> > of
> > the xs from bond->ipsec from bond_ipsec_del_sa to
> > bond_ipsec_free_sa?
> > bond_ipsec_free_sa, unlike bond_ipsec_del_sa, is not called with x-
> > > lock held. It is called from the xfrm gc task or directly via
> > xfrm_state_put_sync and therefore wouldn't suffer from the locking
> > issue.
> >
> > The tricky part is to make sure that inactive bond->ipsec entries
> > (after bond_ipsec_del_sa calls) do not cause issues if there's a
> > migration (bond_ipsec_del_sa_all is called) happening before
> > bond_ipsec_free_sa. Perhaps filtering by x->km.state !=
> > XFRM_STATE_DEAD
> > in bond_ipsec_del_sa_all.
> >
> > What do you think about this idea?
>
> Thanks a lot for the comments. I also skipped the DEAD xs in
> add_sa_all.
> What about the patch like:
This is what I had in mind, thanks for proposing it. Maybe you should
package it in a new submission with a proper title/etc.?
I'll do the initial review here.
>
> diff --git a/drivers/net/bonding/bond_main.c
> b/drivers/net/bonding/bond_main.c
> index e45bba240cbc..0e4db43a833a 100644
> --- a/drivers/net/bonding/bond_main.c
> +++ b/drivers/net/bonding/bond_main.c
> @@ -537,6 +537,12 @@ static void bond_ipsec_add_sa_all(struct bonding
> *bond)
> }
>
> list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> + /* No need to handle DEAD XFRM, as it has already
> been
> + * deleted and will be freed later.
> + */
Nit: Maybe rephrase that as "Skip dead xfrm states, they'll be freed
later."
> + if (ipsec->xs->km.state == XFRM_STATE_DEAD)
> + continue;
> +
> /* If new state is added before ipsec_lock acquired
> */
> if (ipsec->xs->xso.real_dev == real_dev)
> continue;
> @@ -592,15 +598,6 @@ static void bond_ipsec_del_sa(struct xfrm_state
> *xs)
> real_dev->xfrmdev_ops->xdo_dev_state_delete(xs);
> out:
> netdev_put(real_dev, &tracker);
> - mutex_lock(&bond->ipsec_lock);
> - list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> - if (ipsec->xs == xs) {
> - list_del(&ipsec->list);
> - kfree(ipsec);
> - break;
> - }
> - }
> - mutex_unlock(&bond->ipsec_lock);
> }
>
> static void bond_ipsec_del_sa_all(struct bonding *bond)
> @@ -617,6 +614,12 @@ static void bond_ipsec_del_sa_all(struct bonding
> *bond)
>
> mutex_lock(&bond->ipsec_lock);
> list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> + /* No need to handle DEAD XFRM, as it has already
> been
> + * deleted and will be freed later.
> + */
> + if (ipsec->xs->km.state == XFRM_STATE_DEAD)
> + continue;
> +
If this doesn't free dead entries now and bond_ipsec_add_sa_all is
called soon after, the pending bond_ipsec_free_sa() call will then hit
the WARN_ON(xs->xso.real_dev != real_dev) before attempting to call
free on the wrong device.
To fix that, these entries should be freed here and the WARN_ON in
bond_ipsec_free_sa() should be converted to an if...goto out, so that
bond_ipsec_free_sa() calls would hit one of these conditions:
1. "if (!slave)", when no active device exists.
2. "if (!xs->xso.real_dev)", when xdo_dev_state_add() failed.
3. "if (xs->xso.real_dev != real_dev)", when a DEAD xs was already
freed by bond_ipsec_del_sa_all() migration to a new device.
In all 3 cases, xdo_dev_state_free() shouldn't be called, only xs
removed from the bond->ipsec list.
I hope I didn't miss any corner case.
> if (!ipsec->xs->xso.real_dev)
> continue;
>
> @@ -666,6 +669,16 @@ static void bond_ipsec_free_sa(struct xfrm_state
> *xs)
> real_dev->xfrmdev_ops->xdo_dev_state_free(xs);
> out:
> netdev_put(real_dev, &tracker);
> +
> + mutex_lock(&bond->ipsec_lock);
> + list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> + if (ipsec->xs == xs) {
> + list_del(&ipsec->list);
> + kfree(ipsec);
> + break;
> + }
> + }
> + mutex_unlock(&bond->ipsec_lock);
> }
>
> /**
Cosmin.
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [PATCHv2 net 1/3] bonding: move mutex lock to a work queue for XFRM GC tasks
2025-02-26 11:05 ` Cosmin Ratiu
@ 2025-02-26 12:07 ` Hangbin Liu
2025-02-26 14:05 ` Cosmin Ratiu
0 siblings, 1 reply; 13+ messages in thread
From: Hangbin Liu @ 2025-02-26 12:07 UTC (permalink / raw)
To: Cosmin Ratiu
Cc: shuah@kernel.org, andrew+netdev@lunn.ch, davem@davemloft.net,
jv@jvosburgh.net, jarod@redhat.com, razor@blackwall.org,
linux-kernel@vger.kernel.org, pabeni@redhat.com, Jianbo Liu,
edumazet@google.com, horms@kernel.org, kuba@kernel.org,
Tariq Toukan, netdev@vger.kernel.org,
steffen.klassert@secunet.com, linux-kselftest@vger.kernel.org
On Wed, Feb 26, 2025 at 11:05:47AM +0000, Cosmin Ratiu wrote:
> > > What do you think about this idea?
> >
> > Thanks a lot for the comments. I also skipped the DEAD xs in
> > add_sa_all.
> > What about the patch like:
>
> This is what I had in mind, thanks for proposing it. Maybe you should
> package it in a new submission with a proper title/etc.?
> I'll do the initial review here.
This is a draft patch and I think there may still be something that needs to
be fixed, so I just pasted it here :)
>
> >
> > diff --git a/drivers/net/bonding/bond_main.c
> > b/drivers/net/bonding/bond_main.c
> > index e45bba240cbc..0e4db43a833a 100644
> > --- a/drivers/net/bonding/bond_main.c
> > +++ b/drivers/net/bonding/bond_main.c
> > @@ -537,6 +537,12 @@ static void bond_ipsec_add_sa_all(struct bonding
> > *bond)
> > }
> >
> > list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> > + /* No need to handle DEAD XFRM, as it has already
> > been
> > + * deleted and will be freed later.
> > + */
>
> Nit: Maybe rephrase that as "Skip dead xfrm states, they'll be freed
> later."
>
> > + if (ipsec->xs->km.state == XFRM_STATE_DEAD)
> > + continue;
> > +
> > /* If new state is added before ipsec_lock acquired
> > */
> > if (ipsec->xs->xso.real_dev == real_dev)
> > continue;
> > @@ -592,15 +598,6 @@ static void bond_ipsec_del_sa(struct xfrm_state
> > *xs)
> > real_dev->xfrmdev_ops->xdo_dev_state_delete(xs);
> > out:
> > netdev_put(real_dev, &tracker);
> > - mutex_lock(&bond->ipsec_lock);
> > - list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> > - if (ipsec->xs == xs) {
> > - list_del(&ipsec->list);
> > - kfree(ipsec);
> > - break;
> > - }
> > - }
> > - mutex_unlock(&bond->ipsec_lock);
> > }
> >
> > static void bond_ipsec_del_sa_all(struct bonding *bond)
> > @@ -617,6 +614,12 @@ static void bond_ipsec_del_sa_all(struct bonding
> > *bond)
> >
> > mutex_lock(&bond->ipsec_lock);
> > list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> > + /* No need to handle DEAD XFRM, as it has already
> > been
> > + * deleted and will be freed later.
> > + */
> > + if (ipsec->xs->km.state == XFRM_STATE_DEAD)
> > + continue;
> > +
>
> If this doesn't free dead entries now and bond_ipsec_add_sa_all is
> called soon after, the pending bond_ipsec_free_sa() call will then hit
> the WARN_ON(xs->xso.real_dev != real_dev) before attempting to call
> free on the wrong device.
> To fix that, these entries should be freed here and the WARN_ON in
> bond_ipsec_free_sa() should be converted to an if...goto out, so that
> bond_ipsec_free_sa() calls would hit one of these conditions:
> 1. "if (!slave)", when no active device exists.
> 2. "if (!xs->xso.real_dev)", when xdo_dev_state_add() failed.
> 3. "if (xs->xso.real_dev != real_dev)", when a DEAD xs was already
> freed by bond_ipsec_del_sa_all() migration to a new device.
> In all 3 cases, xdo_dev_state_free() shouldn't be called, only xs
> removed from the bond->ipsec list.
>
> I hope I didn't miss any corner case.
Thumbs up! Thanks a lot for your review and comments. You thought this through
much more than I did. During bonding testing, we also found a case that
triggers the WARN_ON(xs->xso.real_dev != real_dev).
If we create an active-backup mode bond, create an ipsec tunnel over the
bonding device, and then remove the bonding device, there is a possibility
that the bond calls bond_ipsec_del_sa_all() to delete the ipsec state first
and then changes the active slave to another interface.
At the same time, the ipsec gc runs and calls bond_ipsec_free_sa().
Because the failover has already been triggered, this results in
xs->xso.real_dev != active_slave. The call trace looks like:
[14504.421247] bond0: (slave enp23s0f1np1): Enslaving as a backup interface with an up link
[14506.761933] mlx5_core 0000:17:00.0: lag map active ports: 1
[14506.767520] mlx5_core 0000:17:00.0: shared_fdb:0 mode:hash
[14550.992133] bond0: (slave enp23s0f0np0): Releasing backup interface
[14550.994150] mlx5_core 0000:17:00.0: lag map active ports: 1, 2
[14550.998407] bond0: (slave enp23s0f1np1): making interface the new active one
[14551.013286] ------------[ cut here ]------------
[14551.017912] WARNING: CPU: 7 PID: 1537 at drivers/net/bonding/bond_main.c:664 bond_ipsec_free_sa+0x9b/0xa0 [bonding]
[14551.117875] Unloaded tainted modules: bonding(E):33 fjes(E):1 padlock_aes(E):2 [last unloaded: bonding(E)]
[14551.148449] CPU: 7 UID: 0 PID: 1537 Comm: kworker/7:2 Kdump: loaded Tainted: G E 6.13.0-rc7+ #5
[14551.158536] Tainted: [E]=UNSIGNED_MODULE
[14551.162461] Hardware name: Dell Inc. PowerEdge R750/0WT8Y6, BIOS 1.5.4 12/17/2021
[14551.169941] Workqueue: events xfrm_state_gc_task
[14551.174559] RIP: 0010:bond_ipsec_free_sa+0x9b/0xa0 [bonding]
[14551.180227] Code: 8b 85 38 05 00 00 65 ff 08 5b 5d c3 cc cc cc cc 5b 5d e9 e8 e3 01 da e8 e3 e3 01 da 48 83 bb b0 02 00 00 00 74 e3 0f 0b eb df <0f> 0b eb b4 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3
[14551.198972] RSP: 0018:ff61163a49eb3e00 EFLAGS: 00010287
[14551.204200] RAX: ff42be3fe4bd8000 RBX: ff42be3fa7359d40 RCX: 00000000802a0025
[14551.211336] RDX: ff42be4edc534280 RSI: 00000000fffffe00 RDI: ff42be4edc534280
[14551.218476] RBP: ff42be3f50128000 R08: 0000000000000000 R09: 0000000000000001
[14551.225606] R10: 00000000802a0025 R11: ff42be404d917f60 R12: ff42be5e7edb4e80
[14551.232740] R13: ff42be4edc534280 R14: ffffffff9db3db40 R15: 0000000000000000
[14551.239872] FS: 0000000000000000(0000) GS:ff42be5e7ed80000(0000) knlGS:0000000000000000
[14551.247957] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[14551.253704] CR2: 00007fff69f55df0 CR3: 0000001158a22002 CR4: 0000000000773ef0
[14551.260836] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[14551.267970] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[14551.275101] PKRU: 55555554
[14551.277814] Call Trace:
[14551.280268] <TASK>
[14551.282374] ? show_trace_log_lvl+0x1b0/0x2f0
[14551.286742] ? show_trace_log_lvl+0x1b0/0x2f0
[14551.291102] ? xfrm_dev_state_free+0x84/0xb0
[14551.295374] ? bond_ipsec_free_sa+0x9b/0xa0 [bonding]
[14551.300435] ? __warn.cold+0x93/0xf4
[14551.304020] ? bond_ipsec_free_sa+0x9b/0xa0 [bonding]
[14551.309076] ? report_bug+0xff/0x140
[14551.312662] ? handle_bug+0x53/0x90
[14551.316157] ? exc_invalid_op+0x17/0x70
[14551.319994] ? asm_exc_invalid_op+0x1a/0x20
[14551.324183] ? bond_ipsec_free_sa+0x9b/0xa0 [bonding]
[14551.329242] xfrm_dev_state_free+0x84/0xb0
[14551.333343] ___xfrm_state_destroy+0xe3/0x160
[14551.337701] xfrm_state_gc_task+0x7a/0xb0
[14551.341713] process_one_work+0x174/0x330
[14551.345729] worker_thread+0x252/0x390
[14551.349487] ? __pfx_worker_thread+0x10/0x10
[14551.353761] kthread+0xcf/0x100
[14551.356908] ? __pfx_kthread+0x10/0x10
[14551.360668] ret_from_fork+0x31/0x50
[14551.364249] ? __pfx_kthread+0x10/0x10
[14551.368009] ret_from_fork_asm+0x1a/0x30
[14551.371943] </TASK>
[14551.374136] ---[ end trace 0000000000000000 ]---
[14551.735092] bond0: (slave enp23s0f1np1): Releasing backup interface
[14552.110577] bond0 (unregistering): Released all slaves
This seems like another situation that does not simply fit case
3. "if (xs->xso.real_dev != real_dev)", goto out.
I'm not sure what xs->km.state should be during xfrm_state_gc_task().
Is it also set to XFRM_STATE_DEAD? I didn't see where that happens.
This matters especially if the bond changes its active slave while
xfrm_state_gc_task() runs in parallel, like:
bond_ipsec_del_sa_all()
xfrm_state_gc_task()
xfrm_dev_state_free()
bond_ipsec_free_sa()
bond_ipsec_add_sa_all()
If xs->km.state is not XFRM_STATE_DEAD, how do we avoid the
WARN_ON(xs->xso.real_dev != real_dev) in bond_ipsec_free_sa(),
and how do we make bond_ipsec_add_sa_all() not add the entry again?
Thanks
Hangbin
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [PATCHv2 net 1/3] bonding: move mutex lock to a work queue for XFRM GC tasks
2025-02-26 12:07 ` Hangbin Liu
@ 2025-02-26 14:05 ` Cosmin Ratiu
0 siblings, 0 replies; 13+ messages in thread
From: Cosmin Ratiu @ 2025-02-26 14:05 UTC (permalink / raw)
To: liuhangbin@gmail.com
Cc: shuah@kernel.org, andrew+netdev@lunn.ch, davem@davemloft.net,
Jianbo Liu, jarod@redhat.com, razor@blackwall.org,
linux-kernel@vger.kernel.org, pabeni@redhat.com,
edumazet@google.com, jv@jvosburgh.net, horms@kernel.org,
kuba@kernel.org, Tariq Toukan, netdev@vger.kernel.org,
steffen.klassert@secunet.com, linux-kselftest@vger.kernel.org
On Wed, 2025-02-26 at 12:07 +0000, Hangbin Liu wrote:
>
> During bonding testing, we also found a case that would trigger
> the WARN_ON(xs->xso.real_dev != real_dev).
>
> If we create active-backup mode bonding and create ipsec tunnel over
> bonding device, then remove bonding device. There is a possibility
> that
> the bond call bond_ipsec_del_sa_all() to delete the ipsec state
> first,
> then change active slave to another interface.
>
> At the same time, ipsec gc was called and then bond_ipsec_free_sa().
> This will cause the xs->xso.real_dev != active_slave as the failover
> triggered. The call traces looks like:
> [..]
>
> This seems like another situation that could not simply fit
> 3. "if (xs->xso.real_dev != real_dev), goto out.
> I'm not sure what's the xs->km.state should be during
> xfrm_state_gc_task().
> Is it also set to XFRM_STATE_DEAD, because I didn't see it.
XFRM_STATE_DEAD is set in __xfrm_state_delete() (and other places for
what seems like error conditions), plus there's a WARN_ON(x->km.state
!= XFRM_STATE_DEAD) in __xfrm_state_destroy(). This last function is
the main way xfrm states are destroyed, besides xfrm_dev_state_flush
and xfrm_state_find (where xfrm_state_delete + xfrm_dev_state_free are
used directly).
So I am pretty sure that when bond .xdo_dev_state_free() is called via
either one of the above three mechanisms, the state should be
XFRM_STATE_DEAD. But maybe I'm missing something.
>
> Especially if the bond change active slave and xfrm_state_gc_task()
> run
> in parallel, like
>
> bond_ipsec_del_sa_all()
> xfrm_state_gc_task()
> xfrm_dev_state_free()
> bond_ipsec_free_sa()
> bond_ipsec_add_sa_all()
>
> If the xs->km.state is not XFRM_STATE_DEAD. How to avoid the
> WARN_ON(xs->xso.real_dev != real_dev) in bond_ipsec_free_sa()
> and how to make bond_ipsec_add_sa_all() not added the entry again.
I am proposing you change this WARN_ON to an if, avoid calling
xdo_dev_state_free on real_dev in that case and just remove the entry
from bond->ipsec_list.
Cosmin.
^ permalink raw reply [flat|nested] 13+ messages in thread
* [PATCHv2 net 2/3] bonding: fix xfrm offload feature setup on active-backup mode
2025-02-25 9:40 [PATCHv2 net 0/3] bond: fix xfrm offload issues Hangbin Liu
2025-02-25 9:40 ` [PATCHv2 net 1/3] bonding: move mutex lock to a work queue for XFRM GC tasks Hangbin Liu
@ 2025-02-25 9:40 ` Hangbin Liu
2025-02-25 9:40 ` [PATCHv2 net 3/3] selftests: bonding: add ipsec offload test Hangbin Liu
2 siblings, 0 replies; 13+ messages in thread
From: Hangbin Liu @ 2025-02-25 9:40 UTC (permalink / raw)
To: netdev
Cc: Jay Vosburgh, Andrew Lunn, David S. Miller, Eric Dumazet,
Jakub Kicinski, Paolo Abeni, Nikolay Aleksandrov, Simon Horman,
Shuah Khan, Tariq Toukan, Jianbo Liu, Jarod Wilson,
Steffen Klassert, Cosmin Ratiu, linux-kselftest, linux-kernel,
Hangbin Liu
The active-backup bonding mode supports XFRM ESP offload. However, when
a bond is added using a command like `ip link add bond0 type bond mode 1
miimon 100`, the `ethtool -k` command shows that the XFRM ESP offload is
disabled. This occurs because bond_newlink() calls bond_changelink() first
and registers the bond device later. So the XFRM feature update in
bond_option_mode_set() is not called as the bond device is not yet
registered, leading to the offload feature not being set successfully.
To resolve this issue, we can modify the code order in bond_newlink() to
ensure that the bond device is registered first before changing the bond
link parameters. This change will allow the XFRM ESP offload feature to be
correctly enabled.
Fixes: 007ab5345545 ("bonding: fix feature flag setting at init time")
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
---
drivers/net/bonding/bond_main.c | 2 +-
drivers/net/bonding/bond_netlink.c | 16 +++++++++-------
include/net/bonding.h | 1 +
3 files changed, 11 insertions(+), 8 deletions(-)
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index cc7064aa4b35..881d8d94dd9f 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4410,7 +4410,7 @@ void bond_work_init_all(struct bonding *bond)
INIT_DELAYED_WORK(&bond->slave_arr_work, bond_slave_arr_handler);
}
-static void bond_work_cancel_all(struct bonding *bond)
+void bond_work_cancel_all(struct bonding *bond)
{
cancel_delayed_work_sync(&bond->mii_work);
cancel_delayed_work_sync(&bond->arp_work);
diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c
index 2a6a424806aa..ed16af6db557 100644
--- a/drivers/net/bonding/bond_netlink.c
+++ b/drivers/net/bonding/bond_netlink.c
@@ -568,18 +568,20 @@ static int bond_newlink(struct net *src_net, struct net_device *bond_dev,
struct nlattr *tb[], struct nlattr *data[],
struct netlink_ext_ack *extack)
{
+ struct bonding *bond = netdev_priv(bond_dev);
int err;
- err = bond_changelink(bond_dev, tb, data, extack);
- if (err < 0)
+ err = register_netdevice(bond_dev);
+ if (err)
return err;
- err = register_netdevice(bond_dev);
- if (!err) {
- struct bonding *bond = netdev_priv(bond_dev);
+ netif_carrier_off(bond_dev);
+ bond_work_init_all(bond);
- netif_carrier_off(bond_dev);
- bond_work_init_all(bond);
+ err = bond_changelink(bond_dev, tb, data, extack);
+ if (err) {
+ bond_work_cancel_all(bond);
+ unregister_netdevice(bond_dev);
}
return err;
diff --git a/include/net/bonding.h b/include/net/bonding.h
index d54ba5e3affb..aa326fd90eba 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -713,6 +713,7 @@ struct bond_vlan_tag *bond_verify_device_path(struct net_device *start_dev,
int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave);
void bond_slave_arr_work_rearm(struct bonding *bond, unsigned long delay);
void bond_work_init_all(struct bonding *bond);
+void bond_work_cancel_all(struct bonding *bond);
#ifdef CONFIG_PROC_FS
void bond_create_proc_entry(struct bonding *bond);
--
2.46.0
^ permalink raw reply related [flat|nested] 13+ messages in thread* [PATCHv2 net 3/3] selftests: bonding: add ipsec offload test
2025-02-25 9:40 [PATCHv2 net 0/3] bond: fix xfrm offload issues Hangbin Liu
2025-02-25 9:40 ` [PATCHv2 net 1/3] bonding: move mutex lock to a work queue for XFRM GC tasks Hangbin Liu
2025-02-25 9:40 ` [PATCHv2 net 2/3] bonding: fix xfrm offload feature setup on active-backup mode Hangbin Liu
@ 2025-02-25 9:40 ` Hangbin Liu
2 siblings, 0 replies; 13+ messages in thread
From: Hangbin Liu @ 2025-02-25 9:40 UTC (permalink / raw)
To: netdev
Cc: Jay Vosburgh, Andrew Lunn, David S. Miller, Eric Dumazet,
Jakub Kicinski, Paolo Abeni, Nikolay Aleksandrov, Simon Horman,
Shuah Khan, Tariq Toukan, Jianbo Liu, Jarod Wilson,
Steffen Klassert, Cosmin Ratiu, linux-kselftest, linux-kernel,
Hangbin Liu
This introduces a test for IPSec offload over bonding, utilizing netdevsim
for the testing process, as veth interfaces do not support IPSec offload.
The test will ensure that the IPSec offload functionality remains operational
even after a failover event occurs in the bonding configuration.
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
---
.../selftests/drivers/net/bonding/Makefile | 3 +-
.../drivers/net/bonding/bond_ipsec_offload.sh | 155 ++++++++++++++++++
.../selftests/drivers/net/bonding/config | 4 +
3 files changed, 161 insertions(+), 1 deletion(-)
create mode 100755 tools/testing/selftests/drivers/net/bonding/bond_ipsec_offload.sh
diff --git a/tools/testing/selftests/drivers/net/bonding/Makefile b/tools/testing/selftests/drivers/net/bonding/Makefile
index 2b10854e4b1e..d5a7de16d33a 100644
--- a/tools/testing/selftests/drivers/net/bonding/Makefile
+++ b/tools/testing/selftests/drivers/net/bonding/Makefile
@@ -10,7 +10,8 @@ TEST_PROGS := \
mode-2-recovery-updelay.sh \
bond_options.sh \
bond-eth-type-change.sh \
- bond_macvlan_ipvlan.sh
+ bond_macvlan_ipvlan.sh \
+ bond_ipsec_offload.sh
TEST_FILES := \
lag_lib.sh \
diff --git a/tools/testing/selftests/drivers/net/bonding/bond_ipsec_offload.sh b/tools/testing/selftests/drivers/net/bonding/bond_ipsec_offload.sh
new file mode 100755
index 000000000000..169866b47a67
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/bonding/bond_ipsec_offload.sh
@@ -0,0 +1,155 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# IPsec over bonding offload test:
+#
+# +----------------+
+# | bond0 |
+# | | |
+# | eth0 eth1 |
+# +---+-------+----+
+#
+# We use netdevsim instead of physical interfaces
+#-------------------------------------------------------------------
+# Example commands
+# ip x s add proto esp src 192.0.2.1 dst 192.0.2.2 \
+# spi 0x07 mode transport reqid 0x07 replay-window 32 \
+# aead 'rfc4106(gcm(aes))' 1234567890123456dcba 128 \
+# sel src 192.0.2.1/24 dst 192.0.2.2/24
+# offload dev bond0 dir out
+# ip x p add dir out src 192.0.2.1/24 dst 192.0.2.2/24 \
+# tmpl proto esp src 192.0.2.1 dst 192.0.2.2 \
+# spi 0x07 mode transport reqid 0x07
+#
+#-------------------------------------------------------------------
+
+lib_dir=$(dirname "$0")
+source "$lib_dir"/../../../net/lib.sh
+algo="aead rfc4106(gcm(aes)) 0x3132333435363738393031323334353664636261 128"
+srcip=192.0.2.1
+dstip=192.0.2.2
+ipsec0=/sys/kernel/debug/netdevsim/netdevsim0/ports/0/ipsec
+ipsec1=/sys/kernel/debug/netdevsim/netdevsim0/ports/1/ipsec
+ret=0
+
+cleanup()
+{
+ modprobe -r netdevsim
+ cleanup_ns $ns
+}
+
+active_slave_changed()
+{
+ local old_active_slave=$1
+ local new_active_slave=$(ip -n ${ns} -d -j link show bond0 | \
+ jq -r ".[].linkinfo.info_data.active_slave")
+ [ "$new_active_slave" != "$old_active_slave" -a "$new_active_slave" != "null" ]
+}
+
+test_offload()
+{
+ # use ping to exercise the Tx path
+ ip netns exec $ns ping -I bond0 -c 3 -W 1 -i 0 $dstip >/dev/null
+
+ active_slave=$(ip -n ${ns} -d -j link show bond0 | \
+ jq -r ".[].linkinfo.info_data.active_slave")
+
+ if [ $active_slave = $nic0 ]; then
+ sysfs=$ipsec0
+ elif [ $active_slave = $nic1 ]; then
+ sysfs=$ipsec1
+ else
+ echo "FAIL: bond_ipsec_offload invalid active_slave $active_slave"
+ ret=1
+ fi
+
+ # The tx/rx order in sysfs may change after failover
+ if grep -q "SA count=2 tx=3" $sysfs && grep -q "tx ipaddr=$dstip" $sysfs; then
+ echo "PASS: bond_ipsec_offload has correct tx count with link ${active_slave}"
+ else
+ echo "FAIL: bond_ipsec_offload incorrect tx count with link ${active_slave}"
+ ret=1
+ fi
+}
+
+if ! mount | grep -q debugfs; then
+ mount -t debugfs none /sys/kernel/debug/ &> /dev/null
+fi
+
+# setup netdevsim since dummy/veth dev doesn't have offload support
+if [ ! -w /sys/bus/netdevsim/new_device ] ; then
+ modprobe -q netdevsim
+ if [ $? -ne 0 ]; then
+ echo "SKIP: can't load netdevsim for ipsec offload"
+ exit $ksft_skip
+ fi
+fi
+
+trap cleanup EXIT
+
+setup_ns ns
+ip -n $ns link add bond0 type bond mode active-backup miimon 100
+ip -n $ns addr add $srcip/24 dev bond0
+ip -n $ns link set bond0 up
+
+ifaces=$(ip netns exec $ns bash -c '
+ sysfsnet=/sys/bus/netdevsim/devices/netdevsim0/net/
+ echo "0 2" > /sys/bus/netdevsim/new_device
+ while [ ! -d $sysfsnet ] ; do :; done
+ udevadm settle
+ ls $sysfsnet
+')
+nic0=$(echo $ifaces | cut -f1 -d ' ')
+nic1=$(echo $ifaces | cut -f2 -d ' ')
+ip -n $ns link set $nic0 master bond0
+ip -n $ns link set $nic1 master bond0
+
+# create offloaded SAs, both in and out
+ip -n $ns x p add dir out src $srcip/24 dst $dstip/24 \
+ tmpl proto esp src $srcip dst $dstip spi 9 \
+ mode transport reqid 42
+
+ip -n $ns x p add dir in src $dstip/24 dst $srcip/24 \
+ tmpl proto esp src $dstip dst $srcip spi 9 \
+ mode transport reqid 42
+
+ip -n $ns x s add proto esp src $srcip dst $dstip spi 9 \
+ mode transport reqid 42 $algo sel src $srcip/24 dst $dstip/24 \
+ offload dev bond0 dir out
+
+ip -n $ns x s add proto esp src $dstip dst $srcip spi 9 \
+ mode transport reqid 42 $algo sel src $dstip/24 dst $srcip/24 \
+ offload dev bond0 dir in
+
+# does offload show up in ip output
+lines=`ip -n $ns x s list | grep -c "crypto offload parameters: dev bond0 dir"`
+if [ $lines -ne 2 ] ; then
+ echo "FAIL: bond_ipsec_offload SA offload missing from list output"
+ ret=1
+fi
+
+# we didn't create a peer, make sure we can Tx by adding a permanent neighbour
+# this needs to be added after enslaving
+ip -n $ns neigh add $dstip dev bond0 lladdr 00:11:22:33:44:55
+
+# start Offload testing
+test_offload
+
+# do failover
+ip -n $ns link set $active_slave down
+slowwait 5 active_slave_changed $active_slave
+test_offload
+
+# make sure offload get removed from driver
+ip -n $ns x s flush
+ip -n $ns x p flush
+line0=$(grep -c "SA count=0" $ipsec0)
+line1=$(grep -c "SA count=0" $ipsec1)
+if [ $line0 -ne 1 -o $line1 -ne 1 ] ; then
+ echo "FAIL: bond_ipsec_offload SA not removed from driver"
+ ret=1
+else
+ echo "PASS: bond_ipsec_offload SA removed from driver"
+fi
+
+exit $ret
diff --git a/tools/testing/selftests/drivers/net/bonding/config b/tools/testing/selftests/drivers/net/bonding/config
index dad4e5fda4db..054fb772846f 100644
--- a/tools/testing/selftests/drivers/net/bonding/config
+++ b/tools/testing/selftests/drivers/net/bonding/config
@@ -9,3 +9,7 @@ CONFIG_NET_CLS_FLOWER=y
CONFIG_NET_SCH_INGRESS=y
CONFIG_NLMON=y
CONFIG_VETH=y
+CONFIG_INET_ESP=y
+CONFIG_INET_ESP_OFFLOAD=y
+CONFIG_XFRM_USER=m
+CONFIG_NETDEVSIM=m
--
2.46.0
^ permalink raw reply related [flat|nested] 13+ messages in thread