* [PATCH RFC net-next 1/4] xfrm: add a lower-device offload handle resolver
2026-05-20 8:10 [PATCH RFC net-next 0/4] bonding: support LAG IPsec offload with replicated SAs Jihong Min
@ 2026-05-20 8:10 ` Jihong Min
2026-05-20 8:10 ` [PATCH RFC net-next 2/4] bonding: replicate XFRM offload state across LAG slaves Jihong Min
` (2 subsequent siblings)
3 siblings, 0 replies; 5+ messages in thread
From: Jihong Min @ 2026-05-20 8:10 UTC (permalink / raw)
To: netdev
Cc: Jay Vosburgh, Andrew Lunn, David S. Miller, Eric Dumazet,
Jakub Kicinski, Paolo Abeni, Simon Horman, Steffen Klassert,
Herbert Xu, linux-kernel, Jihong Min
An upper device can own an XFRM offload state while the selected
datapath device is one of its lower devices. A single xso.offload_handle
is not enough for that case because each lower device may return a
different hardware handle for the same state.
Add an optional xfrmdev_ops resolver and a lower-driver opt-in flag so
helper-aware lower drivers can resolve the handle for the lower device
they are transmitting or receiving on. Keep the direct-device path as
the fast path and clear upper private state when device offload state is
freed.
Assisted-by: Codex:gpt-5.5
Signed-off-by: Jihong Min <hurryman2212@gmail.com>
---
include/linux/netdevice.h | 27 ++++++++++++++++++++++
include/net/xfrm.h | 48 +++++++++++++++++++++++++++++++++++++--
net/xfrm/xfrm_state.c | 1 +
3 files changed, 74 insertions(+), 2 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0e1e581efc5a..b4e844e90db8 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1033,6 +1033,16 @@ struct netdev_bpf {
#define XDP_WAKEUP_TX (1 << 1)
#ifdef CONFIG_XFRM_OFFLOAD
+/*
+ * xfrmdev_ops.flags values.
+ *
+ * XFRMDEV_OPS_F_LOWER_HANDLE is set by a lower driver whose datapath looks up
+ * XFRM hardware handles through xfrm_dev_state_lower_handle() rather than
+ * reading xso.offload_handle directly. This is required when the XFRM state
+ * is owned by an upper device, because xso.offload_handle may then not
+ * contain the handle for the current lower device.
+ */
+#define XFRMDEV_OPS_F_LOWER_HANDLE BIT(0)
+
struct xfrmdev_ops {
int (*xdo_dev_state_add)(struct net_device *dev,
struct xfrm_state *x,
@@ -1048,6 +1058,23 @@ struct xfrmdev_ops {
int (*xdo_dev_policy_add) (struct xfrm_policy *x, struct netlink_ext_ack *extack);
void (*xdo_dev_policy_delete) (struct xfrm_policy *x);
void (*xdo_dev_policy_free) (struct xfrm_policy *x);
+ /*
+ * Resolve the offload handle for lower_dev when this upper device
+ * owns the XFRM state. The callback lives in xfrmdev_ops because it
+ * is an XFRM offload operation of the device that owns the state;
+ * dispatching through it keeps the XFRM helper free of any
+ * bonding-specific dependency.
+ *
+ * Upper devices such as bonding may implement this callback when
+ * they keep the lower-device handle mapping. Lower devices must
+ * leave it NULL because they do not own that map; a lower driver
+ * instead sets XFRMDEV_OPS_F_LOWER_HANDLE in flags to advertise that
+ * its datapath resolves handles via xfrm_dev_state_lower_handle().
+ */
+ unsigned long (*xdo_dev_state_lower_handle)(struct net_device *dev,
+ struct xfrm_state *x,
+ struct net_device *lower_dev);
+ u32 flags;
};
#endif
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 10d3edde6b2f..b61e2c023eb4 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -162,6 +162,10 @@ struct xfrm_dev_offload {
*/
struct net_device *real_dev;
unsigned long offload_handle;
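+ /* Private state owned by the dev member of this structure when that
+ * device is an upper device (e.g. a bond). Lower drivers must not
+ * access it directly; they resolve their hardware handle with
+ * xfrm_dev_state_lower_handle() instead.
+ */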
+ void __rcu *upper_priv;
u8 dir : 2;
u8 type : 2;
u8 flags : 2;
@@ -1700,6 +1704,37 @@ struct xfrm_state *xfrm_state_lookup_byspi(struct net *net, __be32 spi,
int xfrm_state_check_expire(struct xfrm_state *x);
void xfrm_state_update_stats(struct net *net);
#ifdef CONFIG_XFRM_OFFLOAD
+/*
+ * Return the hardware offload handle lower_dev should use for x, or 0 when no
+ * handle is available for lower_dev. States installed directly on lower_dev
+ * use xso.offload_handle. States owned by an upper device are resolved through
+ * the owner's xdo_dev_state_lower_handle(); if the owner does not implement
+ * that callback, xso.offload_handle is returned only when xso.real_dev is
+ * lower_dev. Bonding uses the callback for replicated XFRM states because it
+ * installs the state on each slave and keeps the per-slave hardware handles
+ * internally.
+ */
+static inline unsigned long
+xfrm_dev_state_lower_handle(struct xfrm_state *x, struct net_device *lower_dev)
+{
+ struct xfrm_dev_offload *xdo = &x->xso;
+ struct net_device *real_dev = READ_ONCE(xdo->real_dev);
+ struct net_device *dev = READ_ONCE(xdo->dev);
+ unsigned long offload_handle = READ_ONCE(xdo->offload_handle);
+
+ if (!dev || !lower_dev)
+ return 0;
+
+ if (dev == lower_dev)
+ return offload_handle;
+
+ if (dev->xfrmdev_ops && dev->xfrmdev_ops->xdo_dev_state_lower_handle)
+ return dev->xfrmdev_ops->xdo_dev_state_lower_handle(dev, x,
+ lower_dev);
+
+ if (real_dev == lower_dev)
+ return offload_handle;
+
+ return 0;
+}
+
static inline void xfrm_dev_state_update_stats(struct xfrm_state *x)
{
struct xfrm_dev_offload *xdo = &x->xso;
@@ -1711,6 +1746,12 @@ static inline void xfrm_dev_state_update_stats(struct xfrm_state *x)
}
#else
+static inline unsigned long
+xfrm_dev_state_lower_handle(struct xfrm_state *x, struct net_device *lower_dev)
+{
+ return 0;
+}
+
static inline void xfrm_dev_state_update_stats(struct xfrm_state *x) {}
#endif
void xfrm_state_insert(struct xfrm_state *x);
@@ -2089,15 +2130,18 @@ static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x)
static inline bool xfrm_dst_offload_ok(struct dst_entry *dst)
{
struct xfrm_state *x = dst->xfrm;
+ bool has_offload_state;
struct xfrm_dst *xdst;
if (!x || !x->type_offload)
return false;
xdst = (struct xfrm_dst *) dst;
- if (!x->xso.offload_handle && !xdst->child->xfrm)
+ has_offload_state = x->xso.offload_handle ||
+ rcu_access_pointer(x->xso.upper_priv);
+ if (!has_offload_state && !xdst->child->xfrm)
return true;
- if (x->xso.offload_handle && (x->xso.dev == xfrm_dst_path(dst)->dev) &&
+ if (has_offload_state && (x->xso.dev == xfrm_dst_path(dst)->dev) &&
!xdst->child->xfrm)
return true;
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 686014d39429..584f913751bf 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -791,6 +791,7 @@ void xfrm_dev_state_free(struct xfrm_state *x)
if (dev->xfrmdev_ops->xdo_dev_state_free)
dev->xfrmdev_ops->xdo_dev_state_free(dev, x);
WRITE_ONCE(xso->dev, NULL);
+ RCU_INIT_POINTER(xso->upper_priv, NULL);
xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED;
netdev_put(dev, &xso->dev_tracker);
}
--
2.53.0
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH RFC net-next 2/4] bonding: replicate XFRM offload state across LAG slaves
2026-05-20 8:10 [PATCH RFC net-next 0/4] bonding: support LAG IPsec offload with replicated SAs Jihong Min
2026-05-20 8:10 ` [PATCH RFC net-next 1/4] xfrm: add a lower-device offload handle resolver Jihong Min
@ 2026-05-20 8:10 ` Jihong Min
2026-05-20 8:10 ` [PATCH RFC net-next 3/4] bonding: expose user-controlled IPsec features for LAG Jihong Min
2026-05-20 8:10 ` [PATCH RFC net-next 4/4] bonding: handle replicated IPsec SAs across LAG changes Jihong Min
3 siblings, 0 replies; 5+ messages in thread
From: Jihong Min @ 2026-05-20 8:10 UTC (permalink / raw)
To: netdev
Cc: Jay Vosburgh, Andrew Lunn, David S. Miller, Eric Dumazet,
Jakub Kicinski, Paolo Abeni, Simon Horman, Steffen Klassert,
Herbert Xu, linux-kernel, Jihong Min
LAG bonds need to install the same IPsec/XFRM state on every eligible
lower device, but each lower device may return a different hardware
handle. Add a replicated bonding-private XFRM state object that stores
per-lower-device instances and handles.
Use the replicated model for 802.3ad and balance-xor bonds that use the
layer3+4 transmit hash policy, and only for XFRM crypto offload states
without ESN. Install the state on every eligible running slave, capture
each lower handle, and roll back in reverse order on failure. Keep
active-backup on the existing single-lower path and expose a bonding
resolver for lower drivers that call xfrm_dev_state_lower_handle().
Assisted-by: Codex:gpt-5.5
Signed-off-by: Jihong Min <hurryman2212@gmail.com>
---
drivers/net/bonding/bond_main.c | 578 +++++++++++++++++++++++++++++++-
include/net/bonding.h | 29 +-
2 files changed, 595 insertions(+), 12 deletions(-)
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index af82a3df2c5d..66435de852e9 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -455,6 +455,432 @@ static struct net_device *bond_ipsec_dev(struct xfrm_state *xs)
return slave->dev;
}
+static void bond_ipsec_inst_rcu_free(struct rcu_head *rcu)
+{
+ struct bond_ipsec_inst *inst;
+
+ inst = container_of(rcu, struct bond_ipsec_inst, rcu);
+ netdev_put(inst->real_dev, &inst->dev_tracker);
+ kfree(inst);
+}
+
+static void bond_ipsec_rcu_free(struct rcu_head *rcu)
+{
+ struct bond_ipsec *ipsec;
+
+ ipsec = container_of(rcu, struct bond_ipsec, rcu);
+ kfree(ipsec);
+}
+
+static bool bond_ipsec_slave_has_xfrm_ops(struct net_device *real_dev)
+{
+ const struct xfrmdev_ops *ops;
+
+ if (!real_dev || netif_is_bond_master(real_dev))
+ return false;
+
+ ops = real_dev->xfrmdev_ops;
+ if (!ops)
+ return false;
+
+ return ops->xdo_dev_state_add && ops->xdo_dev_state_delete;
+}
+
+static bool bond_ipsec_lag_slave_has_ops(struct net_device *real_dev)
+{
+ return bond_ipsec_slave_has_xfrm_ops(real_dev) &&
+ real_dev->xfrmdev_ops->flags & XFRMDEV_OPS_F_LOWER_HANDLE;
+}
+
+static bool bond_ipsec_lag_slave_ok(struct net_device *real_dev)
+{
+ return (real_dev->features & NETIF_F_HW_ESP) &&
+ bond_ipsec_lag_slave_has_ops(real_dev);
+}
+
+static void bond_ipsec_lag_free_instances(struct bond_ipsec *ipsec)
+{
+ struct bond_ipsec_inst *inst, *tmp;
+
+ list_for_each_entry_safe(inst, tmp, &ipsec->inst_list, list) {
+ list_del_rcu(&inst->list);
+ call_rcu(&inst->rcu, bond_ipsec_inst_rcu_free);
+ }
+}
+
+static void bond_ipsec_lag_call_inst(struct xfrm_state *xs,
+ struct bond_ipsec_inst *inst,
+ bool delete_state,
+ bool free_state)
+{
+ unsigned long bond_handle = xs->xso.offload_handle;
+ struct net_device *bond_real_dev = xs->xso.real_dev;
+ const struct xfrmdev_ops *ops = inst->real_dev->xfrmdev_ops;
+
+ if (!inst->lower_handle)
+ return;
+
+ if (!ops)
+ return;
+
+ xs->xso.real_dev = inst->real_dev;
+ xs->xso.offload_handle = inst->lower_handle;
+ if (delete_state) {
+ WRITE_ONCE(inst->added, false);
+ if (!inst->deleted && ops->xdo_dev_state_delete) {
+ ops->xdo_dev_state_delete(inst->real_dev, xs);
+ xs->xso.offload_handle = inst->lower_handle;
+ inst->deleted = true;
+ }
+ }
+ if (free_state && ops->xdo_dev_state_free)
+ ops->xdo_dev_state_free(inst->real_dev, xs);
+ if (free_state)
+ inst->lower_handle = 0;
+
+ xs->xso.real_dev = bond_real_dev;
+ xs->xso.offload_handle = bond_handle;
+}
+
+static void bond_ipsec_lag_call_state(struct xfrm_state *xs,
+ struct bond_ipsec *ipsec,
+ bool delete_state,
+ bool free_state)
+{
+ struct bond_ipsec_inst *inst;
+
+ list_for_each_entry_reverse(inst, &ipsec->inst_list, list) {
+ bond_ipsec_lag_call_inst(xs, inst, delete_state, free_state);
+ }
+}
+
+static int bond_ipsec_lag_add_inst(struct xfrm_state *xs,
+ struct bond_ipsec_inst *inst,
+ struct netlink_ext_ack *extack)
+{
+ unsigned long bond_handle = xs->xso.offload_handle;
+ struct net_device *bond_real_dev = xs->xso.real_dev;
+ const struct xfrmdev_ops *ops;
+ int err;
+
+ if (!bond_ipsec_lag_slave_ok(inst->real_dev))
+ return -EOPNOTSUPP;
+
+ ops = inst->real_dev->xfrmdev_ops;
+ xs->xso.real_dev = inst->real_dev;
+ xs->xso.offload_handle = 0;
+ err = ops->xdo_dev_state_add(inst->real_dev, xs, extack);
+ if (err)
+ goto out;
+
+ inst->lower_handle = xs->xso.offload_handle;
+ if (!inst->lower_handle) {
+ err = -EINVAL;
+ NL_SET_ERR_MSG_MOD(extack, "Slave did not return an IPsec offload handle");
+ if (ops->xdo_dev_state_delete)
+ ops->xdo_dev_state_delete(inst->real_dev, xs);
+ if (ops->xdo_dev_state_free)
+ ops->xdo_dev_state_free(inst->real_dev, xs);
+ goto out;
+ }
+
+ inst->deleted = false;
+ inst->added = true;
+
+out:
+ xs->xso.real_dev = bond_real_dev;
+ xs->xso.offload_handle = bond_handle;
+ return err;
+}
+
+static int bond_ipsec_lag_add_sa(struct net_device *bond_dev,
+ struct xfrm_state *xs,
+ struct netlink_ext_ack *extack)
+{
+ struct bonding *bond = netdev_priv(bond_dev);
+ struct bond_ipsec_inst *inst;
+ struct bond_ipsec *ipsec;
+ struct list_head *iter;
+ struct slave *slave;
+ int err = 0;
+ int count = 0;
+
+ if (xs->xso.type != XFRM_DEV_OFFLOAD_CRYPTO) {
+ NL_SET_ERR_MSG_MOD(extack, "LAG supports only XFRM crypto offload");
+ return -EOPNOTSUPP;
+ }
+
+ if (xs->props.flags & XFRM_STATE_ESN) {
+ NL_SET_ERR_MSG_MOD(extack, "LAG does not support XFRM ESN offload");
+ return -EOPNOTSUPP;
+ }
+
+ ipsec = kmalloc_obj(*ipsec);
+ if (!ipsec)
+ return -ENOMEM;
+
+ ipsec->xs = xs;
+ ipsec->replicated = true;
+ INIT_LIST_HEAD(&ipsec->list);
+ INIT_LIST_HEAD(&ipsec->inst_list);
+
+ /* Serialize with slave down/remove and LAG eligibility changes so they
+ * cannot miss lower SAs installed before this state is published.
+ */
+ mutex_lock(&bond->ipsec_lock);
+ if (bond->ipsec_lag_blocked) {
+ err = -EAGAIN;
+ NL_SET_ERR_MSG_MOD(extack, "Bond LAG XFRM state add is blocked");
+ goto err_free_unlock;
+ }
+ if (!(bond_dev->features & NETIF_F_HW_ESP)) {
+ err = -EOPNOTSUPP;
+ NL_SET_ERR_MSG_MOD(extack, "Bond IPsec offload is disabled");
+ goto err_free_unlock;
+ }
+ if (!bond_mode_can_use_lag_xfrm(bond)) {
+ err = -EAGAIN;
+ NL_SET_ERR_MSG_MOD(extack, "Bond LAG XFRM eligibility changed");
+ goto err_free_unlock;
+ }
+ rcu_read_lock();
+ bond_for_each_slave_rcu(bond, slave, iter) {
+ struct net_device *real_dev = slave->dev;
+
+ if (!netif_running(real_dev))
+ continue;
+
+ if (!bond_ipsec_lag_slave_ok(real_dev)) {
+ err = -EOPNOTSUPP;
+ break;
+ }
+
+ inst = kzalloc_obj(*inst, GFP_ATOMIC);
+ if (!inst) {
+ err = -ENOMEM;
+ break;
+ }
+
+ inst->real_dev = real_dev;
+ netdev_hold(real_dev, &inst->dev_tracker, GFP_ATOMIC);
+ list_add_tail(&inst->list, &ipsec->inst_list);
+ count++;
+ }
+ rcu_read_unlock();
+
+ if (!err && !count)
+ err = -ENODEV;
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ NL_SET_ERR_MSG_MOD(extack, "Not all slaves support IPsec offload");
+ goto err_free_unlock;
+ }
+
+ list_for_each_entry(inst, &ipsec->inst_list, list) {
+ err = bond_ipsec_lag_add_inst(xs, inst, extack);
+ if (err)
+ goto err_delete;
+ }
+
+ xs->xso.real_dev = NULL;
+ xs->xso.offload_handle = 0;
+ if (!bond_mode_can_use_lag_xfrm(bond)) {
+ err = -EAGAIN;
+ NL_SET_ERR_MSG_MOD(extack, "Bond LAG XFRM eligibility changed");
+ goto err_delete;
+ }
+ rcu_assign_pointer(xs->xso.upper_priv, ipsec);
+ list_add(&ipsec->list, &bond->ipsec_list);
+ mutex_unlock(&bond->ipsec_lock);
+
+ return 0;
+
+err_delete:
+ bond_ipsec_lag_call_state(xs, ipsec, true, true);
+ xs->xso.real_dev = NULL;
+ xs->xso.offload_handle = 0;
+ RCU_INIT_POINTER(xs->xso.upper_priv, NULL);
+err_free_unlock:
+ mutex_unlock(&bond->ipsec_lock);
+ bond_ipsec_lag_free_instances(ipsec);
+ kfree(ipsec);
+ return err;
+}
+
+static void bond_ipsec_lag_flush_pending(struct bonding *bond)
+{
+ struct bond_ipsec *ipsec, *tmp;
+
+ /* Caller must hold ipsec_lock to serialize with LAG SA add. */
+ list_for_each_entry_safe(ipsec, tmp, &bond->ipsec_list, list) {
+ struct xfrm_dev_offload *xso;
+ struct xfrm_state *xs;
+ struct net *net;
+ bool pending;
+
+ if (!ipsec->replicated)
+ continue;
+
+ xs = ipsec->xs;
+ net = xs_net(xs);
+ spin_lock_bh(&net->xfrm.xfrm_state_lock);
+ pending = hlist_unhashed(&xs->bydst) &&
+ xs->km.state != XFRM_STATE_DEAD;
+ spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+ if (!pending)
+ continue;
+
+ xso = &xs->xso;
+ list_del(&ipsec->list);
+ RCU_INIT_POINTER(xso->upper_priv, NULL);
+ bond_ipsec_lag_call_state(xs, ipsec, true, true);
+ bond_ipsec_lag_free_instances(ipsec);
+ call_rcu(&ipsec->rcu, bond_ipsec_rcu_free);
+
+ xso->real_dev = NULL;
+ xso->offload_handle = 0;
+ if (xso->dev == bond->dev) {
+ WRITE_ONCE(xso->dev, NULL);
+ xso->dir = 0;
+ xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED;
+ netdev_put(bond->dev, &xso->dev_tracker);
+ xfrm_unset_type_offload(xs);
+ }
+ }
+}
+
+void bond_ipsec_lag_begin_flush(struct bonding *bond)
+{
+ mutex_lock(&bond->ipsec_lock);
+ bond->ipsec_lag_blocked = true;
+ bond_ipsec_lag_flush_pending(bond);
+ mutex_unlock(&bond->ipsec_lock);
+}
+
+void bond_ipsec_lag_end_flush(struct bonding *bond)
+{
+ mutex_lock(&bond->ipsec_lock);
+ bond->ipsec_lag_blocked = false;
+ mutex_unlock(&bond->ipsec_lock);
+}
+
+static void bond_ipsec_lag_remove_slave(struct bonding *bond,
+ struct net_device *real_dev)
+{
+ struct bond_ipsec_inst *inst, *tmp;
+ struct bond_ipsec *ipsec;
+ bool removed = false;
+
+ if (!bond_mode_can_use_lag_xfrm(bond))
+ return;
+
+ mutex_lock(&bond->ipsec_lock);
+ list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+ if (!ipsec->replicated)
+ continue;
+
+ list_for_each_entry(inst, &ipsec->inst_list, list) {
+ if (inst->real_dev != real_dev)
+ continue;
+
+ WRITE_ONCE(inst->added, false);
+ removed = true;
+ }
+ }
+ if (!removed)
+ goto out;
+
+ synchronize_net();
+
+ list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+ if (!ipsec->replicated)
+ continue;
+
+ list_for_each_entry_safe(inst, tmp, &ipsec->inst_list, list) {
+ if (inst->real_dev != real_dev)
+ continue;
+
+ bond_ipsec_lag_call_inst(ipsec->xs, inst, true, true);
+ list_del_rcu(&inst->list);
+ call_rcu(&inst->rcu, bond_ipsec_inst_rcu_free);
+ }
+ }
+out:
+ mutex_unlock(&bond->ipsec_lock);
+}
+
+static int bond_ipsec_lag_add_slave(struct bonding *bond,
+ struct slave *slave,
+ struct netlink_ext_ack *extack)
+{
+ struct net_device *real_dev = slave->dev;
+ struct bond_ipsec_inst *inst;
+ struct bond_ipsec *ipsec;
+ bool have_states = false;
+ bool slave_ok;
+ int err = 0;
+
+ if (!bond_mode_can_use_lag_xfrm(bond) || !netif_running(real_dev))
+ return 0;
+
+ slave_ok = bond_ipsec_lag_slave_ok(real_dev);
+
+ mutex_lock(&bond->ipsec_lock);
+ list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+ bool found = false;
+
+ if (!ipsec->replicated)
+ continue;
+ have_states = true;
+
+ if (ipsec->xs->km.state == XFRM_STATE_DEAD)
+ continue;
+
+ if (!slave_ok) {
+ err = -EOPNOTSUPP;
+ break;
+ }
+
+ list_for_each_entry(inst, &ipsec->inst_list, list) {
+ if (inst->real_dev == real_dev) {
+ found = true;
+ break;
+ }
+ }
+ if (found)
+ continue;
+
+ inst = kzalloc_obj(*inst, GFP_KERNEL);
+ if (!inst) {
+ err = -ENOMEM;
+ break;
+ }
+
+ inst->real_dev = real_dev;
+ netdev_hold(real_dev, &inst->dev_tracker, GFP_KERNEL);
+
+ err = bond_ipsec_lag_add_inst(ipsec->xs, inst, extack);
+ if (err) {
+ netdev_put(real_dev, &inst->dev_tracker);
+ kfree(inst);
+ break;
+ }
+
+ list_add_tail_rcu(&inst->list, &ipsec->inst_list);
+ }
+ mutex_unlock(&bond->ipsec_lock);
+
+ if (err && have_states) {
+ slave_warn(bond->dev, real_dev,
+ "failed to replicate IPsec SA, flushing bond states\n");
+ bond_ipsec_lag_begin_flush(bond);
+ xfrm_dev_state_flush(dev_net(bond->dev), bond->dev, true);
+ bond_ipsec_lag_end_flush(bond);
+ }
+
+ return err;
+}
+
/**
* bond_ipsec_add_sa - program device with a security association
* @bond_dev: pointer to the bond net device
@@ -475,8 +901,15 @@ static int bond_ipsec_add_sa(struct net_device *bond_dev,
if (!bond_dev)
return -EINVAL;
- rcu_read_lock();
bond = netdev_priv(bond_dev);
+ if (bond_mode_can_use_lag_xfrm(bond))
+ return bond_ipsec_lag_add_sa(bond_dev, xs, extack);
+ if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
+ NL_SET_ERR_MSG_MOD(extack, "Bond mode does not support IPsec offload");
+ return -EOPNOTSUPP;
+ }
+
+ rcu_read_lock();
slave = rcu_dereference(bond->curr_active_slave);
real_dev = slave ? slave->dev : NULL;
netdev_hold(real_dev, &tracker, GFP_ATOMIC);
@@ -504,7 +937,9 @@ static int bond_ipsec_add_sa(struct net_device *bond_dev,
if (!err) {
xs->xso.real_dev = real_dev;
ipsec->xs = xs;
+ ipsec->replicated = false;
INIT_LIST_HEAD(&ipsec->list);
+ INIT_LIST_HEAD(&ipsec->inst_list);
mutex_lock(&bond->ipsec_lock);
list_add(&ipsec->list, &bond->ipsec_list);
mutex_unlock(&bond->ipsec_lock);
@@ -523,6 +958,9 @@ static void bond_ipsec_add_sa_all(struct bonding *bond)
struct bond_ipsec *ipsec;
struct slave *slave;
+ if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP)
+ return;
+
slave = rtnl_dereference(bond->curr_active_slave);
real_dev = slave ? slave->dev : NULL;
if (!real_dev)
@@ -540,6 +978,9 @@ static void bond_ipsec_add_sa_all(struct bonding *bond)
}
list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+ if (ipsec->replicated)
+ continue;
+
/* If new state is added before ipsec_lock acquired */
if (ipsec->xs->xso.real_dev == real_dev)
continue;
@@ -568,6 +1009,19 @@ static void bond_ipsec_add_sa_all(struct bonding *bond)
mutex_unlock(&bond->ipsec_lock);
}
+static struct bond_ipsec *bond_ipsec_find(struct bonding *bond,
+ struct xfrm_state *xs)
+{
+ struct bond_ipsec *ipsec;
+
+ list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+ if (ipsec->xs == xs)
+ return ipsec;
+ }
+
+ return NULL;
+}
+
/**
* bond_ipsec_del_sa - clear out this specific SA
* @bond_dev: pointer to the bond net device
@@ -577,8 +1031,24 @@ static void bond_ipsec_del_sa(struct net_device *bond_dev,
struct xfrm_state *xs)
{
struct net_device *real_dev;
+ struct bond_ipsec *ipsec;
+ struct bonding *bond;
+
+ if (!bond_dev)
+ return;
+
+ bond = netdev_priv(bond_dev);
- if (!bond_dev || !xs->xso.real_dev)
+ mutex_lock(&bond->ipsec_lock);
+ ipsec = bond_ipsec_find(bond, xs);
+ if (ipsec && ipsec->replicated) {
+ bond_ipsec_lag_call_state(xs, ipsec, true, false);
+ mutex_unlock(&bond->ipsec_lock);
+ return;
+ }
+ mutex_unlock(&bond->ipsec_lock);
+
+ if (!xs->xso.real_dev)
return;
real_dev = xs->xso.real_dev;
@@ -600,6 +1070,9 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)
struct bond_ipsec *ipsec;
struct slave *slave;
+ if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP)
+ return;
+
slave = rtnl_dereference(bond->curr_active_slave);
real_dev = slave ? slave->dev : NULL;
if (!real_dev)
@@ -607,6 +1080,9 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)
mutex_lock(&bond->ipsec_lock);
list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+ if (ipsec->replicated)
+ continue;
+
if (!ipsec->xs->xso.real_dev)
continue;
@@ -647,23 +1123,33 @@ static void bond_ipsec_free_sa(struct net_device *bond_dev,
bond = netdev_priv(bond_dev);
mutex_lock(&bond->ipsec_lock);
- if (!xs->xso.real_dev)
+ ipsec = bond_ipsec_find(bond, xs);
+ if (ipsec && ipsec->replicated) {
+ list_del(&ipsec->list);
+ RCU_INIT_POINTER(xs->xso.upper_priv, NULL);
+ bond_ipsec_lag_call_state(xs, ipsec, false, true);
+ bond_ipsec_lag_free_instances(ipsec);
+ call_rcu(&ipsec->rcu, bond_ipsec_rcu_free);
+ xs->xso.real_dev = NULL;
+ xs->xso.offload_handle = 0;
goto out;
+ }
real_dev = xs->xso.real_dev;
+ if (!real_dev)
+ goto free_ipsec;
xs->xso.real_dev = NULL;
if (real_dev->xfrmdev_ops &&
real_dev->xfrmdev_ops->xdo_dev_state_free)
real_dev->xfrmdev_ops->xdo_dev_state_free(real_dev, xs);
-out:
- list_for_each_entry(ipsec, &bond->ipsec_list, list) {
- if (ipsec->xs == xs) {
- list_del(&ipsec->list);
- kfree(ipsec);
- break;
- }
+
+free_ipsec:
+ if (ipsec) {
+ list_del(&ipsec->list);
+ kfree(ipsec);
}
+out:
mutex_unlock(&bond->ipsec_lock);
}
@@ -674,7 +1160,17 @@ static void bond_ipsec_free_sa(struct net_device *bond_dev,
**/
static bool bond_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *xs)
{
+ struct net_device *bond_dev = xs->xso.dev;
struct net_device *real_dev;
+ struct bonding *bond;
+
+ if (!bond_dev)
+ return false;
+
+ bond = netdev_priv(bond_dev);
+ if (bond_mode_can_use_lag_xfrm(bond))
+ return xs->xso.type == XFRM_DEV_OFFLOAD_CRYPTO &&
+ rcu_access_pointer(xs->xso.upper_priv);
rcu_read_lock();
real_dev = bond_ipsec_dev(xs);
@@ -735,6 +1231,47 @@ static void bond_xfrm_update_stats(struct xfrm_state *xs)
rcu_read_unlock();
}
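+/*
+ * xdo_dev_state_lower_handle implementation for bond-owned XFRM states.
+ * lower_dev is the slave selected by the lower driver datapath. Replicated LAG
+ * state is resolved from the bond private instance list. Single-lower
+ * active-backup state is resolved from xso.real_dev/offload_handle here because
+ * xfrm_dev_state_lower_handle() delegates all bond-owned lookups to bonding.
+ * Returns 0 when lower_dev has no usable handle for the state: it is not the
+ * active-backup real_dev, it has no instance marked as added, or the bond is
+ * in neither active-backup nor a LAG XFRM capable mode.
+ */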
+static unsigned long bond_ipsec_lower_handle(struct net_device *bond_dev,
+ struct xfrm_state *xs,
+ struct net_device *lower_dev)
+{
+ struct bonding *bond = netdev_priv(bond_dev);
+ struct bond_ipsec_inst *inst;
+ struct bond_ipsec *ipsec;
+ unsigned long handle = 0;
+
+ if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) {
+ struct net_device *real_dev = READ_ONCE(xs->xso.real_dev);
+
+ return real_dev == lower_dev ? READ_ONCE(xs->xso.offload_handle) : 0;
+ }
+ if (!bond_mode_can_use_lag_xfrm(bond))
+ return 0;
+
+ rcu_read_lock();
+ ipsec = rcu_dereference(xs->xso.upper_priv);
+ if (!ipsec || !ipsec->replicated || ipsec->xs != xs)
+ goto out;
+
+ list_for_each_entry_rcu(inst, &ipsec->inst_list, list) {
+ if (READ_ONCE(inst->added) && inst->real_dev == lower_dev) {
+ handle = inst->lower_handle;
+ break;
+ }
+ }
+
+out:
+ rcu_read_unlock();
+ return handle;
+}
+
static const struct xfrmdev_ops bond_xfrmdev_ops = {
.xdo_dev_state_add = bond_ipsec_add_sa,
.xdo_dev_state_delete = bond_ipsec_del_sa,
@@ -742,7 +1279,25 @@ static const struct xfrmdev_ops bond_xfrmdev_ops = {
.xdo_dev_offload_ok = bond_ipsec_offload_ok,
.xdo_dev_state_advance_esn = bond_advance_esn_state,
.xdo_dev_state_update_stats = bond_xfrm_update_stats,
+ .xdo_dev_state_lower_handle = bond_ipsec_lower_handle,
};
+#else
+static void bond_ipsec_lag_remove_slave(struct bonding *bond,
+ struct net_device *real_dev)
+{
+}
+
+static int bond_ipsec_lag_add_slave(struct bonding *bond,
+ struct slave *slave,
+ struct netlink_ext_ack *extack)
+{
+ return 0;
+}
+
+static void bond_sync_slave_xfrm_features(struct bonding *bond,
+ struct slave *slave)
+{
+}
#endif /* CONFIG_XFRM_OFFLOAD */
/*------------------------------- Link status -------------------------------*/
@@ -6006,10 +6561,11 @@ void bond_setup(struct net_device *bond_dev)
bond_dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING);
#ifdef CONFIG_XFRM_OFFLOAD
- /* set up xfrm device ops (only supported in active-backup right now) */
+ /* set up xfrm device ops */
bond_dev->xfrmdev_ops = &bond_xfrmdev_ops;
INIT_LIST_HEAD(&bond->ipsec_list);
mutex_init(&bond->ipsec_lock);
+ bond->ipsec_lag_blocked = false;
#endif /* CONFIG_XFRM_OFFLOAD */
/* don't acquire bond device's netif_tx_lock when transmitting */
diff --git a/include/net/bonding.h b/include/net/bonding.h
index edd1942dcd73..a581252b5b06 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -203,9 +203,24 @@ struct bond_up_slave {
*/
#define BOND_LINK_NOCHANGE -1
+/* XFRM offload state tracked by bonding for one xfrm_state. */
struct bond_ipsec {
struct list_head list;
struct xfrm_state *xs;
+ struct list_head inst_list;
+ struct rcu_head rcu;
+ bool replicated;
+};
+
+/* Per-lower-device instance of a replicated LAG XFRM state. */
+struct bond_ipsec_inst {
+ struct list_head list;
+ struct net_device *real_dev;
+ netdevice_tracker dev_tracker;
+ unsigned long lower_handle;
+ struct rcu_head rcu;
+ bool added;
+ bool deleted;
};
/*
@@ -259,8 +274,9 @@ struct bonding {
struct rtnl_link_stats64 bond_stats;
#ifdef CONFIG_XFRM_OFFLOAD
struct list_head ipsec_list;
- /* protecting ipsec_list */
+ /* protecting ipsec_list and ipsec_lag_blocked */
struct mutex ipsec_lock;
+ bool ipsec_lag_blocked;
#endif /* CONFIG_XFRM_OFFLOAD */
struct bpf_prog *xdp_prog;
};
@@ -325,6 +341,13 @@ static inline bool bond_mode_can_use_xmit_hash(const struct bonding *bond)
BOND_MODE(bond) == BOND_MODE_ALB);
}
+static inline bool bond_mode_can_use_lag_xfrm(const struct bonding *bond)
+{
+ return (BOND_MODE(bond) == BOND_MODE_8023AD ||
+ BOND_MODE(bond) == BOND_MODE_XOR) &&
+ bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34;
+}
+
static inline bool bond_mode_uses_xmit_hash(const struct bonding *bond)
{
return (BOND_MODE(bond) == BOND_MODE_8023AD ||
@@ -712,6 +735,10 @@ void bond_slave_arr_work_rearm(struct bonding *bond, unsigned long delay);
void bond_peer_notify_work_rearm(struct bonding *bond, unsigned long delay);
void bond_work_init_all(struct bonding *bond);
void bond_work_cancel_all(struct bonding *bond);
+#if IS_ENABLED(CONFIG_XFRM_OFFLOAD)
+void bond_ipsec_lag_begin_flush(struct bonding *bond);
+void bond_ipsec_lag_end_flush(struct bonding *bond);
+#endif
#ifdef CONFIG_PROC_FS
void bond_create_proc_entry(struct bonding *bond);
--
2.53.0
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH RFC net-next 3/4] bonding: expose user-controlled IPsec features for LAG
2026-05-20 8:10 [PATCH RFC net-next 0/4] bonding: support LAG IPsec offload with replicated SAs Jihong Min
2026-05-20 8:10 ` [PATCH RFC net-next 1/4] xfrm: add a lower-device offload handle resolver Jihong Min
2026-05-20 8:10 ` [PATCH RFC net-next 2/4] bonding: replicate XFRM offload state across LAG slaves Jihong Min
@ 2026-05-20 8:10 ` Jihong Min
2026-05-20 8:10 ` [PATCH RFC net-next 4/4] bonding: handle replicated IPsec SAs across LAG changes Jihong Min
3 siblings, 0 replies; 5+ messages in thread
From: Jihong Min @ 2026-05-20 8:10 UTC (permalink / raw)
To: netdev
Cc: Jay Vosburgh, Andrew Lunn, David S. Miller, Eric Dumazet,
Jakub Kicinski, Paolo Abeni, Simon Horman, Steffen Klassert,
Herbert Xu, linux-kernel, Jihong Min
Expose LAG IPsec offload as user-controlled bonding features instead of
enabling it by default. Keep the existing active-backup default behavior,
but make newly eligible LAG bonds start with ESP/XFRM features explicitly
disabled so users opt in with ethtool.
Let 802.3ad and balance-xor bonds that use the layer3+4 transmit hash
policy advertise the intersection of XFRM features across their running
eligible slaves. Supported features are reported as changeable features
that start off, rather than as fixed-off capabilities.
Propagate mutable XFRM feature requests to running lower devices, verify
that requested features are actually enabled, and roll lower devices back
if propagation fails. Disable dependent ESP checksum and segmentation
features when HW ESP is not available.
Assisted-by: Codex:gpt-5.5
Signed-off-by: Jihong Min <hurryman2212@gmail.com>
---
drivers/net/bonding/bond_main.c | 232 +++++++++++++++++++++++++++++
drivers/net/bonding/bond_options.c | 2 +-
2 files changed, 233 insertions(+), 1 deletion(-)
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 66435de852e9..d81dae5a1902 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2048,6 +2048,13 @@ static netdev_features_t bond_fix_features(struct net_device *dev,
struct list_head *iter;
netdev_features_t mask;
struct slave *slave;
+#ifdef CONFIG_XFRM_OFFLOAD
+ netdev_features_t lag_xfrm_features = BOND_XFRM_FEATURES;
+ bool ab_xfrm = BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP;
+ bool lag_xfrm_ok = true;
+ bool lag_xfrm = bond_mode_can_use_lag_xfrm(bond);
+ int lag_xfrm_slaves = 0;
+#endif /* CONFIG_XFRM_OFFLOAD */
mask = features;
features = netdev_base_features(features);
@@ -2056,12 +2063,234 @@ static netdev_features_t bond_fix_features(struct net_device *dev,
features = netdev_increment_features(features,
slave->dev->features,
mask);
+#ifdef CONFIG_XFRM_OFFLOAD
+ if (lag_xfrm && (mask & BOND_XFRM_FEATURES) &&
+ netif_running(slave->dev)) {
+ netdev_features_t slave_xfrm_features;
+ netdev_features_t slave_xfrm_enableable;
+ netdev_features_t missing;
+
+ slave_xfrm_features = slave->dev->features &
+ BOND_XFRM_FEATURES;
+ slave_xfrm_enableable = slave->dev->hw_features &
+ mask & BOND_XFRM_FEATURES;
+ slave_xfrm_features |= slave_xfrm_enableable;
+ missing = (BOND_XFRM_FEATURES & mask) &
+ ~slave_xfrm_features;
+ if (missing)
+ slave_dbg(dev, slave->dev,
+ "missing LAG XFRM feature(s) %pNF\n",
+ &missing);
+ lag_xfrm_features &= slave_xfrm_features;
+
+ if (!(slave_xfrm_features & NETIF_F_HW_ESP) ||
+ !bond_ipsec_lag_slave_has_ops(slave->dev)) {
+ slave_dbg(dev, slave->dev,
+ "missing LAG XFRM offload ops\n");
+ lag_xfrm_ok = false;
+ }
+ lag_xfrm_slaves++;
+ }
+#endif /* CONFIG_XFRM_OFFLOAD */
}
features = netdev_add_tso_features(features, mask);
+#ifdef CONFIG_XFRM_OFFLOAD
+ if (!ab_xfrm && !lag_xfrm)
+ features &= ~BOND_XFRM_FEATURES;
+ else if (lag_xfrm && (!lag_xfrm_ok || !lag_xfrm_slaves))
+ features &= ~BOND_XFRM_FEATURES;
+ else if (lag_xfrm)
+ features = (features & ~BOND_XFRM_FEATURES) |
+ (lag_xfrm_features & mask);
+ if (!(features & NETIF_F_HW_ESP))
+ features &= ~(NETIF_F_HW_ESP_TX_CSUM | NETIF_F_GSO_ESP);
+#endif /* CONFIG_XFRM_OFFLOAD */
+
return features;
}
+#ifdef CONFIG_XFRM_OFFLOAD
+static int bond_set_slave_xfrm_features(struct bonding *bond,
+ struct slave *slave,
+ netdev_features_t features)
+{
+ struct net_device *real_dev = slave->dev;
+ netdev_features_t xfrm_features;
+ netdev_features_t mutable;
+ bool notifier_ctx;
+ int err = 0;
+ /*
+  * Rewrite the slave device's wanted XFRM feature bits to match @features
+  * and ask the core to re-evaluate that device's features.
+  *
+  * Only BOND_XFRM_FEATURES bits that are present in the slave's
+  * hw_features are touched; when there are none this returns 0 without
+  * changing anything.
+  *
+  * Returns 0, or the negative value from __netdev_update_features().
+  * NOTE(review): that negative value is passed through unchanged; confirm
+  * it is a meaningful errno, since bond_set_features() returns it as-is.
+  */
+ mutable = real_dev->hw_features & BOND_XFRM_FEATURES;
+ if (!mutable)
+ return 0;
+
+ xfrm_features = features & BOND_XFRM_FEATURES;
+ /*
+  * notifier_ctx is forced to true for the duration of the update and the
+  * previous value is put back afterwards - presumably so that the bond's
+  * own feature-change notifier handling does not react to this update
+  * (TODO confirm against the notifier code).
+  */
+ notifier_ctx = bond->notifier_ctx;
+ bond->notifier_ctx = true;
+ netdev_lock_ops(real_dev);
+ real_dev->wanted_features &= ~mutable; /* drop every togglable XFRM bit */
+ real_dev->wanted_features |= xfrm_features & mutable; /* re-add requested ones */
+ err = __netdev_update_features(real_dev);
+ if (err) /* any non-zero result, negative included, sends the notification */
+ netdev_features_change(real_dev);
+ netdev_unlock_ops(real_dev);
+ bond->notifier_ctx = notifier_ctx;
+
+ return err < 0 ? err : 0; /* positive results are reported as success */
+}
+
+static void bond_restore_slave_xfrm_features(struct bonding *bond,
+ netdev_features_t features)
+{
+ struct list_head *iter;
+ struct slave *slave;
+ /*
+  * Re-apply the XFRM bits of @features to every running slave through
+  * bond_set_slave_xfrm_features(). bond_set_features() calls this with
+  * the bond's current features to undo a partially applied change.
+  * The per-slave return value is ignored, so this is best effort.
+  */
+ bond_for_each_slave(bond, slave, iter) {
+ if (!netif_running(slave->dev)) /* slaves that are not running are left alone */
+ continue;
+
+ bond_set_slave_xfrm_features(bond, slave, features);
+ }
+}
+
+static void bond_sync_slave_xfrm_features(struct bonding *bond,
+ struct slave *slave)
+{
+ netdev_features_t requested = bond->dev->wanted_features;
+ netdev_features_t old_wanted = slave->dev->wanted_features;
+ netdev_features_t available;
+ netdev_features_t missing;
+ int err;
+ /*
+  * Bring one slave in line with the XFRM features currently wanted on the
+  * bond. Does nothing unless the bond mode can use LAG XFRM, the slave is
+  * running and at least one BOND_XFRM_FEATURES bit is wanted on the bond.
+  *
+  * If the slave cannot provide, or fails to enable, a wanted bit, the
+  * slave's XFRM wanted bits are put back to their value on entry and the
+  * failing bits are cleared from the bond's wanted_features. This function
+  * only edits wanted_features; it does not recompute the bond's features.
+  *
+  * NOTE(review): this definition sits inside an #ifdef CONFIG_XFRM_OFFLOAD
+  * region; verify that every caller is guarded the same way or that a
+  * stub exists for builds without that option.
+  */
+ if (!bond_mode_can_use_lag_xfrm(bond))
+ return;
+
+ if (!netif_running(slave->dev))
+ return;
+
+ requested &= BOND_XFRM_FEATURES;
+ if (!requested)
+ return;
+ /*
+  * A wanted bit that is neither active nor listed in hw_features on the
+  * slave cannot be satisfied; HW ESP additionally requires
+  * bond_ipsec_lag_slave_has_ops() to accept the slave.
+  */
+ available = slave->dev->features | slave->dev->hw_features;
+ missing = requested & ~available;
+ if ((requested & NETIF_F_HW_ESP) &&
+ !bond_ipsec_lag_slave_has_ops(slave->dev))
+ missing |= NETIF_F_HW_ESP;
+ if (missing)
+ goto disable_missing;
+ /* Try to enable, then trust only what ended up in the slave's features. */
+ err = bond_set_slave_xfrm_features(bond, slave, requested);
+ missing = requested & ~slave->dev->features;
+ if (err && !missing) /* error without a visible gap: treat all as failed */
+ missing = requested;
+ if (!missing)
+ return;
+ /* Roll the slave back to the XFRM wanted bits it had on entry. */
+ bond_set_slave_xfrm_features(bond, slave, old_wanted);
+
+disable_missing:
+ if (missing & NETIF_F_HW_ESP) /* without HW ESP the whole XFRM set is dropped */
+ missing |= BOND_XFRM_FEATURES;
+ slave_warn(bond->dev, slave->dev,
+ "disabling XFRM feature(s) %pNF after slave enable failed\n",
+ &missing);
+ bond->dev->wanted_features &= ~missing;
+}
+
+static int bond_set_features(struct net_device *dev, netdev_features_t features)
+{
+ struct bonding *bond = netdev_priv(dev);
+ netdev_features_t changed;
+ netdev_features_t enabled;
+ struct list_head *iter;
+ struct slave *slave;
+ int err = 0;
+ /*
+  * Feature-set handler for the bond device, installed as ndo_set_features.
+  * It acts only when the bond mode can use LAG XFRM and at least one
+  * BOND_XFRM_FEATURES bit differs between dev->features and @features;
+  * otherwise it returns 0 without doing anything.
+  *
+  * Sequence: pre-check the running slaves, flush bond XFRM state if HW ESP
+  * is being turned off, propagate the XFRM bits to the running slaves,
+  * then verify the result. A failure in propagation or verification puts
+  * the slaves back to the bond's current features.
+  *
+  * Returns 0 on success, -EOPNOTSUPP when a slave lacks or fails to enable
+  * a requested bit or no slave is running, or the error reported by
+  * bond_set_slave_xfrm_features().
+  */
+ if (!bond_mode_can_use_lag_xfrm(bond))
+ return 0;
+
+ changed = (dev->features ^ features) & BOND_XFRM_FEATURES;
+ if (!changed)
+ return 0;
+ /* Enabling anything requires every running slave to be able to provide it. */
+ enabled = features & BOND_XFRM_FEATURES;
+ if (enabled) {
+ int targets = 0; /* number of running slaves that passed the pre-check */
+
+ bond_for_each_slave(bond, slave, iter) {
+ netdev_features_t available;
+ netdev_features_t missing;
+
+ if (!netif_running(slave->dev))
+ continue;
+
+ available = slave->dev->features | slave->dev->hw_features;
+ missing = enabled & ~available;
+ if ((enabled & NETIF_F_HW_ESP) &&
+ !bond_ipsec_lag_slave_has_ops(slave->dev))
+ missing |= NETIF_F_HW_ESP;
+ if (missing) {
+ slave_warn(dev, slave->dev,
+ "missing XFRM feature(s) %pNF\n",
+ &missing);
+ return -EOPNOTSUPP;
+ }
+ targets++;
+ }
+ if (!targets) /* no running slave to carry the offload */
+ return -EOPNOTSUPP;
+ }
+ /*
+  * HW ESP is going from on to off: call bond_ipsec_lag_begin_flush() and
+  * xfrm_dev_state_flush() for this device before any slave is changed.
+  */
+ if ((dev->features & NETIF_F_HW_ESP) &&
+ !(features & NETIF_F_HW_ESP)) {
+ bond_ipsec_lag_begin_flush(bond);
+ xfrm_dev_state_flush(dev_net(dev), dev, true);
+ }
+
+ bond_for_each_slave(bond, slave, iter) {
+ if (!netif_running(slave->dev))
+ continue;
+
+ err = bond_set_slave_xfrm_features(bond, slave, features);
+ if (err) /* stop at the first failing slave; rollback follows */
+ break;
+ }
+ if (err) {
+ bond_restore_slave_xfrm_features(bond, dev->features);
+ if ((dev->features & NETIF_F_HW_ESP) &&
+ !(features & NETIF_F_HW_ESP)) /* same "HW ESP off" test as the flush above */
+ bond_ipsec_lag_end_flush(bond);
+ return err;
+ }
+ /* No error was returned; confirm each running slave really has the bits. */
+ bond_for_each_slave(bond, slave, iter) {
+ netdev_features_t missing = enabled & ~slave->dev->features;
+
+ if (!netif_running(slave->dev))
+ continue;
+
+ if (missing) {
+ slave_warn(dev, slave->dev,
+ "failed to enable XFRM feature(s) %pNF\n",
+ &missing);
+ bond_restore_slave_xfrm_features(bond, dev->features);
+ if ((dev->features & NETIF_F_HW_ESP) &&
+ !(features & NETIF_F_HW_ESP))
+ bond_ipsec_lag_end_flush(bond);
+ return -EOPNOTSUPP;
+ }
+ }
+ /*
+  * NOTE(review): end_flush is called here only when HW ESP ends up
+  * enabled, so a successful disable leaves the begin_flush above without
+  * a matching end_flush in this function - presumably intentional;
+  * confirm against the bond_ipsec_lag_*_flush() helpers.
+  */
+ if (features & NETIF_F_HW_ESP)
+ bond_ipsec_lag_end_flush(bond);
+
+ return 0;
+}
+#endif /* CONFIG_XFRM_OFFLOAD */
+
static int bond_header_create(struct sk_buff *skb, struct net_device *bond_dev,
unsigned short type, const void *daddr,
const void *saddr, unsigned int len)
@@ -6510,6 +6739,9 @@ static const struct net_device_ops bond_netdev_ops = {
.ndo_add_slave = bond_enslave,
.ndo_del_slave = bond_release,
.ndo_fix_features = bond_fix_features,
+#ifdef CONFIG_XFRM_OFFLOAD
+ .ndo_set_features = bond_set_features,
+#endif /* CONFIG_XFRM_OFFLOAD */
.ndo_features_check = passthru_features_check,
.ndo_get_xmit_slave = bond_xmit_get_slave,
.ndo_sk_get_lower_dev = bond_sk_get_lower_dev,
diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
index 7380cc4ee75a..634b42c0d8e9 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -885,7 +885,7 @@ static bool bond_set_xfrm_features(struct bonding *bond)
if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP)
bond->dev->wanted_features |= BOND_XFRM_FEATURES;
- else
+ else if (!bond_mode_can_use_lag_xfrm(bond))
bond->dev->wanted_features &= ~BOND_XFRM_FEATURES;
return true;
--
2.53.0
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH RFC net-next 4/4] bonding: handle replicated IPsec SAs across LAG changes
2026-05-20 8:10 [PATCH RFC net-next 0/4] bonding: support LAG IPsec offload with replicated SAs Jihong Min
` (2 preceding siblings ...)
2026-05-20 8:10 ` [PATCH RFC net-next 3/4] bonding: expose user-controlled IPsec features for LAG Jihong Min
@ 2026-05-20 8:10 ` Jihong Min
3 siblings, 0 replies; 5+ messages in thread
From: Jihong Min @ 2026-05-20 8:10 UTC (permalink / raw)
To: netdev
Cc: Jay Vosburgh, Andrew Lunn, David S. Miller, Eric Dumazet,
Jakub Kicinski, Paolo Abeni, Simon Horman, Steffen Klassert,
Herbert Xu, linux-kernel, Jihong Min
Keep replicated bonding IPsec state consistent as the LAG changes. Add
newly usable slaves to existing replicated states, remove only the
departing lower instance on down/remove, and update the usable slave
array before hiding lower handles.
Flush bond-owned XFRM offload state when a mode or hash policy change
leaves the LAG XFRM-eligible configuration. Block new LAG offload adds
while pending replicated states are cleaned up and the XFRM table is flushed.
Assisted-by: Codex:gpt-5.5
Signed-off-by: Jihong Min <hurryman2212@gmail.com>
---
drivers/net/bonding/bond_main.c | 45 ++++++++++++++++++++---
drivers/net/bonding/bond_options.c | 57 ++++++++++++++++++++++++++++++
2 files changed, 98 insertions(+), 4 deletions(-)
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index d81dae5a1902..0243950c2fa6 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3104,6 +3104,18 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev,
bpf_prog_inc(bond->xdp_prog);
}
+#ifdef CONFIG_XFRM_OFFLOAD
+ if ((bond_dev->wanted_features & BOND_XFRM_FEATURES) &&
+ bond_mode_can_use_lag_xfrm(bond)) {
+ bond_sync_slave_xfrm_features(bond, new_slave);
+ bond->notifier_ctx = true;
+ netdev_compute_master_upper_features(bond->dev, true);
+ bond->notifier_ctx = false;
+ }
+#endif /* CONFIG_XFRM_OFFLOAD */
+
+ bond_ipsec_lag_add_slave(bond, new_slave, extack);
+
/* broadcast mode uses the all_slaves to loop through slaves. */
if (bond_mode_can_use_xmit_hash(bond) ||
BOND_MODE(bond) == BOND_MODE_BROADCAST)
@@ -3222,6 +3234,9 @@ static int __bond_release_one(struct net_device *bond_dev,
}
bond_set_slave_inactive_flags(slave, BOND_SLAVE_NOTIFY_NOW);
+ if (bond_mode_can_use_xmit_hash(bond) ||
+ BOND_MODE(bond) == BOND_MODE_BROADCAST)
+ bond_update_slave_arr(bond, slave);
bond_sysfs_slave_del(slave);
@@ -3239,8 +3254,10 @@ static int __bond_release_one(struct net_device *bond_dev,
slave_warn(bond_dev, slave_dev, "failed to unload XDP program\n");
}
- /* unregister rx_handler early so bond_handle_frame wouldn't be called
- * for this slave anymore.
+ bond_ipsec_lag_remove_slave(bond, slave_dev);
+
+ /* unregister rx_handler after lower IPsec state is gone so RX cannot
+ * bypass the bond while a bond-owned SA is still installed.
*/
netdev_rx_handler_unregister(slave_dev);
@@ -4758,8 +4775,13 @@ static int bond_slave_netdev_event(unsigned long event,
if (BOND_MODE(bond) == BOND_MODE_8023AD)
bond_3ad_adapter_speed_duplex_changed(slave);
- fallthrough;
- case NETDEV_DOWN:
+ bond_sync_slave_xfrm_features(bond, slave);
+ if (bond_mode_can_use_lag_xfrm(bond)) {
+ bond->notifier_ctx = true;
+ netdev_compute_master_upper_features(bond->dev, true);
+ bond->notifier_ctx = false;
+ }
+ bond_ipsec_lag_add_slave(bond, slave, NULL);
/* Refresh slave-array if applicable!
* If the setup does not use miimon or arpmon (mode-specific!),
* then these events will not cause the slave-array to be
@@ -4771,6 +4793,19 @@ static int bond_slave_netdev_event(unsigned long event,
if (bond_mode_can_use_xmit_hash(bond))
bond_update_slave_arr(bond, NULL);
break;
+ case NETDEV_DOWN:
+ /* Refresh slave-array before deleting IPsec state so no new
+ * TX path picks this slave after its offload handle is hidden.
+ */
+ if (bond_mode_can_use_xmit_hash(bond))
+ bond_update_slave_arr(bond, slave);
+ bond_ipsec_lag_remove_slave(bond, slave_dev);
+ if (bond_mode_can_use_lag_xfrm(bond)) {
+ bond->notifier_ctx = true;
+ netdev_compute_master_upper_features(bond->dev, true);
+ bond->notifier_ctx = false;
+ }
+ break;
case NETDEV_CHANGEMTU:
/* TODO: Should slaves be allowed to
* independently alter their MTU? For
@@ -4809,10 +4844,12 @@ static int bond_slave_netdev_event(unsigned long event,
break;
case NETDEV_FEAT_CHANGE:
if (!bond->notifier_ctx) {
+ bond_sync_slave_xfrm_features(bond, slave);
bond->notifier_ctx = true;
netdev_compute_master_upper_features(bond->dev, true);
bond->notifier_ctx = false;
}
+ bond_ipsec_lag_add_slave(bond, slave, NULL);
break;
case NETDEV_RESEND_IGMP:
/* Propagate to master device */
diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
index 634b42c0d8e9..ee3ffc698d7d 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -17,6 +17,7 @@
#include <net/bonding.h>
#include <net/ndisc.h>
+#include <net/xfrm.h>
static int bond_option_active_slave_set(struct bonding *bond,
const struct bond_opt_value *newval);
@@ -894,6 +895,13 @@ static bool bond_set_xfrm_features(struct bonding *bond)
static int bond_option_mode_set(struct bonding *bond,
const struct bond_opt_value *newval)
{
+#if IS_ENABLED(CONFIG_XFRM_OFFLOAD)
+ bool old_ab_xfrm = BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP;
+ bool old_lag_xfrm = bond_mode_can_use_lag_xfrm(bond);
+ bool new_lag_xfrm;
+ bool flush_lag_xfrm = false;
+#endif
+
if (bond->xdp_prog && !bond_xdp_check(bond, newval->value))
return -EOPNOTSUPP;
@@ -918,8 +926,26 @@ static int bond_option_mode_set(struct bonding *bond,
/* don't cache arp_validate between modes */
bond->params.arp_validate = BOND_ARP_VALIDATE_NONE;
+
bond->params.mode = newval->value;
+#if IS_ENABLED(CONFIG_XFRM_OFFLOAD)
+ new_lag_xfrm = bond_mode_can_use_lag_xfrm(bond);
+ if (old_ab_xfrm && new_lag_xfrm)
+ bond->dev->wanted_features &= ~BOND_XFRM_FEATURES;
+ if (old_lag_xfrm && !new_lag_xfrm) {
+ bond_ipsec_lag_begin_flush(bond);
+ flush_lag_xfrm = true;
+ }
+
+ if (flush_lag_xfrm) {
+ if (bond->dev->reg_state == NETREG_REGISTERED)
+ xfrm_dev_state_flush(dev_net(bond->dev), bond->dev,
+ true);
+ bond_ipsec_lag_end_flush(bond);
+ }
+#endif
+
/* When changing mode, the bond device is down, we may reduce
* the bond_bcast_neigh_enabled in bond_close() if broadcast_neighbor
* enabled in 8023ad mode. Therefore, only clear broadcast_neighbor
@@ -1575,12 +1601,43 @@ static int bond_option_fail_over_mac_set(struct bonding *bond,
static int bond_option_xmit_hash_policy_set(struct bonding *bond,
const struct bond_opt_value *newval)
{
+#if IS_ENABLED(CONFIG_XFRM_OFFLOAD)
+ bool old_lag_xfrm = bond_mode_can_use_lag_xfrm(bond);
+ bool new_lag_xfrm;
+ bool flush_lag_xfrm = false;
+#endif
+
if (bond->xdp_prog && !__bond_xdp_check(BOND_MODE(bond), newval->value))
return -EOPNOTSUPP;
netdev_dbg(bond->dev, "Setting xmit hash policy to %s (%llu)\n",
newval->string, newval->value);
+
bond->params.xmit_policy = newval->value;
+#if IS_ENABLED(CONFIG_XFRM_OFFLOAD)
+ new_lag_xfrm = bond_mode_can_use_lag_xfrm(bond);
+ if (old_lag_xfrm && !new_lag_xfrm) {
+ bond_ipsec_lag_begin_flush(bond);
+ flush_lag_xfrm = true;
+ }
+
+ if (flush_lag_xfrm) {
+ if (bond->dev->reg_state == NETREG_REGISTERED)
+ xfrm_dev_state_flush(dev_net(bond->dev), bond->dev,
+ true);
+ bond_ipsec_lag_end_flush(bond);
+ }
+#endif
+
+ if (bond->dev->reg_state == NETREG_REGISTERED) {
+ bool update = false;
+
+ update |= bond_set_xfrm_features(bond);
+
+ if (update)
+ netdev_update_features(bond->dev);
+ }
+
return 0;
}
--
2.53.0
^ permalink raw reply related [flat|nested] 5+ messages in thread