Netdev List
 help / color / mirror / Atom feed
From: Jihong Min <hurryman2212@gmail.com>
To: netdev@vger.kernel.org
Cc: Jay Vosburgh <jv@jvosburgh.net>,
	Andrew Lunn <andrew+netdev@lunn.ch>,
	"David S. Miller" <davem@davemloft.net>,
	Eric Dumazet <edumazet@google.com>,
	Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
	Simon Horman <horms@kernel.org>,
	Steffen Klassert <steffen.klassert@secunet.com>,
	Herbert Xu <herbert@gondor.apana.org.au>,
	linux-kernel@vger.kernel.org, Jihong Min <hurryman2212@gmail.com>
Subject: [PATCH RFC net-next 2/4] bonding: replicate XFRM offload state across LAG slaves
Date: Wed, 20 May 2026 17:10:02 +0900	[thread overview]
Message-ID: <20260520081004.2232091-3-hurryman2212@gmail.com> (raw)
In-Reply-To: <20260520081004.2232091-1-hurryman2212@gmail.com>

LAG bonds need to install the same IPsec/XFRM state on every eligible
lower device, but each lower device may return a different hardware
handle. Add a replicated bonding-private XFRM state object that stores
per-lower-device instances and handles.

Use the replicated model for 802.3ad and balance-xor bonds whose
transmit hash policy is layer3+4. Install the state on every running
slave, capture each lower handle, and roll back in reverse order on
failure. Reject the SA if any running slave lacks NETIF_F_HW_ESP or
xfrmdev_ops with XFRMDEV_OPS_F_LOWER_HANDLE; only crypto offload
without ESN is accepted. Keep active-backup on the existing
single-lower path and expose a bonding resolver for lower drivers that
call xfrm_dev_state_lower_handle().

Assisted-by: Codex:gpt-5.5
Signed-off-by: Jihong Min <hurryman2212@gmail.com>
---
 drivers/net/bonding/bond_main.c | 578 +++++++++++++++++++++++++++++++-
 include/net/bonding.h           |  29 +-
 2 files changed, 595 insertions(+), 12 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index af82a3df2c5d..66435de852e9 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -455,6 +455,432 @@ static struct net_device *bond_ipsec_dev(struct xfrm_state *xs)
 	return slave->dev;
 }
 
+static void bond_ipsec_inst_rcu_free(struct rcu_head *rcu)
+{
+	struct bond_ipsec_inst *inst;
+
+	inst = container_of(rcu, struct bond_ipsec_inst, rcu);
+	netdev_put(inst->real_dev, &inst->dev_tracker);
+	kfree(inst);
+}
+
+static void bond_ipsec_rcu_free(struct rcu_head *rcu)
+{
+	struct bond_ipsec *ipsec;
+
+	ipsec = container_of(rcu, struct bond_ipsec, rcu);
+	kfree(ipsec);
+}
+
+static bool bond_ipsec_slave_has_xfrm_ops(struct net_device *real_dev)
+{
+	const struct xfrmdev_ops *ops;
+
+	if (!real_dev || netif_is_bond_master(real_dev))
+		return false;
+
+	ops = real_dev->xfrmdev_ops;
+	if (!ops)
+		return false;
+
+	return ops->xdo_dev_state_add && ops->xdo_dev_state_delete;
+}
+
+static bool bond_ipsec_lag_slave_has_ops(struct net_device *real_dev)
+{
+	return bond_ipsec_slave_has_xfrm_ops(real_dev) &&
+	       real_dev->xfrmdev_ops->flags & XFRMDEV_OPS_F_LOWER_HANDLE;
+}
+
+static bool bond_ipsec_lag_slave_ok(struct net_device *real_dev)
+{
+	return (real_dev->features & NETIF_F_HW_ESP) &&
+	       bond_ipsec_lag_slave_has_ops(real_dev);
+}
+
+static void bond_ipsec_lag_free_instances(struct bond_ipsec *ipsec)
+{
+	struct bond_ipsec_inst *inst, *tmp;
+
+	list_for_each_entry_safe(inst, tmp, &ipsec->inst_list, list) {
+		list_del_rcu(&inst->list);
+		call_rcu(&inst->rcu, bond_ipsec_inst_rcu_free);
+	}
+}
+
+static void bond_ipsec_lag_call_inst(struct xfrm_state *xs,
+				     struct bond_ipsec_inst *inst,
+				     bool delete_state,
+				     bool free_state)
+{
+	unsigned long bond_handle = xs->xso.offload_handle;
+	struct net_device *bond_real_dev = xs->xso.real_dev;
+	const struct xfrmdev_ops *ops = inst->real_dev->xfrmdev_ops;
+
+	if (!inst->lower_handle)
+		return;
+
+	if (!ops)
+		return;
+
+	xs->xso.real_dev = inst->real_dev;
+	xs->xso.offload_handle = inst->lower_handle;
+	if (delete_state) {
+		WRITE_ONCE(inst->added, false);
+		if (!inst->deleted && ops->xdo_dev_state_delete) {
+			ops->xdo_dev_state_delete(inst->real_dev, xs);
+			xs->xso.offload_handle = inst->lower_handle;
+			inst->deleted = true;
+		}
+	}
+	if (free_state && ops->xdo_dev_state_free)
+		ops->xdo_dev_state_free(inst->real_dev, xs);
+	if (free_state)
+		inst->lower_handle = 0;
+
+	xs->xso.real_dev = bond_real_dev;
+	xs->xso.offload_handle = bond_handle;
+}
+
+static void bond_ipsec_lag_call_state(struct xfrm_state *xs,
+				      struct bond_ipsec *ipsec,
+				      bool delete_state,
+				      bool free_state)
+{
+	struct bond_ipsec_inst *inst;
+
+	list_for_each_entry_reverse(inst, &ipsec->inst_list, list) {
+		bond_ipsec_lag_call_inst(xs, inst, delete_state, free_state);
+	}
+}
+
+static int bond_ipsec_lag_add_inst(struct xfrm_state *xs,
+				   struct bond_ipsec_inst *inst,
+				   struct netlink_ext_ack *extack)
+{
+	unsigned long bond_handle = xs->xso.offload_handle;
+	struct net_device *bond_real_dev = xs->xso.real_dev;
+	const struct xfrmdev_ops *ops;
+	int err;
+
+	if (!bond_ipsec_lag_slave_ok(inst->real_dev))
+		return -EOPNOTSUPP;
+
+	ops = inst->real_dev->xfrmdev_ops;
+	xs->xso.real_dev = inst->real_dev;
+	xs->xso.offload_handle = 0;
+	err = ops->xdo_dev_state_add(inst->real_dev, xs, extack);
+	if (err)
+		goto out;
+
+	inst->lower_handle = xs->xso.offload_handle;
+	if (!inst->lower_handle) {
+		err = -EINVAL;
+		NL_SET_ERR_MSG_MOD(extack, "Slave did not return an IPsec offload handle");
+		if (ops->xdo_dev_state_delete)
+			ops->xdo_dev_state_delete(inst->real_dev, xs);
+		if (ops->xdo_dev_state_free)
+			ops->xdo_dev_state_free(inst->real_dev, xs);
+		goto out;
+	}
+
+	inst->deleted = false;
+	inst->added = true;
+
+out:
+	xs->xso.real_dev = bond_real_dev;
+	xs->xso.offload_handle = bond_handle;
+	return err;
+}
+
+static int bond_ipsec_lag_add_sa(struct net_device *bond_dev,
+				 struct xfrm_state *xs,
+				 struct netlink_ext_ack *extack)
+{
+	struct bonding *bond = netdev_priv(bond_dev);
+	struct bond_ipsec_inst *inst;
+	struct bond_ipsec *ipsec;
+	struct list_head *iter;
+	struct slave *slave;
+	int err = 0;
+	int count = 0;
+
+	if (xs->xso.type != XFRM_DEV_OFFLOAD_CRYPTO) {
+		NL_SET_ERR_MSG_MOD(extack, "LAG supports only XFRM crypto offload");
+		return -EOPNOTSUPP;
+	}
+
+	if (xs->props.flags & XFRM_STATE_ESN) {
+		NL_SET_ERR_MSG_MOD(extack, "LAG does not support XFRM ESN offload");
+		return -EOPNOTSUPP;
+	}
+
+	ipsec = kmalloc_obj(*ipsec);
+	if (!ipsec)
+		return -ENOMEM;
+
+	ipsec->xs = xs;
+	ipsec->replicated = true;
+	INIT_LIST_HEAD(&ipsec->list);
+	INIT_LIST_HEAD(&ipsec->inst_list);
+
+	/* Serialize with slave down/remove and LAG eligibility changes so they
+	 * cannot miss lower SAs installed before this state is published.
+	 */
+	mutex_lock(&bond->ipsec_lock);
+	if (bond->ipsec_lag_blocked) {
+		err = -EAGAIN;
+		NL_SET_ERR_MSG_MOD(extack, "Bond LAG XFRM state add is blocked");
+		goto err_free_unlock;
+	}
+	if (!(bond_dev->features & NETIF_F_HW_ESP)) {
+		err = -EOPNOTSUPP;
+		NL_SET_ERR_MSG_MOD(extack, "Bond IPsec offload is disabled");
+		goto err_free_unlock;
+	}
+	if (!bond_mode_can_use_lag_xfrm(bond)) {
+		err = -EAGAIN;
+		NL_SET_ERR_MSG_MOD(extack, "Bond LAG XFRM eligibility changed");
+		goto err_free_unlock;
+	}
+	rcu_read_lock();
+	bond_for_each_slave_rcu(bond, slave, iter) {
+		struct net_device *real_dev = slave->dev;
+
+		if (!netif_running(real_dev))
+			continue;
+
+		if (!bond_ipsec_lag_slave_ok(real_dev)) {
+			err = -EOPNOTSUPP;
+			break;
+		}
+
+		inst = kzalloc_obj(*inst, GFP_ATOMIC);
+		if (!inst) {
+			err = -ENOMEM;
+			break;
+		}
+
+		inst->real_dev = real_dev;
+		netdev_hold(real_dev, &inst->dev_tracker, GFP_ATOMIC);
+		list_add_tail(&inst->list, &ipsec->inst_list);
+		count++;
+	}
+	rcu_read_unlock();
+
+	if (!err && !count)
+		err = -ENODEV;
+	if (err) {
+		if (err == -EOPNOTSUPP)
+			NL_SET_ERR_MSG_MOD(extack, "Not all slaves support IPsec offload");
+		goto err_free_unlock;
+	}
+
+	list_for_each_entry(inst, &ipsec->inst_list, list) {
+		err = bond_ipsec_lag_add_inst(xs, inst, extack);
+		if (err)
+			goto err_delete;
+	}
+
+	xs->xso.real_dev = NULL;
+	xs->xso.offload_handle = 0;
+	if (!bond_mode_can_use_lag_xfrm(bond)) {
+		err = -EAGAIN;
+		NL_SET_ERR_MSG_MOD(extack, "Bond LAG XFRM eligibility changed");
+		goto err_delete;
+	}
+	rcu_assign_pointer(xs->xso.upper_priv, ipsec);
+	list_add(&ipsec->list, &bond->ipsec_list);
+	mutex_unlock(&bond->ipsec_lock);
+
+	return 0;
+
+err_delete:
+	bond_ipsec_lag_call_state(xs, ipsec, true, true);
+	xs->xso.real_dev = NULL;
+	xs->xso.offload_handle = 0;
+	RCU_INIT_POINTER(xs->xso.upper_priv, NULL);
+err_free_unlock:
+	mutex_unlock(&bond->ipsec_lock);
+	bond_ipsec_lag_free_instances(ipsec);
+	kfree(ipsec);
+	return err;
+}
+
+static void bond_ipsec_lag_flush_pending(struct bonding *bond)
+{
+	struct bond_ipsec *ipsec, *tmp;
+
+	/* Caller must hold ipsec_lock to serialize with LAG SA add. */
+	list_for_each_entry_safe(ipsec, tmp, &bond->ipsec_list, list) {
+		struct xfrm_dev_offload *xso;
+		struct xfrm_state *xs;
+		struct net *net;
+		bool pending;
+
+		if (!ipsec->replicated)
+			continue;
+
+		xs = ipsec->xs;
+		net = xs_net(xs);
+		spin_lock_bh(&net->xfrm.xfrm_state_lock);
+		pending = hlist_unhashed(&xs->bydst) &&
+			  xs->km.state != XFRM_STATE_DEAD;
+		spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+		if (!pending)
+			continue;
+
+		xso = &xs->xso;
+		list_del(&ipsec->list);
+		RCU_INIT_POINTER(xso->upper_priv, NULL);
+		bond_ipsec_lag_call_state(xs, ipsec, true, true);
+		bond_ipsec_lag_free_instances(ipsec);
+		call_rcu(&ipsec->rcu, bond_ipsec_rcu_free);
+
+		xso->real_dev = NULL;
+		xso->offload_handle = 0;
+		if (xso->dev == bond->dev) {
+			WRITE_ONCE(xso->dev, NULL);
+			xso->dir = 0;
+			xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED;
+			netdev_put(bond->dev, &xso->dev_tracker);
+			xfrm_unset_type_offload(xs);
+		}
+	}
+}
+
+void bond_ipsec_lag_begin_flush(struct bonding *bond)
+{
+	mutex_lock(&bond->ipsec_lock);
+	bond->ipsec_lag_blocked = true;
+	bond_ipsec_lag_flush_pending(bond);
+	mutex_unlock(&bond->ipsec_lock);
+}
+
+void bond_ipsec_lag_end_flush(struct bonding *bond)
+{
+	mutex_lock(&bond->ipsec_lock);
+	bond->ipsec_lag_blocked = false;
+	mutex_unlock(&bond->ipsec_lock);
+}
+
+static void bond_ipsec_lag_remove_slave(struct bonding *bond,
+					struct net_device *real_dev)
+{
+	struct bond_ipsec_inst *inst, *tmp;
+	struct bond_ipsec *ipsec;
+	bool removed = false;
+
+	if (!bond_mode_can_use_lag_xfrm(bond))
+		return;
+
+	mutex_lock(&bond->ipsec_lock);
+	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+		if (!ipsec->replicated)
+			continue;
+
+		list_for_each_entry(inst, &ipsec->inst_list, list) {
+			if (inst->real_dev != real_dev)
+				continue;
+
+			WRITE_ONCE(inst->added, false);
+			removed = true;
+		}
+	}
+	if (!removed)
+		goto out;
+
+	synchronize_net();
+
+	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+		if (!ipsec->replicated)
+			continue;
+
+		list_for_each_entry_safe(inst, tmp, &ipsec->inst_list, list) {
+			if (inst->real_dev != real_dev)
+				continue;
+
+			bond_ipsec_lag_call_inst(ipsec->xs, inst, true, true);
+			list_del_rcu(&inst->list);
+			call_rcu(&inst->rcu, bond_ipsec_inst_rcu_free);
+		}
+	}
+out:
+	mutex_unlock(&bond->ipsec_lock);
+}
+
+static int bond_ipsec_lag_add_slave(struct bonding *bond,
+				    struct slave *slave,
+				    struct netlink_ext_ack *extack)
+{
+	struct net_device *real_dev = slave->dev;
+	struct bond_ipsec_inst *inst;
+	struct bond_ipsec *ipsec;
+	bool have_states = false;
+	bool slave_ok;
+	int err = 0;
+
+	if (!bond_mode_can_use_lag_xfrm(bond) || !netif_running(real_dev))
+		return 0;
+
+	slave_ok = bond_ipsec_lag_slave_ok(real_dev);
+
+	mutex_lock(&bond->ipsec_lock);
+	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+		bool found = false;
+
+		if (!ipsec->replicated)
+			continue;
+		have_states = true;
+
+		if (ipsec->xs->km.state == XFRM_STATE_DEAD)
+			continue;
+
+		if (!slave_ok) {
+			err = -EOPNOTSUPP;
+			break;
+		}
+
+		list_for_each_entry(inst, &ipsec->inst_list, list) {
+			if (inst->real_dev == real_dev) {
+				found = true;
+				break;
+			}
+		}
+		if (found)
+			continue;
+
+		inst = kzalloc_obj(*inst, GFP_KERNEL);
+		if (!inst) {
+			err = -ENOMEM;
+			break;
+		}
+
+		inst->real_dev = real_dev;
+		netdev_hold(real_dev, &inst->dev_tracker, GFP_KERNEL);
+
+		err = bond_ipsec_lag_add_inst(ipsec->xs, inst, extack);
+		if (err) {
+			netdev_put(real_dev, &inst->dev_tracker);
+			kfree(inst);
+			break;
+		}
+
+		list_add_tail_rcu(&inst->list, &ipsec->inst_list);
+	}
+	mutex_unlock(&bond->ipsec_lock);
+
+	if (err && have_states) {
+		slave_warn(bond->dev, real_dev,
+			   "failed to replicate IPsec SA, flushing bond states\n");
+		bond_ipsec_lag_begin_flush(bond);
+		xfrm_dev_state_flush(dev_net(bond->dev), bond->dev, true);
+		bond_ipsec_lag_end_flush(bond);
+	}
+
+	return err;
+}
+
 /**
  * bond_ipsec_add_sa - program device with a security association
  * @bond_dev: pointer to the bond net device
@@ -475,8 +901,15 @@ static int bond_ipsec_add_sa(struct net_device *bond_dev,
 	if (!bond_dev)
 		return -EINVAL;
 
-	rcu_read_lock();
 	bond = netdev_priv(bond_dev);
+	if (bond_mode_can_use_lag_xfrm(bond))
+		return bond_ipsec_lag_add_sa(bond_dev, xs, extack);
+	if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
+		NL_SET_ERR_MSG_MOD(extack, "Bond mode does not support IPsec offload");
+		return -EOPNOTSUPP;
+	}
+
+	rcu_read_lock();
 	slave = rcu_dereference(bond->curr_active_slave);
 	real_dev = slave ? slave->dev : NULL;
 	netdev_hold(real_dev, &tracker, GFP_ATOMIC);
@@ -504,7 +937,9 @@ static int bond_ipsec_add_sa(struct net_device *bond_dev,
 	if (!err) {
 		xs->xso.real_dev = real_dev;
 		ipsec->xs = xs;
+		ipsec->replicated = false;
 		INIT_LIST_HEAD(&ipsec->list);
+		INIT_LIST_HEAD(&ipsec->inst_list);
 		mutex_lock(&bond->ipsec_lock);
 		list_add(&ipsec->list, &bond->ipsec_list);
 		mutex_unlock(&bond->ipsec_lock);
@@ -523,6 +958,9 @@ static void bond_ipsec_add_sa_all(struct bonding *bond)
 	struct bond_ipsec *ipsec;
 	struct slave *slave;
 
+	if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP)
+		return;
+
 	slave = rtnl_dereference(bond->curr_active_slave);
 	real_dev = slave ? slave->dev : NULL;
 	if (!real_dev)
@@ -540,6 +978,9 @@ static void bond_ipsec_add_sa_all(struct bonding *bond)
 	}
 
 	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+		if (ipsec->replicated)
+			continue;
+
 		/* If new state is added before ipsec_lock acquired */
 		if (ipsec->xs->xso.real_dev == real_dev)
 			continue;
@@ -568,6 +1009,19 @@ static void bond_ipsec_add_sa_all(struct bonding *bond)
 	mutex_unlock(&bond->ipsec_lock);
 }
 
+static struct bond_ipsec *bond_ipsec_find(struct bonding *bond,
+					  struct xfrm_state *xs)
+{
+	struct bond_ipsec *ipsec;
+
+	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+		if (ipsec->xs == xs)
+			return ipsec;
+	}
+
+	return NULL;
+}
+
 /**
  * bond_ipsec_del_sa - clear out this specific SA
  * @bond_dev: pointer to the bond net device
@@ -577,8 +1031,24 @@ static void bond_ipsec_del_sa(struct net_device *bond_dev,
 			      struct xfrm_state *xs)
 {
 	struct net_device *real_dev;
+	struct bond_ipsec *ipsec;
+	struct bonding *bond;
+
+	if (!bond_dev)
+		return;
+
+	bond = netdev_priv(bond_dev);
 
-	if (!bond_dev || !xs->xso.real_dev)
+	mutex_lock(&bond->ipsec_lock);
+	ipsec = bond_ipsec_find(bond, xs);
+	if (ipsec && ipsec->replicated) {
+		bond_ipsec_lag_call_state(xs, ipsec, true, false);
+		mutex_unlock(&bond->ipsec_lock);
+		return;
+	}
+	mutex_unlock(&bond->ipsec_lock);
+
+	if (!xs->xso.real_dev)
 		return;
 
 	real_dev = xs->xso.real_dev;
@@ -600,6 +1070,9 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)
 	struct bond_ipsec *ipsec;
 	struct slave *slave;
 
+	if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP)
+		return;
+
 	slave = rtnl_dereference(bond->curr_active_slave);
 	real_dev = slave ? slave->dev : NULL;
 	if (!real_dev)
@@ -607,6 +1080,9 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)
 
 	mutex_lock(&bond->ipsec_lock);
 	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+		if (ipsec->replicated)
+			continue;
+
 		if (!ipsec->xs->xso.real_dev)
 			continue;
 
@@ -647,23 +1123,33 @@ static void bond_ipsec_free_sa(struct net_device *bond_dev,
 	bond = netdev_priv(bond_dev);
 
 	mutex_lock(&bond->ipsec_lock);
-	if (!xs->xso.real_dev)
+	ipsec = bond_ipsec_find(bond, xs);
+	if (ipsec && ipsec->replicated) {
+		list_del(&ipsec->list);
+		RCU_INIT_POINTER(xs->xso.upper_priv, NULL);
+		bond_ipsec_lag_call_state(xs, ipsec, false, true);
+		bond_ipsec_lag_free_instances(ipsec);
+		call_rcu(&ipsec->rcu, bond_ipsec_rcu_free);
+		xs->xso.real_dev = NULL;
+		xs->xso.offload_handle = 0;
 		goto out;
+	}
 
 	real_dev = xs->xso.real_dev;
+	if (!real_dev)
+		goto free_ipsec;
 
 	xs->xso.real_dev = NULL;
 	if (real_dev->xfrmdev_ops &&
 	    real_dev->xfrmdev_ops->xdo_dev_state_free)
 		real_dev->xfrmdev_ops->xdo_dev_state_free(real_dev, xs);
-out:
-	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
-		if (ipsec->xs == xs) {
-			list_del(&ipsec->list);
-			kfree(ipsec);
-			break;
-		}
+
+free_ipsec:
+	if (ipsec) {
+		list_del(&ipsec->list);
+		kfree(ipsec);
 	}
+out:
 	mutex_unlock(&bond->ipsec_lock);
 }
 
@@ -674,7 +1160,17 @@ static void bond_ipsec_free_sa(struct net_device *bond_dev,
  **/
 static bool bond_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *xs)
 {
+	struct net_device *bond_dev = xs->xso.dev;
 	struct net_device *real_dev;
+	struct bonding *bond;
+
+	if (!bond_dev)
+		return false;
+
+	bond = netdev_priv(bond_dev);
+	if (bond_mode_can_use_lag_xfrm(bond))
+		return xs->xso.type == XFRM_DEV_OFFLOAD_CRYPTO &&
+		       rcu_access_pointer(xs->xso.upper_priv);
 
 	rcu_read_lock();
 	real_dev = bond_ipsec_dev(xs);
@@ -735,6 +1231,47 @@ static void bond_xfrm_update_stats(struct xfrm_state *xs)
 	rcu_read_unlock();
 }
 
+/*
+ * xdo_dev_state_lower_handle implementation for bond-owned XFRM states.
+ * lower_dev is the slave selected by the lower driver datapath. Replicated LAG
+ * state is resolved from the bond private instance list. Single-lower
+ * active-backup state is resolved from xso.real_dev/offload_handle here because
+ * xfrm_dev_state_lower_handle() delegates all bond-owned lookups to bonding.
+ */
+static unsigned long bond_ipsec_lower_handle(struct net_device *bond_dev,
+					     struct xfrm_state *xs,
+					     struct net_device *lower_dev)
+{
+	struct bonding *bond = netdev_priv(bond_dev);
+	struct bond_ipsec_inst *inst;
+	struct bond_ipsec *ipsec;
+	unsigned long handle = 0;
+
+	if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) {
+		struct net_device *real_dev = READ_ONCE(xs->xso.real_dev);
+
+		return real_dev == lower_dev ? READ_ONCE(xs->xso.offload_handle) : 0;
+	}
+	if (!bond_mode_can_use_lag_xfrm(bond))
+		return 0;
+
+	rcu_read_lock();
+	ipsec = rcu_dereference(xs->xso.upper_priv);
+	if (!ipsec || !ipsec->replicated || ipsec->xs != xs)
+		goto out;
+
+	list_for_each_entry_rcu(inst, &ipsec->inst_list, list) {
+		if (READ_ONCE(inst->added) && inst->real_dev == lower_dev) {
+			handle = inst->lower_handle;
+			break;
+		}
+	}
+
+out:
+	rcu_read_unlock();
+	return handle;
+}
+
 static const struct xfrmdev_ops bond_xfrmdev_ops = {
 	.xdo_dev_state_add = bond_ipsec_add_sa,
 	.xdo_dev_state_delete = bond_ipsec_del_sa,
@@ -742,7 +1279,25 @@ static const struct xfrmdev_ops bond_xfrmdev_ops = {
 	.xdo_dev_offload_ok = bond_ipsec_offload_ok,
 	.xdo_dev_state_advance_esn = bond_advance_esn_state,
 	.xdo_dev_state_update_stats = bond_xfrm_update_stats,
+	.xdo_dev_state_lower_handle = bond_ipsec_lower_handle,
 };
+#else
+static void bond_ipsec_lag_remove_slave(struct bonding *bond,
+					struct net_device *real_dev)
+{
+}
+
+static int bond_ipsec_lag_add_slave(struct bonding *bond,
+				    struct slave *slave,
+				    struct netlink_ext_ack *extack)
+{
+	return 0;
+}
+
+static void bond_sync_slave_xfrm_features(struct bonding *bond,
+					  struct slave *slave)
+{
+}
 #endif /* CONFIG_XFRM_OFFLOAD */
 
 /*------------------------------- Link status -------------------------------*/
@@ -6006,10 +6561,11 @@ void bond_setup(struct net_device *bond_dev)
 	bond_dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING);
 
 #ifdef CONFIG_XFRM_OFFLOAD
-	/* set up xfrm device ops (only supported in active-backup right now) */
+	/* set up xfrm device ops */
 	bond_dev->xfrmdev_ops = &bond_xfrmdev_ops;
 	INIT_LIST_HEAD(&bond->ipsec_list);
 	mutex_init(&bond->ipsec_lock);
+	bond->ipsec_lag_blocked = false;
 #endif /* CONFIG_XFRM_OFFLOAD */
 
 	/* don't acquire bond device's netif_tx_lock when transmitting */
diff --git a/include/net/bonding.h b/include/net/bonding.h
index edd1942dcd73..a581252b5b06 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -203,9 +203,24 @@ struct bond_up_slave {
  */
 #define BOND_LINK_NOCHANGE -1
 
+/* XFRM offload state tracked by bonding for one xfrm_state. */
 struct bond_ipsec {
 	struct list_head list;
 	struct xfrm_state *xs;
+	struct list_head inst_list;
+	struct rcu_head rcu;
+	bool replicated;
+};
+
+/* Per-lower-device instance of a replicated LAG XFRM state. */
+struct bond_ipsec_inst {
+	struct list_head list;
+	struct net_device *real_dev;
+	netdevice_tracker dev_tracker;
+	unsigned long lower_handle;
+	struct rcu_head rcu;
+	bool added;
+	bool deleted;
 };
 
 /*
@@ -259,8 +274,9 @@ struct bonding {
 	struct rtnl_link_stats64 bond_stats;
 #ifdef CONFIG_XFRM_OFFLOAD
 	struct list_head ipsec_list;
-	/* protecting ipsec_list */
+	/* protecting ipsec_list and ipsec_lag_blocked */
 	struct mutex ipsec_lock;
+	bool ipsec_lag_blocked;
 #endif /* CONFIG_XFRM_OFFLOAD */
 	struct bpf_prog *xdp_prog;
 };
@@ -325,6 +341,13 @@ static inline bool bond_mode_can_use_xmit_hash(const struct bonding *bond)
 		BOND_MODE(bond) == BOND_MODE_ALB);
 }
 
+static inline bool bond_mode_can_use_lag_xfrm(const struct bonding *bond)
+{
+	return (BOND_MODE(bond) == BOND_MODE_8023AD ||
+		BOND_MODE(bond) == BOND_MODE_XOR) &&
+	       bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34;
+}
+
 static inline bool bond_mode_uses_xmit_hash(const struct bonding *bond)
 {
 	return (BOND_MODE(bond) == BOND_MODE_8023AD ||
@@ -712,6 +735,10 @@ void bond_slave_arr_work_rearm(struct bonding *bond, unsigned long delay);
 void bond_peer_notify_work_rearm(struct bonding *bond, unsigned long delay);
 void bond_work_init_all(struct bonding *bond);
 void bond_work_cancel_all(struct bonding *bond);
+#if IS_ENABLED(CONFIG_XFRM_OFFLOAD)
+void bond_ipsec_lag_begin_flush(struct bonding *bond);
+void bond_ipsec_lag_end_flush(struct bonding *bond);
+#endif
 
 #ifdef CONFIG_PROC_FS
 void bond_create_proc_entry(struct bonding *bond);
-- 
2.53.0

  parent reply	other threads:[~2026-05-20  8:10 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-05-20  8:10 [PATCH RFC net-next 0/4] bonding: support LAG IPsec offload with replicated SAs Jihong Min
2026-05-20  8:10 ` [PATCH RFC net-next 1/4] xfrm: add a lower-device offload handle resolver Jihong Min
2026-05-20  8:10 ` Jihong Min [this message]
2026-05-20  8:10 ` [PATCH RFC net-next 3/4] bonding: expose user-controlled IPsec features for LAG Jihong Min
2026-05-20  8:10 ` [PATCH RFC net-next 4/4] bonding: handle replicated IPsec SAs across LAG changes Jihong Min

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260520081004.2232091-3-hurryman2212@gmail.com \
    --to=hurryman2212@gmail.com \
    --cc=andrew+netdev@lunn.ch \
    --cc=davem@davemloft.net \
    --cc=edumazet@google.com \
    --cc=herbert@gondor.apana.org.au \
    --cc=horms@kernel.org \
    --cc=jv@jvosburgh.net \
    --cc=kuba@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=netdev@vger.kernel.org \
    --cc=pabeni@redhat.com \
    --cc=steffen.klassert@secunet.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox