Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next v4 08/13] dpaa2-switch: add LAG configuration API
From: Ioana Ciornei @ 2026-06-29 11:23 UTC (permalink / raw)
  To: andrew+netdev, davem, edumazet, kuba, pabeni, netdev; +Cc: linux-kernel
In-Reply-To: <20260629112309.154328-1-ioana.ciornei@nxp.com>

Add the necessary APIs to configure and control the LAG support on the
DPAA2 switch object.
 - The dpsw_lag_set() function will be used to either verify that a LAG
 configuration can be supported or to actually apply it in HW.
 - The dpsw_if_set_lag_state() will get used in the next patches to
 change the per port LAG state of a specific DPSW interface.

Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
---
Changes in v4:
- None

Changes in v3:
- Add a check in dpsw_lag_set() for cfg->num_ifs against
DPSW_MAX_LAG_IFS
- Add kerneldoc for the dpsw_lag_cfg structure.

Changes in v2:
- none
---
 .../net/ethernet/freescale/dpaa2/dpsw-cmd.h   | 18 +++++-
 drivers/net/ethernet/freescale/dpaa2/dpsw.c   | 60 +++++++++++++++++++
 drivers/net/ethernet/freescale/dpaa2/dpsw.h   | 30 ++++++++++
 3 files changed, 107 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/freescale/dpaa2/dpsw-cmd.h b/drivers/net/ethernet/freescale/dpaa2/dpsw-cmd.h
index 397d55f2bd99..9a2055c64983 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpsw-cmd.h
+++ b/drivers/net/ethernet/freescale/dpaa2/dpsw-cmd.h
@@ -12,7 +12,7 @@
 
 /* DPSW Version */
 #define DPSW_VER_MAJOR		8
-#define DPSW_VER_MINOR		9
+#define DPSW_VER_MINOR		13
 
 #define DPSW_CMD_BASE_VERSION	1
 #define DPSW_CMD_VERSION_2	2
@@ -92,11 +92,14 @@
 #define DPSW_CMDID_CTRL_IF_SET_POOLS        DPSW_CMD_ID(0x0A1)
 #define DPSW_CMDID_CTRL_IF_ENABLE           DPSW_CMD_ID(0x0A2)
 #define DPSW_CMDID_CTRL_IF_DISABLE          DPSW_CMD_ID(0x0A3)
+#define DPSW_CMDID_SET_LAG                  DPSW_CMD_V2(0x0A4)
 #define DPSW_CMDID_CTRL_IF_SET_QUEUE        DPSW_CMD_ID(0x0A6)
 
 #define DPSW_CMDID_SET_EGRESS_FLOOD         DPSW_CMD_ID(0x0AC)
 #define DPSW_CMDID_IF_SET_LEARNING_MODE     DPSW_CMD_ID(0x0AD)
 
+#define DPSW_CMDID_IF_SET_LAG_STATE         DPSW_CMD_ID(0x0B0)
+
 /* Macros for accessing command fields smaller than 1byte */
 #define DPSW_MASK(field)        \
 	GENMASK(DPSW_##field##_SHIFT + DPSW_##field##_SIZE - 1, \
@@ -552,5 +555,18 @@ struct dpsw_cmd_if_reflection {
 	/* only 2 bits from the LSB */
 	u8 filter;
 };
+
+struct dpsw_cmd_lag {
+	u8 group_id;
+	u8 num_ifs;
+	u8 pad[6];
+	u8 if_id[DPSW_MAX_LAG_IFS];
+	u8 phase;
+};
+
+struct dpsw_cmd_if_set_lag_state {
+	__le16 if_id;
+	u8 tx_enabled;
+};
 #pragma pack(pop)
 #endif /* __FSL_DPSW_CMD_H */
diff --git a/drivers/net/ethernet/freescale/dpaa2/dpsw.c b/drivers/net/ethernet/freescale/dpaa2/dpsw.c
index ab921d75deb2..f75cbdce42ba 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpsw.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpsw.c
@@ -1659,3 +1659,63 @@ int dpsw_if_remove_reflection(struct fsl_mc_io *mc_io, u32 cmd_flags, u16 token,
 
 	return mc_send_command(mc_io, &cmd);
 }
+
+/**
+ * dpsw_lag_set() - Set LAG configuration
+ * @mc_io:	Pointer to MC portal's I/O object
+ * @cmd_flags:	Command flags; one or more of 'MC_CMD_FLAG_'
+ * @token:	Token of DPSW object
+ * @cfg:	pointer to LAG configuration
+ *
+ * Return:   '0' on Success; Error code otherwise.
+ */
+int dpsw_lag_set(struct fsl_mc_io *mc_io, u32 cmd_flags, u16 token,
+		 const struct dpsw_lag_cfg *cfg)
+{
+	struct fsl_mc_command cmd = { 0 };
+	struct dpsw_cmd_lag *cmd_params;
+	int i = 0;
+
+	cmd.header = mc_encode_cmd_header(DPSW_CMDID_SET_LAG, cmd_flags, token);
+
+	if (cfg->num_ifs > DPSW_MAX_LAG_IFS)
+		return -EOPNOTSUPP;
+
+	cmd_params = (struct dpsw_cmd_lag *)cmd.params;
+	cmd_params->group_id = cfg->group_id;
+	cmd_params->num_ifs = cfg->num_ifs;
+	cmd_params->phase = cfg->phase;
+
+	for (i = 0; i < cfg->num_ifs; i++)
+		cmd_params->if_id[i] = cfg->if_id[i];
+
+	return mc_send_command(mc_io, &cmd);
+}
+
+/**
+ * dpsw_if_set_lag_state() - Change per port LAG state
+ * @mc_io:      Pointer to MC portal's I/O object
+ * @cmd_flags:  Command flags; one or more of 'MC_CMD_FLAG_'
+ * @token:      Token of DPSW object
+ * @if_id:      ID of the switch interface
+ * @tx_enabled: Value of the per port LAG state
+ *     - 0 if the interface will not be active as part of the LAG group
+ *     - 1 if the interface will be active in the LAG group
+ *
+ * Return:   '0' on Success; Error code otherwise.
+ */
+int dpsw_if_set_lag_state(struct fsl_mc_io *mc_io, u32 cmd_flags, u16 token,
+			  u16 if_id, u8 tx_enabled)
+{
+	struct dpsw_cmd_if_set_lag_state *cmd_params;
+	struct fsl_mc_command cmd = { 0 };
+
+	cmd.header = mc_encode_cmd_header(DPSW_CMDID_IF_SET_LAG_STATE,
+					  cmd_flags, token);
+
+	cmd_params = (struct dpsw_cmd_if_set_lag_state *)cmd.params;
+	cmd_params->if_id = cpu_to_le16(if_id);
+	cmd_params->tx_enabled = tx_enabled;
+
+	return mc_send_command(mc_io, &cmd);
+}
diff --git a/drivers/net/ethernet/freescale/dpaa2/dpsw.h b/drivers/net/ethernet/freescale/dpaa2/dpsw.h
index b90bd363f47a..89f0267de8e9 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpsw.h
+++ b/drivers/net/ethernet/freescale/dpaa2/dpsw.h
@@ -20,6 +20,8 @@ struct fsl_mc_io;
 
 #define DPSW_MAX_IF		64
 
+#define DPSW_MAX_LAG_IFS	8
+
 int dpsw_open(struct fsl_mc_io *mc_io, u32 cmd_flags, int dpsw_id, u16 *token);
 
 int dpsw_close(struct fsl_mc_io *mc_io, u32 cmd_flags, u16 token);
@@ -788,4 +790,32 @@ int dpsw_if_add_reflection(struct fsl_mc_io *mc_io, u32 cmd_flags, u16 token,
 
 int dpsw_if_remove_reflection(struct fsl_mc_io *mc_io, u32 cmd_flags, u16 token,
 			      u16 if_id, const struct dpsw_reflection_cfg *cfg);
+
+/* Link Aggregation Group configuration */
+
+#define DPSW_LAG_SET_PHASE_APPLY 0
+#define DPSW_LAG_SET_PHASE_CHECK 1
+
+/**
+ * struct dpsw_lag_cfg - Configuration structure for a LAG group
+ * @group_id: Link aggregation group ID. Valid values are in the
+ * [1, DPSW_MAX_LAG_IFS] range.
+ * @num_ifs: Number of interfaces in this LAG group, valid range is
+ * [0, DPSW_MAX_LAG_IFS].
+ * @if_id: Array containing the interface IDs of the ports part of a LAG group
+ * @phase: Use DPSW_LAG_SET_PHASE_APPLY for LAG configuration processing or
+ * DPSW_LAG_SET_PHASE_CHECK for LAG configuration validation.
+ */
+struct dpsw_lag_cfg {
+	u8 group_id;
+	u8 num_ifs;
+	u8 if_id[DPSW_MAX_LAG_IFS];
+	u8 phase;
+};
+
+int dpsw_lag_set(struct fsl_mc_io *mc_io, u32 cmd_flags, u16 token,
+		 const struct dpsw_lag_cfg *cfg);
+
+int dpsw_if_set_lag_state(struct fsl_mc_io *mc_io, u32 cmd_flags, u16 token,
+			  u16 if_id, u8 tx_enabled);
 #endif /* __FSL_DPSW_H */
-- 
2.25.1


^ permalink raw reply related

* [PATCH net-next v4 07/13] dpaa2-switch: consolidate unicast and multicast management
From: Ioana Ciornei @ 2026-06-29 11:23 UTC (permalink / raw)
  To: andrew+netdev, davem, edumazet, kuba, pabeni, netdev; +Cc: linux-kernel
In-Reply-To: <20260629112309.154328-1-ioana.ciornei@nxp.com>

This patch consolidates the unicast and multicast management by creating
two new functions - dpaa2_switch_port_fdb_[add|del]() - which can be
used for either uc or mc addresses. Having this common entrypoint for
both types of addresses will help us in the next patches to streamline
the same addresses but on LAG ports.

Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
---
Changes in v4:
- Moved the commit ordering, no actual code changes

Changes in v3:
- none

Changes in v2:
- The rollback in dpaa2_switch_port_mdb_add() uses the newly introduced
dpaa2_switch_port_fdb_del() helper instead of the _mc counterpart.
---
 .../ethernet/freescale/dpaa2/dpaa2-switch.c   | 39 +++++++++++++------
 1 file changed, 27 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
index 88d199befbd9..3472f5d5b08a 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
@@ -552,6 +552,28 @@ static int dpaa2_switch_port_fdb_del_mc(struct ethsw_port_priv *port_priv,
 	return err;
 }
 
+static int dpaa2_switch_port_fdb_add(struct ethsw_port_priv *port_priv,
+				     const unsigned char *addr)
+{
+	int err;
+
+	if (is_unicast_ether_addr(addr))
+		err = dpaa2_switch_port_fdb_add_uc(port_priv, addr);
+	else
+		err = dpaa2_switch_port_fdb_add_mc(port_priv, addr);
+
+	return err;
+}
+
+static int dpaa2_switch_port_fdb_del(struct ethsw_port_priv *port_priv,
+				     const unsigned char *addr)
+{
+	if (is_unicast_ether_addr(addr))
+		return dpaa2_switch_port_fdb_del_uc(port_priv, addr);
+	else
+		return dpaa2_switch_port_fdb_del_mc(port_priv, addr);
+}
+
 static void dpaa2_switch_port_get_stats(struct net_device *netdev,
 					struct rtnl_link_stats64 *stats)
 {
@@ -1880,7 +1902,7 @@ static int dpaa2_switch_port_mdb_add(struct net_device *netdev,
 {
 	struct ethsw_port_priv *port_priv = netdev_priv(netdev);
 
-	return dpaa2_switch_port_fdb_add_mc(port_priv, mdb->addr);
+	return dpaa2_switch_port_fdb_add(port_priv, mdb->addr);
 }
 
 static int dpaa2_switch_port_obj_add(struct net_device *netdev,
@@ -1984,7 +2006,7 @@ static int dpaa2_switch_port_mdb_del(struct net_device *netdev,
 {
 	struct ethsw_port_priv *port_priv = netdev_priv(netdev);
 
-	return dpaa2_switch_port_fdb_del_mc(port_priv, mdb->addr);
+	return dpaa2_switch_port_fdb_del(port_priv, mdb->addr);
 }
 
 static int dpaa2_switch_port_obj_del(struct net_device *netdev,
@@ -2325,12 +2347,8 @@ static void dpaa2_switch_event_work(struct work_struct *work)
 
 	switch (switchdev_work->event) {
 	case SWITCHDEV_FDB_ADD_TO_DEVICE:
-		if (is_unicast_ether_addr(fdb_info->addr))
-			err = dpaa2_switch_port_fdb_add_uc(netdev_priv(dev),
-							   fdb_info->addr);
-		else
-			err = dpaa2_switch_port_fdb_add_mc(netdev_priv(dev),
-							   fdb_info->addr);
+		err = dpaa2_switch_port_fdb_add(netdev_priv(dev),
+						fdb_info->addr);
 		if (err)
 			break;
 		fdb_info->offloaded = true;
@@ -2338,10 +2356,7 @@ static void dpaa2_switch_event_work(struct work_struct *work)
 					 &fdb_info->info, NULL);
 		break;
 	case SWITCHDEV_FDB_DEL_TO_DEVICE:
-		if (is_unicast_ether_addr(fdb_info->addr))
-			dpaa2_switch_port_fdb_del_uc(netdev_priv(dev), fdb_info->addr);
-		else
-			dpaa2_switch_port_fdb_del_mc(netdev_priv(dev), fdb_info->addr);
+		dpaa2_switch_port_fdb_del(netdev_priv(dev), fdb_info->addr);
 		break;
 	}
 
-- 
2.25.1


^ permalink raw reply related

* [PATCH net-next v4 06/13] dpaa2-switch: add dpaa2_switch_port_to_bridge_port() helper
From: Ioana Ciornei @ 2026-06-29 11:23 UTC (permalink / raw)
  To: andrew+netdev, davem, edumazet, kuba, pabeni, netdev; +Cc: linux-kernel
In-Reply-To: <20260629112309.154328-1-ioana.ciornei@nxp.com>

In preparation for adding offloading support for upper bond devices we
have to let the switchdev framework know if a specific bridge port is
offloaded or not, even if that brport is an upper device.

For this to happen, create the dpaa2_switch_port_to_bridge_port function
which will determine the bridge port corresponding to a particular DPAA2
switch interface and use it in the switchdev_bridge_port_offload call.

Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
---
Changes in v4:
- Split the patch so that the first part only adds the base function and
its call sites and the logic around LAG is added later in the patch
which actually adds the support for LAG.
- Moved the patch so that it's a preparatory patch

Changes in v3:
- Access lag field through rtnl_dereference() so that we adapt to the
__rcu change.
- Check that the brport is non-NULL before calling
switchdev_bridge_port_unoffload() on it.

Changes in v2:
- none
---
 .../ethernet/freescale/dpaa2/dpaa2-switch.c   | 23 ++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
index d4975d08fa44..88d199befbd9 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
@@ -2017,6 +2017,15 @@ static int dpaa2_switch_port_attr_set_event(struct net_device *netdev,
 	return notifier_from_errno(err);
 }
 
+static struct net_device *
+dpaa2_switch_port_to_bridge_port(struct ethsw_port_priv *port_priv)
+{
+	if (!port_priv->fdb->bridge_dev)
+		return NULL;
+
+	return port_priv->netdev;
+}
+
 static int dpaa2_switch_port_bridge_join(struct net_device *netdev,
 					 struct net_device *upper_dev,
 					 struct netlink_ext_ack *extack)
@@ -2024,6 +2033,7 @@ static int dpaa2_switch_port_bridge_join(struct net_device *netdev,
 	struct ethsw_port_priv *port_priv = netdev_priv(netdev);
 	struct dpaa2_switch_fdb *old_fdb = port_priv->fdb;
 	struct ethsw_core *ethsw = port_priv->ethsw_data;
+	struct net_device *brport_dev;
 	bool learn_ena;
 	int err;
 
@@ -2035,7 +2045,8 @@ static int dpaa2_switch_port_bridge_join(struct net_device *netdev,
 	dpaa2_switch_port_set_fdb(port_priv, upper_dev, true);
 
 	/* Inherit the initial bridge port learning state */
-	learn_ena = br_port_flag_is_set(netdev, BR_LEARNING);
+	brport_dev = dpaa2_switch_port_to_bridge_port(port_priv);
+	learn_ena = br_port_flag_is_set(brport_dev, BR_LEARNING);
 	err = dpaa2_switch_port_set_learning(port_priv, learn_ena);
 	port_priv->learn_ena = learn_ena;
 
@@ -2049,7 +2060,8 @@ static int dpaa2_switch_port_bridge_join(struct net_device *netdev,
 	if (err)
 		goto err_egress_flood;
 
-	err = switchdev_bridge_port_offload(netdev, netdev, NULL,
+	brport_dev = dpaa2_switch_port_to_bridge_port(port_priv);
+	err = switchdev_bridge_port_offload(brport_dev, netdev, NULL,
 					    NULL, NULL, false, extack);
 	if (err)
 		goto err_switchdev_offload;
@@ -2086,8 +2098,13 @@ static void dpaa2_switch_port_pre_bridge_leave(struct net_device *netdev)
 {
 	struct ethsw_port_priv *port_priv = netdev_priv(netdev);
 	struct ethsw_core *ethsw = port_priv->ethsw_data;
+	struct net_device *brport_dev;
+
+	brport_dev = dpaa2_switch_port_to_bridge_port(port_priv);
+	if (!brport_dev)
+		return;
 
-	switchdev_bridge_port_unoffload(netdev, NULL, NULL, NULL);
+	switchdev_bridge_port_unoffload(brport_dev, NULL, NULL, NULL);
 
 	/* Make sure that any FDB add/del operations are completed before the
 	 * bridge layout changes
-- 
2.25.1


^ permalink raw reply related

* [PATCH net-next v4 05/13] dpaa2-switch: check early if an FDB entry should be added
From: Ioana Ciornei @ 2026-06-29 11:23 UTC (permalink / raw)
  To: andrew+netdev, davem, edumazet, kuba, pabeni, netdev; +Cc: linux-kernel
In-Reply-To: <20260629112309.154328-1-ioana.ciornei@nxp.com>

Instead of waiting until the last moment to check if an FDB entry should
be added to HW, move the check earlier (before even scheduling the work
item) so that we don't just waste time.

Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
---
Changes in v4:
- none

Changes in v3:
- none

Changes in v2:
- none
---
 drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
index c7c84bf2fde7..d4975d08fa44 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
@@ -2308,8 +2308,6 @@ static void dpaa2_switch_event_work(struct work_struct *work)
 
 	switch (switchdev_work->event) {
 	case SWITCHDEV_FDB_ADD_TO_DEVICE:
-		if (!fdb_info->added_by_user || fdb_info->is_local)
-			break;
 		if (is_unicast_ether_addr(fdb_info->addr))
 			err = dpaa2_switch_port_fdb_add_uc(netdev_priv(dev),
 							   fdb_info->addr);
@@ -2323,8 +2321,6 @@ static void dpaa2_switch_event_work(struct work_struct *work)
 					 &fdb_info->info, NULL);
 		break;
 	case SWITCHDEV_FDB_DEL_TO_DEVICE:
-		if (!fdb_info->added_by_user || fdb_info->is_local)
-			break;
 		if (is_unicast_ether_addr(fdb_info->addr))
 			dpaa2_switch_port_fdb_del_uc(netdev_priv(dev), fdb_info->addr);
 		else
@@ -2350,6 +2346,9 @@ static int dpaa2_switch_port_fdb_event(struct notifier_block *nb,
 		return NOTIFY_DONE;
 	ethsw = port_priv->ethsw_data;
 
+	if (!fdb_info->added_by_user || fdb_info->is_local)
+		return NOTIFY_DONE;
+
 	switchdev_work = kzalloc_obj(*switchdev_work, GFP_ATOMIC);
 	if (!switchdev_work)
 		return NOTIFY_BAD;
-- 
2.25.1


^ permalink raw reply related

* [PATCH net-next v4 04/13] dpaa2-switch: create a separate dpaa2_switch_port_fdb_event() function
From: Ioana Ciornei @ 2026-06-29 11:23 UTC (permalink / raw)
  To: andrew+netdev, davem, edumazet, kuba, pabeni, netdev; +Cc: linux-kernel
In-Reply-To: <20260629112309.154328-1-ioana.ciornei@nxp.com>

Create a separate dpaa2_switch_port_fdb_event() function that will only
handle the FDB related events. With this change, the
dpaa2_switch_port_event() notifier handler can be written in a way that
it's easier to follow.

Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
---
Changes in v4:
- none

Changes in v3:
- Get hold of port_priv->ethsw_data only after we know the device is a
dpaa2-switch one

Changes in v2:
- none
---
 .../ethernet/freescale/dpaa2/dpaa2-switch.c   | 28 ++++++++++++++-----
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
index eacab00b586a..c7c84bf2fde7 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
@@ -2337,21 +2337,18 @@ static void dpaa2_switch_event_work(struct work_struct *work)
 	dev_put(dev);
 }
 
-/* Called under rcu_read_lock() */
-static int dpaa2_switch_port_event(struct notifier_block *nb,
-				   unsigned long event, void *ptr)
+static int dpaa2_switch_port_fdb_event(struct notifier_block *nb,
+				       unsigned long event, void *ptr)
 {
 	struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
 	struct ethsw_port_priv *port_priv = netdev_priv(dev);
 	struct ethsw_switchdev_event_work *switchdev_work;
 	struct switchdev_notifier_fdb_info *fdb_info = ptr;
-	struct ethsw_core *ethsw = port_priv->ethsw_data;
-
-	if (event == SWITCHDEV_PORT_ATTR_SET)
-		return dpaa2_switch_port_attr_set_event(dev, ptr);
+	struct ethsw_core *ethsw;
 
 	if (!dpaa2_switch_port_dev_check(dev))
 		return NOTIFY_DONE;
+	ethsw = port_priv->ethsw_data;
 
 	switchdev_work = kzalloc_obj(*switchdev_work, GFP_ATOMIC);
 	if (!switchdev_work)
@@ -2390,6 +2387,23 @@ static int dpaa2_switch_port_event(struct notifier_block *nb,
 	return NOTIFY_BAD;
 }
 
+/* Called under rcu_read_lock() */
+static int dpaa2_switch_port_event(struct notifier_block *nb,
+				   unsigned long event, void *ptr)
+{
+	struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
+
+	switch (event) {
+	case SWITCHDEV_PORT_ATTR_SET:
+		return dpaa2_switch_port_attr_set_event(dev, ptr);
+	case SWITCHDEV_FDB_ADD_TO_DEVICE:
+	case SWITCHDEV_FDB_DEL_TO_DEVICE:
+		return dpaa2_switch_port_fdb_event(nb, event, ptr);
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
 static int dpaa2_switch_port_obj_event(unsigned long event,
 				       struct net_device *netdev,
 				       struct switchdev_notifier_port_obj_info *port_obj_info)
-- 
2.25.1


^ permalink raw reply related

* [PATCH net-next v4 03/13] dpaa2-switch: extend the FDB management to cover bond scenarios
From: Ioana Ciornei @ 2026-06-29 11:22 UTC (permalink / raw)
  To: andrew+netdev, davem, edumazet, kuba, pabeni, netdev; +Cc: linux-kernel
In-Reply-To: <20260629112309.154328-1-ioana.ciornei@nxp.com>

The dpaa2_switch_fdb_for_join() function is responsible for determining
what FDB should be used by a port as a consequence of it joining a
bridge. The rule is that all DPAA2 switch ports under the same bridge
will use the FDB of the first port which joined that bridge. Extend the
function so that it also covers the scenario in which there is a
bridged bond device.

For this to happen, in case a bond device is encountered through the
bridge ports the function needs to descend one level through its lowers
as well.

Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
---
Changes in v4:
- New patch. The same idea was present also in v3 but the implementation
changed quite a bit since there was some restructuring work done to the
main function in the meantime.
---
 .../ethernet/freescale/dpaa2/dpaa2-switch.c   | 35 +++++++++++++------
 1 file changed, 25 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
index 67c639fad0db..eacab00b586a 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
@@ -71,9 +71,9 @@ static struct dpaa2_switch_fdb *
 dpaa2_switch_fdb_for_join(struct ethsw_port_priv *port_priv,
 			  struct net_device *upper_dev)
 {
-	struct ethsw_port_priv *other_port_priv;
-	struct net_device *other_dev;
-	struct list_head *iter;
+	struct ethsw_port_priv *other_port_priv = NULL;
+	struct net_device *other_dev, *other_dev2;
+	struct list_head *iter, *iter2;
 
 	/* The below call to netdev_for_each_lower_dev() demands the RTNL lock
 	 * being held. Assert on it so that it's easier to catch new code
@@ -82,17 +82,32 @@ dpaa2_switch_fdb_for_join(struct ethsw_port_priv *port_priv,
 	ASSERT_RTNL();
 
 	/* If part of a bridge, use the FDB of the first dpaa2 switch interface
-	 * to be present in that bridge
+	 * to be present in that bridge. The search descends one level through
+	 * a bridged bond's lowers as well.
 	 */
 	netdev_for_each_lower_dev(upper_dev, other_dev, iter) {
-		if (!dpaa2_switch_port_dev_check(other_dev))
-			continue;
+		if (netif_is_lag_master(other_dev)) {
+			netdev_for_each_lower_dev(other_dev, other_dev2, iter2) {
+				if (!dpaa2_switch_port_dev_check(other_dev2))
+					continue;
 
-		if (other_dev == port_priv->netdev)
-			continue;
+				if (other_dev2 == port_priv->netdev)
+					continue;
 
-		other_port_priv = netdev_priv(other_dev);
-		return other_port_priv->fdb;
+				other_port_priv = netdev_priv(other_dev2);
+				break;
+			}
+		} else {
+			if (!dpaa2_switch_port_dev_check(other_dev))
+				continue;
+
+			if (other_dev == port_priv->netdev)
+				continue;
+
+			other_port_priv = netdev_priv(other_dev);
+		}
+		if (other_port_priv)
+			return other_port_priv->fdb;
 	}
 
 	return port_priv->fdb;
-- 
2.25.1


^ permalink raw reply related

* [PATCH net-next v4 01/13] dpaa2-switch: remove unnecessary dev_mc_add/dev_mc_del calls
From: Ioana Ciornei @ 2026-06-29 11:22 UTC (permalink / raw)
  To: andrew+netdev, davem, edumazet, kuba, pabeni, netdev; +Cc: linux-kernel
In-Reply-To: <20260629112309.154328-1-ioana.ciornei@nxp.com>

The DPSW object does not implement strict address filtering thus any
call to the dev_mc_add() / dev_mc_del() is pointless. Remove these calls
from the dpaa2_switch_port_mdb_add() and dpaa2_switch_port_mdb_del()
functions.

And since the multicast addresses no longer reach the netdev->mc list,
there is no point in keeping the dpaa2_switch_port_lookup_address()
function which searches through that list to verify if the same address
is added multiple times.

Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
---
Changes in v4:
- new patch
---
 .../ethernet/freescale/dpaa2/dpaa2-switch.c   | 50 +------------------
 1 file changed, 2 insertions(+), 48 deletions(-)

diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
index 858ba844ac51..d70e6f06ac15 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
@@ -1860,44 +1860,12 @@ int dpaa2_switch_port_vlans_add(struct net_device *netdev,
 					  vlan->changed);
 }
 
-static int dpaa2_switch_port_lookup_address(struct net_device *netdev, int is_uc,
-					    const unsigned char *addr)
-{
-	struct netdev_hw_addr_list *list = (is_uc) ? &netdev->uc : &netdev->mc;
-	struct netdev_hw_addr *ha;
-
-	netif_addr_lock_bh(netdev);
-	list_for_each_entry(ha, &list->list, list) {
-		if (ether_addr_equal(ha->addr, addr)) {
-			netif_addr_unlock_bh(netdev);
-			return 1;
-		}
-	}
-	netif_addr_unlock_bh(netdev);
-	return 0;
-}
-
 static int dpaa2_switch_port_mdb_add(struct net_device *netdev,
 				     const struct switchdev_obj_port_mdb *mdb)
 {
 	struct ethsw_port_priv *port_priv = netdev_priv(netdev);
-	int err;
-
-	/* Check if address is already set on this port */
-	if (dpaa2_switch_port_lookup_address(netdev, 0, mdb->addr))
-		return -EEXIST;
 
-	err = dpaa2_switch_port_fdb_add_mc(port_priv, mdb->addr);
-	if (err)
-		return err;
-
-	err = dev_mc_add(netdev, mdb->addr);
-	if (err) {
-		netdev_err(netdev, "dev_mc_add err %d\n", err);
-		dpaa2_switch_port_fdb_del_mc(port_priv, mdb->addr);
-	}
-
-	return err;
+	return dpaa2_switch_port_fdb_add_mc(port_priv, mdb->addr);
 }
 
 static int dpaa2_switch_port_obj_add(struct net_device *netdev,
@@ -2000,22 +1968,8 @@ static int dpaa2_switch_port_mdb_del(struct net_device *netdev,
 				     const struct switchdev_obj_port_mdb *mdb)
 {
 	struct ethsw_port_priv *port_priv = netdev_priv(netdev);
-	int err;
 
-	if (!dpaa2_switch_port_lookup_address(netdev, 0, mdb->addr))
-		return -ENOENT;
-
-	err = dpaa2_switch_port_fdb_del_mc(port_priv, mdb->addr);
-	if (err)
-		return err;
-
-	err = dev_mc_del(netdev, mdb->addr);
-	if (err) {
-		netdev_err(netdev, "dev_mc_del err %d\n", err);
-		return err;
-	}
-
-	return err;
+	return dpaa2_switch_port_fdb_del_mc(port_priv, mdb->addr);
 }
 
 static int dpaa2_switch_port_obj_del(struct net_device *netdev,
-- 
2.25.1


^ permalink raw reply related

* [PATCH net-next v4 02/13] dpaa2-switch: avoid holding rtnl_lock in dpaa2_switch_event_work()
From: Ioana Ciornei @ 2026-06-29 11:22 UTC (permalink / raw)
  To: andrew+netdev, davem, edumazet, kuba, pabeni, netdev; +Cc: linux-kernel
In-Reply-To: <20260629112309.154328-1-ioana.ciornei@nxp.com>

The only reason why the rtnl_lock is held in the
dpaa2_switch_event_work() is so that there is no concurrency between the
changeupper notifier which manages the per port FDB assignment and the
workqueue which adds / deletes addresses into that forwarding database.

To avoid this kind of concurrency without the rtnl_lock, flush the event
workqueue as the last step from the pre_bridge_leave so that any
in-flight operations targeting the current FDB are finalized before the
bridge layout (and the per port FDB assignment) changes.

Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
---
Changes in v4:
- New patch.
---
 drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
index d70e6f06ac15..67c639fad0db 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
@@ -2069,7 +2069,15 @@ static int dpaa2_switch_port_restore_rxvlan(struct net_device *vdev, int vid, vo
 
 static void dpaa2_switch_port_pre_bridge_leave(struct net_device *netdev)
 {
+	struct ethsw_port_priv *port_priv = netdev_priv(netdev);
+	struct ethsw_core *ethsw = port_priv->ethsw_data;
+
 	switchdev_bridge_port_unoffload(netdev, NULL, NULL, NULL);
+
+	/* Make sure that any FDB add/del operations are completed before the
+	 * bridge layout changes
+	 */
+	flush_workqueue(ethsw->workqueue);
 }
 
 static int dpaa2_switch_port_bridge_leave(struct net_device *netdev)
@@ -2281,7 +2289,6 @@ static void dpaa2_switch_event_work(struct work_struct *work)
 	struct switchdev_notifier_fdb_info *fdb_info;
 	int err;
 
-	rtnl_lock();
 	fdb_info = &switchdev_work->fdb_info;
 
 	switch (switchdev_work->event) {
@@ -2310,7 +2317,6 @@ static void dpaa2_switch_event_work(struct work_struct *work)
 		break;
 	}
 
-	rtnl_unlock();
 	kfree(switchdev_work->fdb_info.addr);
 	kfree(switchdev_work);
 	dev_put(dev);
-- 
2.25.1


^ permalink raw reply related

* [PATCH net-next v4 00/13] dpaa2-switch: add support for LAG offload
From: Ioana Ciornei @ 2026-06-29 11:22 UTC (permalink / raw)
  To: andrew+netdev, davem, edumazet, kuba, pabeni, netdev; +Cc: linux-kernel

This patch set adds support in dpaa2-switch for offloading upper bond
devices.

The first two patches remove the necessity to hold rtnl_lock during the
event processing workqueue by ensuring that all events were processed
before any change in the FDB layout happens.

Patch #3 updates the logic around choosing the FDB that should be used
on a switch port. This is necessary since with the addition of the LAG
offload, we need to take into account all ports which are under the same
bridge, even though not directly.

The next four patches clean up the FDB event handling, making it easier
to integrate with bond devices, and also add the
dpaa2_switch_port_to_bridge_port() helper to be used in the LAG offload
support.

The 8th patch adds the necessary new APIs for the LAG configuration
while the next one uses them, both in the prechangeupper phase and the
changeupper one. Which ports can be part of the same LAG group is
configurable at boot time, thus we use the prechangeupper callback in
order to validate that a requested configuration can be offloaded or
not.

This set also extends the handling of FDBs and port objects so that they
are handled by the driver even on an offloaded bond device.

Changes in v4:
- Moved and split some patches so that any preparatory work is being
  done before the driver offloads upper bond devices.
- Add a defensive check in dpaa2_switch_port_bond_leave() for a NULL
port_priv->lag
- Extend the dpaa2_switch_prevent_bridging_with_8021q_upper() function
so that we prevent a bond device with VLAN uppers joining a bridge.
The restriction is related to VLAN management in terms of the FDB which
can change upon a topology change. VLAN uppers can only be added once
the bridge topology is set up.
- Remove all FDB management from the bond join/leave paths. Decided to
reconfigure the FDB only on bridge join/leave since the FDB determines
the forwarding domain and when a bond is not bridged, from a
configuration standpoint, the individual lowers can be viewed as
standalone.
- Moved here the update to the dpaa2_switch_port_to_bridge_port()
function so that the LAG state is taken into account.
- Add a new per LAG field - primary - which is used to keep track of the
primary port of a LAG group instead of determining each time we need to
use it.
- Set 'skb->offload_fwd_mark' only when the port is under a bridge.
- Migrate FDBs in case the primary interface of a LAG changes.
- Use lag->primary instead of determining each time the primary
interface of a LAG device
- Link to v3: https://lore.kernel.org/all/20260603143623.3712024-1-ioana.ciornei@nxp.com/

Changes in v3:
- Add a check in dpsw_lag_set() for cfg->num_ifs against
DPSW_MAX_LAG_IFS
- Add kerneldoc for the dpsw_lag_cfg structure.
- Fix logic in prechangeupper callback in order to not call
dpaa2_switch_prechangeupper_sanity_checks() on !info->linking
- Fixed up the logic in the dpaa2_switch_port_bond_join()'s error path
so that the FDBs are cleaned-up properly and we do not end-up with FDB's
leaked, meaning that they could have been marked as in-use but actually
no port was using it.
- Mark the port_priv->lag field as __rcu and use the proper accessors for
it. This will eventually become useful in a later patch when the lag
field will be accessed concurrently from the NAPI context and the
join/leave paths
- Access lag field through rtnl_dereference() so that we adapt to the
__rcu change.
- Check that the brport is non-NULL before calling
switchdev_bridge_port_unoffload() on it.
- Get hold on port_priv->ethsw_data only after we know the device is a
dpaa2-switch one
- Update dpaa2_switch_foreign_dev_check() so that we check if there is
any port in the same switch as dev which offloads foreign_dev in case
this is a bridge port.
- Add mutex_destroy on the per LAG fdb_lock
- Make sure that all FDB events were processed on the workqueue on the
.remove() path.
- Delete the refcounted entry in dpaa2_switch_lag_fdb_del() as soon as
possible, even if the HW deletion would fail
- Access the port_priv->lag field only through the proper rcu accessors.
- Change the mask so that we restrict the trap only to the link local
addresses (01:80:c2:00:00:00 to 01:80:c2:00:00:0F) instead of the entire
reserved bridge block of addresses
- Link to v2: https://lore.kernel.org/all/20260512131554.952971-1-ioana.ciornei@nxp.com/

Changes in v2:
- Extend dpaa2_switch_prechangeupper_sanity_checks() with
netdev_walk_all_lower_dev() so that checks are done on all lower devices
of a bridge, even for the lowers of a bridged bond.
- Manage better the default VLAN on bond join
- Clean-up the error path in dpaa2_switch_port_bond_join()
- Call dpaa2_switch_port_bridge_leave() in case a port is leaving a bond
which is also a bridged port
- Update dpaa2_switch_port_bond_leave() so that in case of any failure
the driver tries to cleanup the LAG offload configuration.
- Call switchdev_bridge_port_unoffload() if a switch port is leaving a
bridged bond device.
- The rollback in dpaa2_switch_port_mdb_add() uses the newly introduced
dpaa2_switch_port_fdb_del() helper instead of the _mc counterpart.
- Update dpaa2_switch_foreign_dev_check() so that we check if between
the switch port and the foreign net_device is an offloaded path. Before
this change we also checked if the foreign_dev was offloaded or not by
the switch port.
- Update the switchdev_bridge_port_unoffload() by passing it the proper
context and the notifier blocks.
- Add dev_hold() and dev_put() calls for orig_dev
- In case dev_mc_add() fails, remove the MDB address from HW with the
proper function, dpaa2_switch_lag_fdb_del() or
dpaa2_switch_port_fdb_del(), depending on the LAG offload state.
- Fix 32bit build by using BIT_ULL
- Take a reference to port_priv->lag instead of reading it multiple
times.
- Link to v1: https://lore.kernel.org/all/20260506151540.1242997-1-ioana.ciornei@nxp.com/

Ioana Ciornei (13):
  dpaa2-switch: remove unnecessary dev_mc_add/dev_mc_del calls
  dpaa2-switch: avoid holding rtnl_lock in dpaa2_switch_event_work()
  dpaa2-switch: extend the FDB management to cover bond scenarios
  dpaa2-switch: create a separate dpaa2_switch_port_fdb_event() function
  dpaa2-switch: check early if an FDB entry should be added
  dpaa2-switch: add dpaa2_switch_port_to_bridge_port() helper
  dpaa2-switch: consolidate unicast and multicast management
  dpaa2-switch: add LAG configuration API
  dpaa2-switch: add support for LAG offload
  dpaa2-switch: offload FDBs added on an upper bond device
  dpaa2-switch: offload port objects on an upper bond device
  dpaa2-switch: trap all link local reserved addresses to the CPU
  dpaa2-switch: add support for imprecise source port

 .../ethernet/freescale/dpaa2/dpaa2-switch.c   | 931 +++++++++++++++---
 .../ethernet/freescale/dpaa2/dpaa2-switch.h   |  42 +-
 .../net/ethernet/freescale/dpaa2/dpsw-cmd.h   |  18 +-
 drivers/net/ethernet/freescale/dpaa2/dpsw.c   |  60 ++
 drivers/net/ethernet/freescale/dpaa2/dpsw.h   |  30 +
 5 files changed, 948 insertions(+), 133 deletions(-)

-- 
2.25.1


^ permalink raw reply

* Re: [RFC PATCH net-next] netpoll: hold RCU while walking napi_list
From: Breno Leitao @ 2026-06-29 11:17 UTC (permalink / raw)
  To: Runyu Xiao
  Cc: Jakub Kicinski, davem, edumazet, pabeni, horms, sashal, bigeasy,
	netdev, linux-kernel, jianhao.xu
In-Reply-To: <AFEAtADoKs22QyipHhKwe4op.3.1782623057057.Hmail.220255722@seu.edu.cn>

Hello,

On Sun, Jun 28, 2026 at 01:04:17PM +0800, Runyu Xiao wrote:
> Hi,
> 
> On Sat, 27 Jun 2026 14:21:05 -0700 Jakub Kicinski wrote:
> > Please provide the stack trace from the report, rather than just saying
> > that you can trigger it.

I am really surprised to see this warning. I've been running this code with
CONFIG_PROVE_RCU_LIST for ages, and I haven't seen anything similar.

> Sure, sorry for not including it in the RFC.  The warning was from the
> reviewed reproducer used for the CONFIG_PROVE_RCU_LIST triage, not from
> a production crash.  The relevant part of the dmesg is:

Reading it, it does not come from the kernel's netpoll code at
all -- it comes from an out-of-tree module (!?)

>   WARNING: suspicious RCU usage
>   6.1.66 #3 Tainted: G           O
>   -----------------------------
>   /home/ubuntu22/msv_workspace/shared/vuln_msv.c:45 RCU-list traversed in non-reader section!!
> 
>   other info that might help us debug this:
> 
>   rcu_scheduler_active = 2, debug_locks = 1
>   no locks held by insmod/190.
> 
>   stack backtrace:
>   CPU: 1 PID: 190 Comm: insmod Tainted: G           O       6.1.66 #3

Have you tested it on a more modern kernel?

>   Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
>   Call Trace:
>    <task>
>    dump_stack_lvl+0x45/0x5d
>    lockdep_rcu_suspicious.cold+0x2d/0x64
>    poll_napi.constprop.0+0x43/0x71 [vuln_msv]
>    netpoll_poll_dev.constprop.0+0x27/0x36 [vuln_msv]
>    ? 0xffffffffc0005000
>    rcu_list_msv_init+0xe2/0x1000 [vuln_msv]

What is `vuln_msv` exactly?

Could you reproduce this from an in-kernel path instead -- a real
netpoll/netconsole/bonding caller, with the frames resolving to the kernel
rather than [vuln_msv]?

Meanwhile, NAK until the above is clarified

--
pw-bot: rejected

^ permalink raw reply

* Re: [PATCH v3] xsk: fix memory corruptions in net/core/xdp.c
From: Clement Lecigne @ 2026-06-29 11:15 UTC (permalink / raw)
  To: Fijalkowski, Maciej
  Cc: Lobakin, Aleksander, edumazet@google.com, netdev@vger.kernel.org,
	bpf@vger.kernel.org, linux-kernel@vger.kernel.org,
	kuba@kernel.org, sdf@fomichev.me, horms@kernel.org,
	john.fastabend@gmail.com, ast@kernel.org, daniel@iogearbox.net
In-Reply-To: <DM4SPRMB00455DAA85BD5AEE9784C6EB82E82@DM4SPRMB0045.namprd11.prod.outlook.com>

[-- Attachment #1: Type: text/plain, Size: 5314 bytes --]

On Mon, Jun 29, 2026 at 12:34 PM Fijalkowski, Maciej
<maciej.fijalkowski@intel.com> wrote:
>
> >
> > From: Clément Lecigne <clecigne@google.com>
> >
> > Commit 560d958c6c68 ("xsk: add generic XSk &xdp_buff -> skb conversion")
> > introduced a vulnerability in the handling of XDP_PASS for AF_XDP zero-copy
> > frames.
> >
> > Note: Currently, this specific AF_XDP zero-copy conversion path is only
> > reachable from the drivers/net/ethernet/intel/ice and
> > drivers/net/ethernet/intel/idpf drivers.
> >
> > When building an skb, xdp_build_skb_from_zc() uses the chunk size
> > (xdp->frame_sz) for the allocation. However, napi_build_skb() automatically
> > reserves space at the end of the allocation for the skb_shared_info
> > structure.
> >
> > Most high performance UMEM applications use 4K chunks, where the
> > corruption cannot happen. However, if the UMEM is configured with 2KB
> > chunks (a very common configuration to maximize packet density in memory),
> > a standard 1500 MTU packet will trigger the corruption because the required
> > space exceeds the 2048 byte chunk size:
> >
> > Headroom (256) + Packet (1514) + skb_shared_info (320) = 2090 bytes
> >
> > Because 2090 bytes > 2048 bytes and __skb_put() does not perform bounds
> > checking, the memcpy() writes past the available linear data area and
> > corrupts the skb_shared_info structure. This can lead to arbitrary code
> > execution if pointers like destructor_arg are overwritten.
> >
> > Additionally, in xdp_copy_frags_from_zc(), the allocation size is set
> > strictly to the fragment size (len), but the subsequent memcpy() uses
> > LARGEST_ALIGN(len). This mismatch results in an out-of-bounds write of
> > up to 7 bytes, which triggers KASAN warnings and is unsafe despite typical
> > page pool allocator padding.
> >
> > Fix the skb allocation in xdp_build_skb_from_zc() by dynamically
> > calculating the exact truesize required using SKB_HEAD_ALIGN() to
> > properly account for the headroom, the LARGEST_ALIGN(len), and the
> > skb_shared_info overhead.
> >
> > Fix the out-of-bounds write in xdp_copy_frags_from_zc() by rounding up
> > the allocation request using LARGEST_ALIGN(len) to match the copy
> > operation.
> >
> > Fixes: 560d958c6c68 ("xsk: add generic XSk &xdp_buff -> skb conversion")
> > CC: Alexander Lobakin <aleksander.lobakin@intel.com>
> > CC: Eric Dumazet <edumazet@google.com>
> > Signed-off-by: Clément Lecigne <clecigne@google.com>
>
> Hi Clement,
>
> Do you have a reproducer for mentioned issue or is it only a fix from
> theoretical POV?
>
> To be clear, we were addressing headroom issues in this series:
> https://lore.kernel.org/bpf/20260402154958.562179-1-maciej.fijalkowski@intel.com/
>
> so I wanted to ask if you are able to have this malformed setup for
> 2k chunk size. That series should not allow for that.

I didn't manage to build a malformed setup and only used a LKM to reproduce
the issue artificially. I shared some more details with you privately.

Thanks,
-clem

>
> I think this is the second time someone is trying to fix this area of code,
> so it is not a nack or something, let us fix this, but I wanted to have
> us on the same page.
>
> Thanks,
> Maciej
>
> > ---
> > Changes since v2:
> >  - Used LARGEST_ALIGN to calculate the len to account for the aligned
> > memcpy.
> >  - Fixed the commit message to include the idpf driver.
> >
> > Changes since v1:
> >  - Used SKB_HEAD_ALIGN to properly calculate the required allocation size
> >    including the skb_shared_info overhead.
> >  - Re-ordered variable declarations.
> >
> > ---
> > diff --git a/net/core/xdp.c b/net/core/xdp.c
> > index 9890a30584ba..7e39f17ad407 100644
> > --- a/net/core/xdp.c
> > +++ b/net/core/xdp.c
> > @@ -698,8 +698,8 @@ static noinline bool xdp_copy_frags_from_zc(struct
> > sk_buff *skb,
> >
> >       for (u32 i = 0; i < nr_frags; i++) {
> >               const skb_frag_t *frag = &xinfo->frags[i];
> > -             u32 len = skb_frag_size(frag);
> > -             u32 offset, truesize = len;
> > +             u32 offset, len = skb_frag_size(frag);
> > +             u32 truesize = LARGEST_ALIGN(len);
> >               struct page *page;
> >
> >               page = page_pool_dev_alloc(pp, &offset, &truesize);
> > @@ -738,9 +738,10 @@ static noinline bool xdp_copy_frags_from_zc(struct
> > sk_buff *skb,
> >   */
> >  struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp)
> >  {
> > +     u32 headroom = xdp->data_meta - xdp->data_hard_start;
> >       const struct xdp_rxq_info *rxq = xdp->rxq;
> > -     u32 len = xdp->data_end - xdp->data_meta;
> > -     u32 truesize = xdp->frame_sz;
> > +     u32 len = LARGEST_ALIGN(xdp->data_end - xdp->data_meta);
> > +     u32 truesize = SKB_HEAD_ALIGN(headroom + len);
> >       struct sk_buff *skb = NULL;
> >       struct page_pool *pp;
> >       int metalen;
> > @@ -762,7 +763,7 @@ struct sk_buff *xdp_build_skb_from_zc(struct
> > xdp_buff *xdp)
> >       }
> >
> >       skb_mark_for_recycle(skb);
> > -     skb_reserve(skb, xdp->data_meta - xdp->data_hard_start);
> > +     skb_reserve(skb, headroom);
> >
> >       memcpy(__skb_put(skb, len), xdp->data_meta,
> > LARGEST_ALIGN(len));
> >
>

[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/pkcs7-signature, Size: 5281 bytes --]

^ permalink raw reply

* Re: [PATCH] net: stmmac: fix missed le32_to_cpu()
From: Ben Dooks @ 2026-06-29 11:11 UTC (permalink / raw)
  To: Maxime Chevallier, Jakub Kicinski
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Paolo Abeni,
	Maxime Coquelin, Alexandre Torgue, Russell King (Oracle), netdev,
	linux-stm32, linux-arm-kernel, linux-kernel
In-Reply-To: <2a92fd9d-42b3-4564-b784-ec504d4d82b8@bootlin.com>

On 25/06/2026 08:07, Maxime Chevallier wrote:
> 
> 
> On 6/25/26 04:22, Jakub Kicinski wrote:
>> On Mon, 22 Jun 2026 19:51:39 +0200 Maxime Chevallier wrote:
>>> Hi Ben,
>>>
>>> On 6/22/26 16:37, Ben Dooks wrote:
>>>> The print in ndesc_display_ring() sends the des2 and des3
>>>> to the pr_info() without passing them through the relevant
>>>> conversion to cpu order.
>>>>
>>>> Fix the (prototype) sparse warnings by using le32_to_cpu():
>>>> drivers/net/ethernet/stmicro/stmmac/norm_desc.c:258:17: warning: incorrect type in argument 6 (different base types)
>>>> drivers/net/ethernet/stmicro/stmmac/norm_desc.c:258:17:    expected unsigned int
>>>> drivers/net/ethernet/stmicro/stmmac/norm_desc.c:258:17:    got restricted __le32 [usertype] des2
>>>> drivers/net/ethernet/stmicro/stmmac/norm_desc.c:258:17: warning: incorrect type in argument 7 (different base types)
>>>> drivers/net/ethernet/stmicro/stmmac/norm_desc.c:258:17:    expected unsigned int
>>>> drivers/net/ethernet/stmicro/stmmac/norm_desc.c:258:17:    got restricted __le32 [usertype] des3
>>>>
>>>> Signed-off-by: Ben Dooks <ben.dooks@codethink.co.uk>
>>>
>>> I agree on the principle, but this isn't a fix so this'll have to wait
>>> until net-next re-opens :)
>>
>> Humpf, why are we not seeing this on x86 allmodconfig ? 🤔️
>>
>> $ make C=1 W=1 drivers/net/ethernet/stmicro/stmmac/norm_desc.o
>>    DESCEND objtool
>>    CC [M]  drivers/net/ethernet/stmicro/stmmac/norm_desc.o
>>    CHECK   drivers/net/ethernet/stmicro/stmmac/norm_desc.c
>> $
> 
> Heh good point indeed !
>    
>>>> Fix the (prototype) sparse warnings by using le32_to_cpu():
> 
> Ben, what's this "prototype" sparse ? a custom tool of yours that
> you used to find that ?

I have an RFC to add variadic and thus also printf/scanf formatting
to sparse. This is waiting on review after the original got re-worked
to add scanf and a few other bug-fixes and shuffles.

Ref: https://marc.info/?l=linux-sparse&m=178185274600679&w=2


-- 
Ben Dooks				http://www.codethink.co.uk/
Senior Engineer				Codethink - Providing Genius

https://www.codethink.co.uk/privacy.html

^ permalink raw reply

* Re: [RFC net-next 08/15] ipxlat: add translation engine and dispatch core
From: Toke Høiland-Jørgensen @ 2026-06-29 11:08 UTC (permalink / raw)
  To: Ralf Lici
  Cc: netdev, Daniel Gröber, Antonio Quartulli, Andrew Lunn,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	linux-kernel, Pablo Neira Ayuso, Florian Westphal, Phil Sutter,
	Beniamino Galvani
In-Reply-To: <20260624161854.686569-1-ralf@mandelbit.com>

Ralf Lici <ralf@mandelbit.com> writes:

> On Tue, 23 Jun 2026 21:59:44 +0200, Toke Høiland-Jørgensen <toke@kernel.org> wrote:
>> Ralf Lici <ralf@mandelbit.com> writes:
>> > On the BPF point specifically: I agree a BPF program should be able to
>> > decide whether to translate. What I am less sure about is whether
>> > redirecting to a netdevice is the best way to expose that. A TC action
>> > (yet another model, I know :)) gives you the same thing in-pipeline and
>> > more directly:
>> >
>> >     tc filter add dev wwan0 egress \
>> >         bpf obj match.o action ipxlat4to6 domain clat0
>> >
>> > Let BPF make the policy decision, with the native action doing the
>> > translation work that the current BPF CLAT implementations have trouble
>> > with: fragmentation, checksum corner cases, and ICMP error inner
>> > headers (as explained by Beniamino).
>> >
>> > So TC clsact looks like the natural in-kernel replacement for today's
>> > TC-BPF CLAT programs: no extra netdev, you attach to the existing
>> > uplink, direction is explicit, and on egress you sit on the real route
>> > dst, so the synthetic-dst and double-routing problems above just don't
>> > arise. The cost is more moving parts than a single bpf_redirect since
>> > userspace has to manage clsact, filters, priorities and action
>> > lifecycle/cleanup.
>>
>> Hmm, so no one really uses the bpf filter mechanism, since you can just
>> do everything from an action anyway (and with TCX attachment, you can
>> even avoid the overhead of the TC filter/action infrastructure
>> entirely). However, point taken wrt how to integrate this with BPF. I
>> guess the most flexible thing would be to expose the functionality
>> directly (as a kfunc callable from a BPF program). Which also fits with
>> your point below:
>>
>
> Ah, I see, the cls_bpf example was dated, and I like the kfunc angle
> better than a new TC action.
>
> I would probably keep that as the minimal per-packet interface: BPF can
> decide whether a packet should be translated, and the kfunc can do the
> actual translation work for packets whose translated form still fits the
> output MTU. The full 4->6 fragmentation case still looks like
> output-path/harness territory to me, since it is a 1->N fan-out
> operation.

Yeah, that would probably be fine; I would expect that in most cases
you'd want to configure your MTU to avoid fragmentation anyway :)

>> > For a gateway translator, though, I still think a device-bound model is
>> > less natural. There the translation point is more like a forwarding
>> > decision across routes and nexthops, so a route/LWT attachment, or
>> > possibly a netfilter attachment seems easier to reason about. Also, as
>> > you already pointed out while discussing LWT, an admin setting up NAT64
>> > is more likely to reach for an nft rule than for a clsact filter on a
>> > specific device.
>> >
>> > Taking a step back, ipxlat is really a generic translation engine plus a
>> > thin harness around it. So rather than pick one attachment, it might be
>> > worth structuring the engine so different harnesses can drive it.
>> > There's interesting precedent for this shape:
>> >
>> > - ILA, again, is the closest sibling: stateless IPv6 address translation
>> >   with a shared core in ila_common.c, driven both by an LWT frontend in
>> >   ila_lwt.c and by an inline netfilter hook with a netlink-configured
>> >   mapping table in ila_xlat.c.
>> >
>> > - act_ct is the precedent for the TC side specifically: a TC action that
>> >   reuses the netfilter conntrack engine rather than reimplementing it.
>> >
>> > And act_nat is the cautionary counter-example: a standalone TC
>> > reimplementation of stateless NAT that shares no code with nf_nat, and
>> > carries a "would be nice to share code" comment :)
>> >
>> > So I am wondering whether the right direction is to factor the
>> > translation engine cleanly, land it with one harness first, and keep the
>> > other attachment points as follow-up work once the core semantics are
>> > settled.
>> >
>> > Does that direction seem reasonable to you?
>>
>> Yes, reusable functionality that can be called from multiple places
>> sounds like a good fit; let's try to structure it that way!
>>
>
> Great, that's the direction I'll take then.
>
>> As for which hook to start with, well, let's see if we hear back from
>> the netfilter devs, but either netfilter or the routing subsystem (LWT
>> style) would be OK for me I think.
>>
>
> Works for me. The engine factoring is common to all of them, so I'll
> start there. Once it's in shape I can sketch a harness against it to
> sanity-check the interface.

Awesome, sounds good!

-Toke

^ permalink raw reply

* Re: [PATCH net v4] net/mlx5e: macsec: fix use-after-free of metadata_dst on RX SC delete
From: Tariq Toukan @ 2026-06-29 11:04 UTC (permalink / raw)
  To: Doruk Tan Ozturk, saeedm, leon, tariqt, mbloch, sd, andrew+netdev,
	davem, edumazet, kuba, pabeni
  Cc: horms, borisp, raeds, ehakim, netdev, linux-rdma, linux-kernel,
	stable
In-Reply-To: <20260627223059.29917-1-doruk@0sec.ai>



On 28/06/2026 1:30, Doruk Tan Ozturk wrote:
> When an offloaded MACsec RX SC is deleted, macsec_del_rxsc_ctx() freed
> the per-SC metadata_dst with metadata_dst_free(), which kfree()s the
> object unconditionally and ignores the dst reference count. The RX
> datapath in mlx5e_macsec_offload_handle_rx_skb() looks up the SC under
> rcu_read_lock() via xa_load(), takes a reference with dst_hold() and
> attaches the dst to the skb with skb_dst_set(). A reader that already
> obtained the rx_sc pointer can race with the delete path and operate on
> freed memory.
> 
> Fix the owner side by dropping the reference with dst_release() instead
> of freeing unconditionally, and convert the RX datapath to
> dst_hold_safe() so a reader racing the SC delete cannot attach a dst
> whose last reference was just dropped; only attach it when a reference
> was actually taken.
> 
> mlx5e_macsec_add_rxsc() also published sc_xarray_element via xa_alloc()
> before rx_sc->md_dst was allocated and initialised, so a datapath reader
> that looked the SC up by fs_id could observe rx_sc with md_dst still
> NULL or, on weakly-ordered architectures, a non-NULL md_dst pointer
> whose contents were not yet visible. NULL-check the xa_load() result and
> md_dst on the datapath, and reorder add_rxsc() so the xa_alloc() publish
> happens only after md_dst is fully initialised; the xarray RCU publish
> then pairs with the rcu_read_lock()/xa_load() in the datapath.
> 
> Note: macsec_del_rxsc_ctx() also kfree()s rx_sc->sc_xarray_element
> without an RCU grace period while the same datapath reads it under
> rcu_read_lock(); that is a separate pre-existing issue left to a
> follow-up patch.
> 
> Found by 0sec automated security-research tooling (https://0sec.ai).
> 
> Fixes: b7c9400cbc48 ("net/mlx5e: Implement MACsec Rx data path using MACsec skb_metadata_dst")
> Cc: stable@vger.kernel.org
> Signed-off-by: Doruk Tan Ozturk <doruk@0sec.ai>
> ---

Reviewed-by: Tariq Toukan <tariqt@nvidia.com>

Thanks.

^ permalink raw reply

* Re: [PATCH] mptcp: only honor zero-length DATA_FIN when a mapping is present
From: Michael Bommarito @ 2026-06-29 11:00 UTC (permalink / raw)
  To: Paolo Abeni
  Cc: Matthieu Baerts, Mat Martineau, Geliang Tang, Eric Dumazet,
	Jakub Kicinski, mptcp, netdev, linux-kernel
In-Reply-To: <3ad5bba8-18b9-48a6-94e0-99d958f23984@redhat.com>

On Mon, Jun 29, 2026 at 5:50 AM Paolo Abeni <pabeni@redhat.com> wrote:
> Isn't this fixed by commit 5e939544f9d2 ("mptcp: fix uninit-value in
> mptcp_established_options") ?

I did the reproduction ~10 days ago on linus's latest, so definitely
still reproducing.  I think 5e939544f9d2 was on the TX side and this
is about the RX option path, so they don't overlap on flows either.

Thanks,
Mike

^ permalink raw reply

* Re: [PATCH net 1/1] tcp: bound SYN-ACK timers to reqsk timeout range
From: Eric Dumazet @ 2026-06-29 11:00 UTC (permalink / raw)
  To: Ren Wei
  Cc: netdev, ncardwell, kuniyu, davem, pabeni, horms, chia-yu.chang,
	ij, bronzed_45_vested, yuuchihsu, idosch, yuantan098, yifanwucs,
	tomapufckgml, bird, roxy520tt
In-Reply-To: <02e24eb83639e9d7ecc623f000c60254bb5c40a5.1782643946.git.roxy520tt@gmail.com>

On Sun, Jun 28, 2026 at 4:43 AM Ren Wei <n05ec@lzu.edu.cn> wrote:
>
> From: Zhiling Zou <roxy520tt@gmail.com>
>

...

> -       max_retries = READ_ONCE(icsk->icsk_syn_retries) ? :
> -               READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_synack_retries) + 1;
> +       max_retries = READ_ONCE(icsk->icsk_syn_retries);
> +       if (!max_retries) {
> +               max_retries = READ_ONCE(net->ipv4.sysctl_tcp_synack_retries);
> +               max_retries++;
> +       }

Please do not change this part which looks good, let's avoid code churn.

^ permalink raw reply

* Re: [PATCH net-next v5 1/4] dpll: add DPLL_PIN_TYPE_INT_NCO pin type
From: Vadim Fedorenko @ 2026-06-29 10:53 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Ivan Vecera, Jakub Kicinski, Arkadiusz Kubalewski, netdev,
	Jiri Pirko, David S. Miller, Donald Hunter, Eric Dumazet,
	Michal Schmidt, Paolo Abeni, Pasi Vaananen, Petr Oros,
	Prathosh Satish, Simon Horman, linux-kernel, Grzegorz Nitka
In-Reply-To: <aiftnkuT9IP31qUm@FV6GYCPJ69>

On 09/06/2026 12:06, Jiri Pirko wrote:
> Mon, Jun 08, 2026 at 01:33:56PM +0200, vadim.fedorenko@linux.dev wrote:
>> On 04/06/2026 17:42, Ivan Vecera wrote:
>>> On 6/4/26 5:16 PM, Jakub Kicinski wrote:
>>>> On Thu, 4 Jun 2026 17:01:36 +0200 Ivan Vecera wrote:
>>>>>> Purely going on intuition here but feels like NCO should be a mode
>>>>>> (enum dpll_mode) rather than one of the input pins?
>>>>>>
>>>>>> More acks here would be great, Vadim, Arkadiusz, Grzegorz... ?
>>>>>
>>>>> I had a long discussion with Jiri about this and we agreed finally
>>>>> that dpll_mode represents a reference (input pin) selection strategy
>>>>> mode and not a DPLL device running mode.
>>>>
>>>> Long discussion? I see 2 emails ;) Let's hear from others.
>>>> (thanks for the link BTW, _if_ there's a v6 please put it in the cover
>>>> letter)
>>>
>>> I called him... he explained me 'why?' in detail.
>>> I also appreciate others' opinion.
>>
>> Well, NCO mode means manual operation of frequency tuning. Does it mean
>> that different tunings may be applied to different out pins of DPLL
>> device? My assumption is that it's not possible, and in this case NCO is
>> property/mode of DPLL device rather than single pin.
>>
>> @Jiri could you please share your detailed explanation on "why"?
> 
> Since the "why a pin and not a new dpll_mode?" question keeps coming up,
> let me try to describe why I believe that modelling NCO as an input pin
> (DPLL_PIN_TYPE_INT_NCO) is the right thing to do.
> 
> In the DPLL UAPI, 'mode' only describes the *input selection policy*:
> MANUAL means userspace picks which input the loop locks to, AUTOMATIC
> means the DPLL auto-selects the highest-priority input. I know there was
> some fuzz about this semantics in the early stages of upstreaming DPLL
> subsystem, but eventually this became very clear both in code and in
> kdoc:
> 
> <quote>
>   * enum dpll_mode - working modes a dpll can support, differentiates if and how
>   *   dpll selects one of its inputs to syntonize with it, valid values for
>   *   DPLL_A_MODE attribute
> </quote>
> 
> NCO *is not* a third selection policy - it is just another *source* the
> loop is disciplined from. Except the source is steered by the host
> (via the PHC .adjfine() path) instead of being an external reference.
> Think of it as a virtual pin of some sort.
> 
> The object we already use for "a source the DPLL can lock to" is a pin,
> so an internal NCO belongs right next to DPLL_PIN_TYPE_INT_OSCILLATOR,
> which is already existing example of a similar virtual pin.
> 
> By having NCO as an input pin we reuse the existing model instead of
> inventing a parallel one. "Run as NCO" becomes "connect the NCO input"
> using the same connect/disconnect, pin state and pin-dump infrastructure
> as any other input. No new control surface, and it stays orthogonal to
> mode: we don't have to define what AUTOMATIC+NCO or pin priorities
> mean, and we don't grow enum dpll_mode and the supported-modes
> bitmask that every mode-aware consumer would then have to relearn.
> 
> For the pin info uAPI exposure we reuse the attributes pins already have
> - the output frequency offset from nominal is reported via the pin's
> fractional-frequency-offset / -ppt. A new device mode would need
> brand new device-level attributes for the same information.
> 
> Having said that, I think it's a perfect fit. The only "real" pull
> towards a new mode is that vendor datasheets call this NCO/DCO a "mode".
> But that is HW register terminology and we learned many times in
> the past that may be more or less misleading/incorrect wrt the uAPI.
> 
> Therefore my strong preference is DPLL_PIN_TYPE_INT_NCO, no new mode.
> Honestly, I don't really understand why it would make even little sense
> to have this as new mode. Perhaps I'm missing something, if you can
> describe it, that would be awesome.

Ok, I see your point. Even though the pin UAPI fits the model, I still
have some concerns:

1. I cannot really imagine AUTOMATIC mode selecting NCO pin by priority
in case other pins are gone somehow. It doesn't make sense without
steering SW running on the host. And the other way around - switching to
a higher priority pin while SW keeps "steering" the DPLL. But it looks like
we have discussed it in the other thread. Adding DPLL mode restrictions
based on pin selection/connection breaks the model, I think...

2. SW steering cannot be pure SW. Every disciplining algorithm relies on
measurements, the product of phase comparators. That technically means
the device has to have other inputs configured as monitors, which can be
configured in AUTO mode with priorities. How will we model it then?

Thanks,
Vadim


^ permalink raw reply

* Re: [PATCH net] net: clear transport header during tunnel decapsulation
From: Paolo Abeni @ 2026-06-29 10:53 UTC (permalink / raw)
  To: Eric Dumazet, David S . Miller, Jakub Kicinski
  Cc: Simon Horman, Ido Schimmel, David Ahern, netdev, eric.dumazet,
	syzbot+d5d0d598a4cfdfafdc3b
In-Reply-To: <20260624073209.3703492-1-edumazet@google.com>

On 6/24/26 9:32 AM, Eric Dumazet wrote:
> diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
> index d3c677e9bff2080e4760347a3d873da4e83ac3ca..59192f58da2e3aae19d00505cc3bb04b083b77c5 100644
> --- a/net/ipv4/ip_tunnel_core.c
> +++ b/net/ipv4/ip_tunnel_core.c
> @@ -134,6 +134,7 @@ int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len,
>  	__vlan_hwaccel_clear_tag(skb);
>  	skb_set_queue_mapping(skb, 0);
>  	skb_scrub_packet(skb, xnet);
> +	skb_unset_transport_header(skb);


In geneve_udp_encap_recv() the above is called a few lines before:

	geneveh = geneve_hdr(skb);

which in turn accesses indirectly the transport header via udp_hdr().

Also AFAICS even vxlan uses __iptunnel_pull_header() in the receive
path, possibly no additional unset needed in such driver.

Side note: it would be helpful if the syzbot CI reports could include
the fully decoded stack trace.

/P


^ permalink raw reply

* Re: [PATCH net 1/1] tcp: Require init_net CAP_NET_ADMIN for tcp_child_ehash_entries
From: tt roxy @ 2026-06-29 10:51 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Ren Wei, netdev, davem, pabeni, horms, chia-yu.chang, ij, idosch,
	fmancera, bronzed_45_vested, yuuchihsu, kuniyu, yuantan098,
	yifanwucs, tomapufckgml, bird
In-Reply-To: <CANn89i+sq29-PbxDrCdUJk405Bs0759wXLBNhpGqE7TNTEPraQ@mail.gmail.com>

On Mon, Jun 29, 2026 at 6:41 PM Eric Dumazet <edumazet@google.com> wrote:
>
> On Sun, Jun 28, 2026 at 4:38 AM Ren Wei <n05ec@lzu.edu.cn> wrote:
> >
> > From: Zhiling Zou <roxy520tt@gmail.com>
> >
> > tcp_child_ehash_entries controls the size of the private TCP established
> > hash table allocated for subsequently created child network namespaces.
> > The value is consumed during child netns creation by tcp_set_hashinfo()
> > and passed to inet_pernet_hashinfo_alloc(), which can allocate a large
> > per-netns ehash.
> >
> > The sysctl is writable in each network namespace, and net sysctl
> > permissions allow a task with CAP_NET_ADMIN in the namespace's owning
> > user namespace to write it.  An unprivileged user can therefore create a
> > user and network namespace, set tcp_child_ehash_entries to its maximum
> > value, and repeatedly create nested network namespaces to force large
> > kernel allocations and exhaust host memory.
> >
> > Require CAP_NET_ADMIN in the initial user namespace before accepting
> > writes to tcp_child_ehash_entries.  This keeps the tuning knob available
> > to the host administrator while preventing unprivileged user namespaces
> > from using it to drive host-wide memory consumption.
>
> I do not think this patch is desirable.
> It breaks nested container use cases.
> A container runtime running inside a container (with namespace-local
> CAP_NET_ADMIN but not global)
> would no longer be able to tune tcp_child_ehash_entries for its own
> nested child namespaces.
>
> inet_pernet_hashinfo_alloc() uses GFP_KERNEL_ACCOUNT for a reason, I
> suggest you start using memcg :)
>
> Keep in mind the sysctl could be set (by root) in init_net for some
> reason, so only memcg will protect against OOM.

Thanks for the review.

Agreed, restricting writes to init_user_ns CAP_NET_ADMIN is too broad and
would break nested container use cases. We will drop this approach.

We will re-check the memcg accounting/fallback behavior for the per-netns
ehash allocation and prepare a different fix if there is still a path to
host-wide OOM outside the intended memcg limits.

^ permalink raw reply

* Re: [PATCH net 1/3] net: Extend bpf_net_context lifetime to cover qdisc enqueue
From: Jamal Hadi Salim @ 2026-06-29 10:47 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: netdev, bpf, davem, edumazet, kuba, pabeni, horms, toke, jiri,
	clrkwllms, rostedt, kuniyu, sdf.kernel, skhawaja, liuhangbin,
	krikku, mkarsten, victor, ast, hawk, john.fastabend, daniel,
	Sashiko
In-Reply-To: <20260629102917.Ag2Vd7LR@linutronix.de>

 -

On Mon, Jun 29, 2026 at 6:29 AM Sebastian Andrzej Siewior
<bigeasy@linutronix.de> wrote:
>
> On 2026-06-26 12:51:54 [-0400], Jamal Hadi Salim wrote:
> > The bpf_net_context used by sch_handle_egress() is stack-allocated and torn
> > down when that function returns. By the time tcf_qevent_handle() runs
> > current->bpf_net_context is NULL.
> >
> > When a filter attached to a qevent block (e.g. RED's early_drop or mark
> > qevents, which always use shared blocks) returns TC_ACT_REDIRECT,
> > tcf_qevent_handle() calls skb_do_redirect(), which in turn calls bpf helper
> > bpf_net_ctx_get_ri(). That helper unconditionally dereferences
> > current->bpf_net_context resulting in a NULL pointer dereference.
> >
> > Note: The same holds for actions that invoke BPF redirect helpers
> > (e.g. act_bpf running a program that calls bpf_redirect()) during qevent
> > classification itself. And as a matter of fact the same assumption is
> > made in the code outside of tc.
> >
> > Fix:
> > Move the bpf_net_context lifecycle out of sch_handle_egress() into
> > __dev_queue_xmit(), so that it spans both the egress TC fast path and the
> > qdisc enqueue. The setup is placed outside the egress_needed_key static
> > branch because qevents are independent of clsact/NF egress hooks and
> > that key may stay disabled when only a qevent-bearing qdisc is
> > configured. Unfortunately this adds a small unconditional penalty to the
> > code path _per packet_ only guarded by CONFIG_NET_XGRESS (two writes and
> > one read for bpf_net_ctx_set, plus one write for bpf_net_ctx_clear).
>
> I fail to understand this but you and sashiko have an understanding...
> If there is TC_ACT_REDIRECT returned by tc_run(), then the skb is NULL
> and as such upon return from sch_handle_egress() the control flow goes
> to the out label.
> As a fix you move the bpf_net_ctx assignment to before CONFIG_NET_EGRESS
> and clear it on exit. What do I miss here?
>

There are 2 separate filters.
IIUC, you are thinking of the first one which is the clsact egress
classifier (which runs in sch_handle_egress())  - its redirect would
indeed return NULL and skip qdisc enqueue.
The second one is the qevent redirect which happens in
tcf_qevent_handle() during qdisc enqueue (block 10 in the reproducer).


> > This keeps all bpf_net_context management in net/core/dev.c i.e the
> > existing boundary between tc core and BPF without requiring any net/sched/
> > code to know about BPF plumbing.
> >
> > Reproducer (see the accompanying tdc test):
> >
> >   tc qdisc add dev eth0 root handle 1: red limit 1MB min 10KB max 20KB \
> >       avpkt 1000 burst 100 qevent early_drop block 10
> >   tc qdisc add dev eth0 clsact
> >   tc filter add block 10 pref 1 bpf obj redirect.o
>
> stupid question: how do I get this redirect.o? Just a simple thing to
> reproduce this…
>

It's just pseudo code for a bpf prog that redirects (so you can probably
create one as a few-line bpf prog).
Take a look at patch 3 which uses a prebuilt action-ebpf binary with
the action-redirect section (added by patch 3 to action.c).
If it's still not clear, I can craft one and send it to you.

cheers,
jamal

> >   tc filter add dev eth0 egress protocol ip prio 1 matchall \
> >       action gact pass
> >
> >   traffic through eth0 triggers red_enqueue() -> tcf_qevent_handle() and,
> >   on a redirect verdict, a NULL deref in skb_do_redirect().
>
> Sebastian

^ permalink raw reply

* Re: [PATCH net 1/1] tcp: Require init_net CAP_NET_ADMIN for tcp_child_ehash_entries
From: Eric Dumazet @ 2026-06-29 10:41 UTC (permalink / raw)
  To: Ren Wei
  Cc: netdev, davem, pabeni, horms, chia-yu.chang, ij, idosch, fmancera,
	bronzed_45_vested, yuuchihsu, kuniyu, yuantan098, yifanwucs,
	tomapufckgml, bird, roxy520tt
In-Reply-To: <012fba43272abc560acfc0fa37ae22182a60b457.1782641525.git.roxy520tt@gmail.com>

On Sun, Jun 28, 2026 at 4:38 AM Ren Wei <n05ec@lzu.edu.cn> wrote:
>
> From: Zhiling Zou <roxy520tt@gmail.com>
>
> tcp_child_ehash_entries controls the size of the private TCP established
> hash table allocated for subsequently created child network namespaces.
> The value is consumed during child netns creation by tcp_set_hashinfo()
> and passed to inet_pernet_hashinfo_alloc(), which can allocate a large
> per-netns ehash.
>
> The sysctl is writable in each network namespace, and net sysctl
> permissions allow a task with CAP_NET_ADMIN in the namespace's owning
> user namespace to write it.  An unprivileged user can therefore create a
> user and network namespace, set tcp_child_ehash_entries to its maximum
> value, and repeatedly create nested network namespaces to force large
> kernel allocations and exhaust host memory.
>
> Require CAP_NET_ADMIN in the initial user namespace before accepting
> writes to tcp_child_ehash_entries.  This keeps the tuning knob available
> to the host administrator while preventing unprivileged user namespaces
> from using it to drive host-wide memory consumption.

I do not think this patch is desirable.
It breaks nested container use cases.
A container runtime running inside a container (with namespace-local
CAP_NET_ADMIN but not global)
would no longer be able to tune tcp_child_ehash_entries for its own
nested child namespaces.

inet_pernet_hashinfo_alloc() uses GFP_KERNEL_ACCOUNT for a reason, I
suggest you start using memcg :)

Keep in mind the sysctl could be set (by root) in init_net for some
reason, so only memcg will protect against OOM.

^ permalink raw reply

* Re: [PATCH bpf-next v10 1/5] bpf: add bpf_icmp_send kfunc
From: Mahe Tardy @ 2026-06-29 10:35 UTC (permalink / raw)
  To: Stanislav Fomichev
  Cc: bpf, andrii, ast, daniel, john.fastabend, jordan, martin.lau,
	yonghong.song, emil, netdev, edumazet, kuba, pabeni, davem, horms
In-Reply-To: <aj6kdnfAB0LJKDcR@devvm7509.cco0.facebook.com>

On Fri, Jun 26, 2026 at 09:18:39AM -0700, Stanislav Fomichev wrote:
> On 06/25, Mahe Tardy wrote:
> > On Thu, Jun 25, 2026 at 09:24:59AM -0700, Stanislav Fomichev wrote:
> > > On 06/25, Mahe Tardy wrote:
> > 
> > [...]
> > 
> > > > +__bpf_kfunc int bpf_icmp_send(struct __sk_buff *skb_ctx, int type, int code)
> > > > +{
> > > > +	struct sk_buff *skb = (struct sk_buff *)skb_ctx;
> > > > +	struct sk_buff *nskb;
> > > > +	struct sock *sk;
> > > > +
> > > > +	sk = skb_to_full_sk(skb);
> > > > +	if (sk && sk->sk_kern_sock &&
> > > > +	    (sk->sk_protocol == IPPROTO_ICMP || sk->sk_protocol == IPPROTO_ICMPV6))
> > > > +		return -EBUSY;
> > > > +
> > > > +	switch (skb->protocol) {
> > > > +#if IS_ENABLED(CONFIG_INET)
> > > > +	case htons(ETH_P_IP): {
> > > > +		if (type != ICMP_DEST_UNREACH)
> > > > +			return -EOPNOTSUPP;
> > > > +		if (code < 0 || code > NR_ICMP_UNREACH ||
> > > > +		    code == ICMP_FRAG_NEEDED) /* needs a valid next-hop MTU */
> > > > +			return -EINVAL;
> > > > +
> > > > +		/* icmp_send expects skb_dst to be a real rtable. */
> > > > +		if (!skb_valid_dst(skb))
> > > > +			return -ENETUNREACH;
> > > > +
> > > > +		nskb = skb_clone(skb, GFP_ATOMIC);
> > > > +		if (!nskb)
> > > > +			return -ENOMEM;
> > > > +
> > > > +		memset(IPCB(nskb), 0, sizeof(*IPCB(nskb)));
> > > > +		icmp_send(nskb, type, code, 0);
> > > > +		consume_skb(nskb);
> > > > +		break;
> > > > +	}
> > > > +#endif
> > > > +#if IS_ENABLED(CONFIG_IPV6)
> > > > +	case htons(ETH_P_IPV6):
> > > > +		if (type != ICMPV6_DEST_UNREACH)
> > > > +			return -EOPNOTSUPP;
> > > > +		if (code < 0 || code > ICMPV6_REJECT_ROUTE)
> > > > +			return -EINVAL;
> > > 
> > > [..]
> > > 
> > > > +		/* icmpv6_send may treat skb_dst as rt6_info. */
> > > > +		if (skb_metadata_dst(skb))
> > > > +			return -ENETUNREACH;
> > > 
> > > A bit confused about this. Which part of icmpv6_send treats skb_dst as rt6_info?
> > > (I see the original sashiko report about dst, but icmp6 seems to be not
> > > requiring it)
> > 
> > Yeah I was also a bit confused because this came out of nowhere as soon
> > as I put the skb_valid_dst only on the IPv4 path (for different
> > reasons), but there is actually a potential trace in which we have type
> > confusion indeed:
> > 
> > - icmp6_send() checks scoped source addresses and calls icmp6_iif() at net/ipv6/icmp.c:702
> > - icmp6_iif() calls icmp6_dev() at net/ipv6/icmp.c:441
> > - icmp6_dev() does skb_rt6_info(skb) for loopback/L3 master devices at net/ipv6/icmp.c:428
> > - skb_rt6_info() casts any non-NULL dst to struct rt6_info at include/net/ip6_route.h:233
> > - rt6->rt6i_idev is then dereferenced at net/ipv6/icmp.c:434
> > 
> > When checking with pahole, we can find this on my local kernel:
> > 
> > struct rt6_info {
> > 	struct dst_entry           dst;                  /*     0   136 */
> > 	/* --- cacheline 2 boundary (128 bytes) was 8 bytes ago --- */
> > 	struct fib6_info *         from;                 /*   136     8 */
> > 	int                        sernum;               /*   144     4 */
> > 	struct rt6key              rt6i_dst;             /*   148    20 */
> > 	struct rt6key              rt6i_src;             /*   168    20 */
> > 	struct in6_addr            rt6i_gateway;         /*   188    16 */
> > 
> > 	/* XXX 4 bytes hole, try to pack */
> > 
> > 	/* --- cacheline 3 boundary (192 bytes) was 16 bytes ago --- */
> > 	struct inet6_dev *         rt6i_idev;            /*   208     8 */  <--- we dereference this
> > 	u32                        rt6i_flags;           /*   216     4 */
> > 	short unsigned int         rt6i_nfheader_len;    /*   220     2 */
> > 
> > 	/* size: 224, cachelines: 4, members: 9 */
> > 	/* sum members: 218, holes: 1, sum holes: 4 */
> > 	/* padding: 2 */
> > 	/* last cacheline: 32 bytes */
> > };
> > 
> > And the metadata_dst would look like this:
> > 
> > struct metadata_dst {
> > 	struct dst_entry           dst;                  /*     0   136 */
> > 	/* --- cacheline 2 boundary (128 bytes) was 8 bytes ago --- */
> > 	enum metadata_type         type;                 /*   136     4 */
> > 
> > 	/* XXX 4 bytes hole, try to pack */
> > 
> > 	union {
> > 		struct ip_tunnel_info tun_info;          /*   144    96 */
> > 		struct hw_port_info port_info;           /*   144    16 */
> > 		struct macsec_info macsec_info;          /*   144     8 */
> > 		struct xfrm_md_info xfrm_info;           /*   144    16 */
> > 	} u;                                             /*   144    96 */  <--- we land on this union
> > 
> > 	/* size: 240, cachelines: 4, members: 3 */
> > 	/* sum members: 236, holes: 1, sum holes: 4 */
> > 	/* last cacheline: 48 bytes */
> > };
> > 
> > Let's say it's a struct ip_tunnel_info:
> > 
> > struct ip_tunnel_info {
> > 	struct ip_tunnel_key       key;                  /*     0    64 */
> > 
> > 	/* XXX last struct has 7 bytes of padding */
> > 
> > 	/* --- cacheline 1 boundary (64 bytes) --- */
> > 	struct ip_tunnel_encap     encap;                /*    64     8 */  <--- 144 + 64 = 208 we land here
> > 	struct dst_cache           dst_cache;            /*    72    16 */
> > 	u8                         options_len;          /*    88     1 */
> > 	u8                         mode;                 /*    89     1 */
> > 
> > 	/* size: 96, cachelines: 2, members: 5 */
> > 	/* padding: 6 */
> > 	/* paddings: 1, sum paddings: 7 */
> > 	/* last cacheline: 32 bytes */
> > };
> > 
> > So I imagine this is fairly tricky to trigger but still a case of type
> > confusion. I have actually no idea how likely this can happen from my
> > call but the trace makes sense at least.
> 
> That logic seems to exist for the icmp6_send to find the input device
> (since the expected use-case for calling icmp6_send is to the incoming
> skb). And since you're mainly doing egress, I don't think this path will
> ever trigger (iow the check is not needed)?
> 
> Maybe you can add cgroup_ingress test case? Looks like this rt6_info
> path might trigger for ipv6 lo? I don't see any ingress test in your
> series, so might be good to have one regardless?

The initial reason I added only egress is because the use case of this
makes more sense if that's your local kernel giving you feedback about a
connection you are trying to establish, as a process, but is prevented.

But indeed, I could extend the test to ingress as well; ideally I'd just
like to get an ack from the networking maintainers, since this is already
v10 of this, before making further changes.


^ permalink raw reply

* RE: [PATCH v3] xsk: fix memory corruptions in net/core/xdp.c
From: Fijalkowski, Maciej @ 2026-06-29 10:34 UTC (permalink / raw)
  To: Clement Lecigne, Lobakin, Aleksander, edumazet@google.com,
	netdev@vger.kernel.org
  Cc: bpf@vger.kernel.org, linux-kernel@vger.kernel.org,
	kuba@kernel.org, sdf@fomichev.me, horms@kernel.org,
	john.fastabend@gmail.com, ast@kernel.org, daniel@iogearbox.net
In-Reply-To: <20260629072300.1664622-1-clecigne@google.com>

> 
> From: Clément Lecigne <clecigne@google.com>
> 
> Commit 560d958c6c68 ("xsk: add generic XSk &xdp_buff -> skb conversion")
> introduced a vulnerability in the handling of XDP_PASS for AF_XDP zero-copy
> frames.
> 
> Note: Currently, this specific AF_XDP zero-copy conversion path is only
> reachable from the drivers/net/ethernet/intel/ice and
> drivers/net/ethernet/intel/idpf drivers.
> 
> When building an skb, xdp_build_skb_from_zc() uses the chunk size
> (xdp->frame_sz) for the allocation. However, napi_build_skb() automatically
> reserves space at the end of the allocation for the skb_shared_info
> structure.
> 
> Most high performance UMEM applications use 4K chunks, where the
> corruption cannot happen. However, if the UMEM is configured with 2KB
> chunks (a very common configuration to maximize packet density in memory),
> a standard 1500 MTU packet will trigger the corruption because the required
> space exceeds the 2048 byte chunk size:
> 
> Headroom (256) + Packet (1514) + skb_shared_info (320) = 2090 bytes
> 
> Because 2090 bytes > 2048 bytes and __skb_put() does not perform bounds
> checking, the memcpy() writes past the available linear data area and
> corrupts the skb_shared_info structure. This can lead to arbitrary code
> execution if pointers like destructor_arg are overwritten.
> 
> Additionally, in xdp_copy_frags_from_zc(), the allocation size is set
> strictly to the fragment size (len), but the subsequent memcpy() uses
> LARGEST_ALIGN(len). This mismatch results in an out-of-bounds write of
> up to 7 bytes, which triggers KASAN warnings and is unsafe despite typical
> page pool allocator padding.
> 
> Fix the skb allocation in xdp_build_skb_from_zc() by dynamically
> calculating the exact truesize required using SKB_HEAD_ALIGN() to
> properly account for the headroom, the LARGEST_ALIGN(len), and the
> skb_shared_info overhead.
> 
> Fix the out-of-bounds write in xdp_copy_frags_from_zc() by rounding up
> the allocation request using LARGEST_ALIGN(len) to match the copy
> operation.
> 
> Fixes: 560d958c6c68 ("xsk: add generic XSk &xdp_buff -> skb conversion")
> CC: Alexander Lobakin <aleksander.lobakin@intel.com>
> CC: Eric Dumazet <edumazet@google.com>
> Signed-off-by: Clément Lecigne <clecigne@google.com>

Hi Clement,

Do you have a reproducer for mentioned issue or is it only a fix from
theoretical POV?

To be clear, we were addressing headroom issues in this series:
https://lore.kernel.org/bpf/20260402154958.562179-1-maciej.fijalkowski@intel.com/

so I wanted to ask if you are able to have this malformed setup for
2k chunk size. That series should not allow for that.

I think this is the second time someone is trying to fix this area of code,
so it is not a nack or something, let us fix this, but I wanted to have
us on the same page.

Thanks,
Maciej

> ---
> Changes since v2:
>  - Used LARGEST_ALIGN to calculate the len to account for the aligned
> memcpy.
>  - Fixed the commit message to include the idpf driver.
> 
> Changes since v1:
>  - Used SKB_HEAD_ALIGN to properly calculate the required allocation size
>    including the skb_shared_info overhead.
>  - Re-ordered variable declarations.
> 
> ---
> diff --git a/net/core/xdp.c b/net/core/xdp.c
> index 9890a30584ba..7e39f17ad407 100644
> --- a/net/core/xdp.c
> +++ b/net/core/xdp.c
> @@ -698,8 +698,8 @@ static noinline bool xdp_copy_frags_from_zc(struct
> sk_buff *skb,
> 
>  	for (u32 i = 0; i < nr_frags; i++) {
>  		const skb_frag_t *frag = &xinfo->frags[i];
> -		u32 len = skb_frag_size(frag);
> -		u32 offset, truesize = len;
> +		u32 offset, len = skb_frag_size(frag);
> +		u32 truesize = LARGEST_ALIGN(len);
>  		struct page *page;
> 
>  		page = page_pool_dev_alloc(pp, &offset, &truesize);
> @@ -738,9 +738,10 @@ static noinline bool xdp_copy_frags_from_zc(struct
> sk_buff *skb,
>   */
>  struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp)
>  {
> +	u32 headroom = xdp->data_meta - xdp->data_hard_start;
>  	const struct xdp_rxq_info *rxq = xdp->rxq;
> -	u32 len = xdp->data_end - xdp->data_meta;
> -	u32 truesize = xdp->frame_sz;
> +	u32 len = LARGEST_ALIGN(xdp->data_end - xdp->data_meta);
> +	u32 truesize = SKB_HEAD_ALIGN(headroom + len);
>  	struct sk_buff *skb = NULL;
>  	struct page_pool *pp;
>  	int metalen;
> @@ -762,7 +763,7 @@ struct sk_buff *xdp_build_skb_from_zc(struct
> xdp_buff *xdp)
>  	}
> 
>  	skb_mark_for_recycle(skb);
> -	skb_reserve(skb, xdp->data_meta - xdp->data_hard_start);
> +	skb_reserve(skb, headroom);
> 
>  	memcpy(__skb_put(skb, len), xdp->data_meta,
> LARGEST_ALIGN(len));
> 


^ permalink raw reply

* Re: [PATCH net 1/3] net: Extend bpf_net_context lifetime to cover qdisc enqueue
From: Sebastian Andrzej Siewior @ 2026-06-29 10:29 UTC (permalink / raw)
  To: Jamal Hadi Salim
  Cc: netdev, bpf, davem, edumazet, kuba, pabeni, horms, toke, jiri,
	clrkwllms, rostedt, kuniyu, sdf.kernel, skhawaja, liuhangbin,
	krikku, mkarsten, victor, ast, hawk, john.fastabend, daniel,
	Sashiko
In-Reply-To: <20260626165156.169012-2-jhs@mojatatu.com>

On 2026-06-26 12:51:54 [-0400], Jamal Hadi Salim wrote:
> The bpf_net_context used by sch_handle_egress() is stack-allocated and torn
> down when that function returns. By the time tcf_qevent_handle() runs
> current->bpf_net_context is NULL.
> 
> When a filter attached to a qevent block (e.g. RED's early_drop or mark
> qevents, which always use shared blocks) returns TC_ACT_REDIRECT,
> tcf_qevent_handle() calls skb_do_redirect(), which in turn calls bpf helper
> bpf_net_ctx_get_ri(). That helper unconditionally dereferences
> current->bpf_net_context resulting in a NULL pointer dereference.
> 
> Note: The same holds for actions that invoke BPF redirect helpers
> (e.g. act_bpf running a program that calls bpf_redirect()) during qevent
> classification itself. And as a matter of fact the same assumption is
> made in the code outside of tc.
> 
> Fix:
> Move the bpf_net_context lifecycle out of sch_handle_egress() into
> __dev_queue_xmit(), so that it spans both the egress TC fast path and the
> qdisc enqueue. The setup is placed outside the egress_needed_key static
> branch because qevents are independent of clsact/NF egress hooks and
> that key may stay disabled when only a qevent-bearing qdisc is
> configured. Unfortunately this adds a small unconditional penalty to the
> code path _per packet_ only guarded by CONFIG_NET_XGRESS (two writes and
> one read for bpf_net_ctx_set, plus one write for bpf_net_ctx_clear).

I fail to understand this but you and sashiko have an understanding...
If there is TC_ACT_REDIRECT returned by tc_run(), then the skb is NULL
and as such upon return from sch_handle_egress() the control flow goes
to the out label.
As a fix you move the bpf_net_ctx assignment to before CONFIG_NET_EGRESS
and clear it on exit. What do I miss here?

> This keeps all bpf_net_context management in net/core/dev.c i.e the
> existing boundary between tc core and BPF without requiring any net/sched/
> code to know about BPF plumbing.
> 
> Reproducer (see the accompanying tdc test):
> 
>   tc qdisc add dev eth0 root handle 1: red limit 1MB min 10KB max 20KB \
>       avpkt 1000 burst 100 qevent early_drop block 10
>   tc qdisc add dev eth0 clsact
>   tc filter add block 10 pref 1 bpf obj redirect.o

stupid question: how do I get this redirect.o? Just a simple thing to
reproduce this…

>   tc filter add dev eth0 egress protocol ip prio 1 matchall \
>       action gact pass
> 
>   traffic through eth0 triggers red_enqueue() -> tcf_qevent_handle() and,
>   on a redirect verdict, a NULL deref in skb_do_redirect().

Sebastian

^ permalink raw reply

* [PATCH net 1/3 v2] net: Extend bpf_net_context lifetime to cover qdisc enqueue
From: Jamal Hadi Salim @ 2026-06-29 10:21 UTC (permalink / raw)
  To: netdev
  Cc: jiri, davem, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, toke, Steven Rostedt, Petr Machata,
	Alexei Starovoitov, Daniel Borkmann, John Fastabend,
	Jesper Dangaard Brouer, linux-rt-devel, bpf, security, stable,
	Jamal Hadi Salim, Victor Nogueira
In-Reply-To: <20260629102157.737306-1-jhs@mojatatu.com>

The bpf_net_context used by sch_handle_egress() is stack-allocated and torn
down when that function returns. By the time tcf_qevent_handle() runs
current->bpf_net_context is NULL.

When a filter attached to a qevent block (e.g. RED's early_drop or mark
qevents, which always use shared blocks) returns TC_ACT_REDIRECT,
tcf_qevent_handle() calls skb_do_redirect(), which in turn calls bpf helper
bpf_net_ctx_get_ri().  That helper unconditionally dereferences
current->bpf_net_context resulting in a NULL pointer dereference.

Note: The same holds for actions that invoke BPF redirect helpers
(e.g. act_bpf running a program that calls bpf_redirect()) during qevent
classification itself.

Fix:
Move the bpf_net_context lifecycle out of sch_handle_egress() into
__dev_queue_xmit(), so that it spans both the egress TC fast path and the
qdisc enqueue.
Note: The call is placed outside the egress_needed_key static branch
to cover the case where clsact static key is disabled. Unfortunately this
adds a small unconditional penalty to the code path _per packet_ only
guarded by CONFIG_NET_XGRESS (two writes and one read).

As pointed out by sashiko [1]:
The same context must also be set up in net_tx_action()'s qdisc drain
path, since qdisc_run() -> netem_dequeue() -> qdisc_enqueue(RED child)
can trigger qevent classification asynchronously from softirq context.

This keeps all bpf_net_context management in net/core/dev.c, i.e. the
existing boundary between tc core and BPF, without requiring any net/sched/
code to know about BPF plumbing.

Reproducer:

  tc qdisc add dev eth0 root handle 1: red limit 1MB min 10KB max 20KB \
      avpkt 1000 burst 100 qevent early_drop block 10
  tc filter add block 10 pref 1 bpf obj redirect.o

  traffic through eth0 triggers red_enqueue() -> tcf_qevent_handle() and,
  on a redirect verdict, a NULL deref in skb_do_redirect().

Fixes: 3625750f05ec ("net: sched: Introduce helpers for qevent blocks")
Tested-by: Victor Nogueira <victor@mojatatu.com>
Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
---
 net/core/dev.c | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 4b3d5cfdf6e0..b95a8b153c76 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4527,14 +4527,11 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 {
 	struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
 	enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS;
-	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
 	int sch_ret;
 
 	if (!entry)
 		return skb;
 
-	bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
-
 	/* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was
 	 * already set by the caller.
 	 */
@@ -4550,12 +4547,10 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 		/* No need to push/pop skb's mac_header here on egress! */
 		skb_do_redirect(skb);
 		*ret = NET_XMIT_SUCCESS;
-		bpf_net_ctx_clear(bpf_net_ctx);
 		return NULL;
 	case TC_ACT_SHOT:
 		kfree_skb_reason(skb, drop_reason);
 		*ret = NET_XMIT_DROP;
-		bpf_net_ctx_clear(bpf_net_ctx);
 		return NULL;
 	/* used by tc_run */
 	case TC_ACT_STOLEN:
@@ -4565,10 +4560,8 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 		fallthrough;
 	case TC_ACT_CONSUMED:
 		*ret = NET_XMIT_SUCCESS;
-		bpf_net_ctx_clear(bpf_net_ctx);
 		return NULL;
 	}
-	bpf_net_ctx_clear(bpf_net_ctx);
 
 	return skb;
 }
@@ -4767,6 +4760,9 @@ struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
  */
 int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
 {
+#ifdef CONFIG_NET_XGRESS
+	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx = NULL;
+#endif
 	struct net_device *dev = skb->dev;
 	struct netdev_queue *txq = NULL;
 	enum skb_drop_reason reason;
@@ -4795,6 +4791,9 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
 	skb_update_prio(skb);
 
 	tcx_set_ingress(skb, false);
+#ifdef CONFIG_NET_XGRESS
+	bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+#endif
 #ifdef CONFIG_NET_EGRESS
 	if (static_branch_unlikely(&egress_needed_key)) {
 		if (nf_hook_egress_active()) {
@@ -4898,12 +4897,18 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
 
 	reason = SKB_DROP_REASON_RECURSION_LIMIT;
 drop:
+#ifdef CONFIG_NET_XGRESS
+	bpf_net_ctx_clear(bpf_net_ctx);
+#endif
 	rcu_read_unlock_bh();
 
 	dev_core_stats_tx_dropped_inc(dev);
 	kfree_skb_list_reason(skb, reason);
 	return rc;
 out:
+#ifdef CONFIG_NET_XGRESS
+	bpf_net_ctx_clear(bpf_net_ctx);
+#endif
 	rcu_read_unlock_bh();
 	return rc;
 }
@@ -5815,6 +5820,9 @@ static __latent_entropy void net_tx_action(void)
 
 	if (sd->output_queue) {
 		struct Qdisc *head;
+#ifdef CONFIG_NET_XGRESS
+		struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
+#endif
 
 		local_irq_disable();
 		head = sd->output_queue;
@@ -5824,6 +5832,10 @@ static __latent_entropy void net_tx_action(void)
 
 		rcu_read_lock();
 
+#ifdef CONFIG_NET_XGRESS
+		bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+#endif
+
 		while (head) {
 			spinlock_t *root_lock = NULL;
 			struct sk_buff *to_free;
@@ -5860,6 +5872,10 @@ static __latent_entropy void net_tx_action(void)
 			tcf_kfree_skb_list(to_free, q, NULL, qdisc_dev(q));
 		}
 
+#ifdef CONFIG_NET_XGRESS
+		bpf_net_ctx_clear(bpf_net_ctx);
+#endif
+
 		rcu_read_unlock();
 	}
 
-- 
2.54.0


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox