Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH net-next v2 4/4] selftests: net: getsockopt_iter: add raw ICMP_FILTER coverage
From: Stanislav Fomichev @ 2026-06-30 18:20 UTC (permalink / raw)
  To: Breno Leitao
  Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, Willem de Bruijn, Shuah Khan, netdev, linux-kernel,
	linux-kselftest, kernel-team
In-Reply-To: <20260630-getsockopt_phase2-v2-4-193335f3d4d1@debian.org>

On 06/30, Breno Leitao wrote:
> Exercise the raw getsockopt path now backed by sockopt_t. ICMP_FILTER
> returns a fixed-size struct and, unlike the int/u64 options already
> covered, clamps the length down to the user buffer on a short read
> instead of failing, so check that semantic explicitly along with the
> exact and oversized cases, the -EOPNOTSUPP path on a non-ICMP raw
> socket, and an unknown optname.
> 
> Signed-off-by: Breno Leitao <leitao@debian.org>

Acked-by: Stanislav Fomichev <sdf@fomichev.me>

^ permalink raw reply

* [PATCH net-next 0/6] net: hold instance lock around NETDEV_DOWN and NETDEV_GOING_DOWN
From: Stanislav Fomichev @ 2026-06-30 18:21 UTC (permalink / raw)
  To: netdev; +Cc: davem, edumazet, kuba, pabeni

NETDEV_UP and NETDEV_REGISTER already run under the per-device
instance lock. The teardown side does not. Make it symmetric so
ops-locked drivers can rely on the lock being held in both
directions.

Stanislav Fomichev (6):
  net: hold instance lock around NETDEV_DOWN/GOING_DOWN
  net: dsa: hold instance lock on close-on-shutdown paths
  net: mtk_eth_soc: hold instance lock around DMA-device-swap close
  net: rtnetlink: take instance lock inside rtnl_configure_link
  net: require instance lock for NETDEV_DOWN/GOING_DOWN notifiers
  net: document NETDEV_UNREGISTER unlocked rationale

 Documentation/networking/netdevices.rst     | 10 ++++++++++
 drivers/net/ethernet/mediatek/mtk_eth_soc.c |  5 +++++
 net/core/dev.c                              |  5 +++++
 net/core/lock_debug.c                       |  4 ++--
 net/core/rtnetlink.c                        | 17 ++++++++++-------
 net/dsa/dsa.c                               | 20 +++++++++++++++++---
 net/dsa/user.c                              | 19 +++++++++++++++++--
 7 files changed, 66 insertions(+), 14 deletions(-)

-- 
2.53.0-Meta


^ permalink raw reply

* [PATCH net-next 1/6] net: hold instance lock around NETDEV_DOWN/GOING_DOWN
From: Stanislav Fomichev @ 2026-06-30 18:21 UTC (permalink / raw)
  To: netdev; +Cc: davem, edumazet, kuba, pabeni
In-Reply-To: <20260630182129.1601784-1-sdf@fomichev.me>

Mirror what call_netdevice_register_net_notifiers does but for the
teardown. Cover only DOWN and GOING_DOWN. UNREGISTER is still unlocked
because of the SW devices using dev_xxx methods.

Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
---
 net/core/dev.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/core/dev.c b/net/core/dev.c
index 4b3d5cfdf6e0..9d49493f4fb5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1912,9 +1912,11 @@ static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
 						struct net_device *dev)
 {
 	if (dev->flags & IFF_UP) {
+		netdev_lock_ops(dev);
 		call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
 					dev);
 		call_netdevice_notifier(nb, NETDEV_DOWN, dev);
+		netdev_unlock_ops(dev);
 	}
 	call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
 }
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH net-next 2/6] net: dsa: hold instance lock on close-on-shutdown paths
From: Stanislav Fomichev @ 2026-06-30 18:21 UTC (permalink / raw)
  To: netdev; +Cc: davem, edumazet, kuba, pabeni
In-Reply-To: <20260630182129.1601784-1-sdf@fomichev.me>

netif_close_many will soon assert ops lock (for locked DOWN/GOING_DOWN).
Update dsa_switch_shutdown and dsa_user_netdevice_event to manually grab
and release the ops lock.

Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
---
 net/dsa/dsa.c  | 20 +++++++++++++++++---
 net/dsa/user.c | 19 +++++++++++++++++--
 2 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 9cb732f6b1e3..da53a666d4b8 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -18,6 +18,7 @@
 #include <linux/of.h>
 #include <linux/of_net.h>
 #include <net/dsa_stubs.h>
+#include <net/netdev_lock.h>
 #include <net/sch_generic.h>
 
 #include "conduit.h"
@@ -1620,10 +1621,23 @@ void dsa_switch_shutdown(struct dsa_switch *ds)
 
 	rtnl_lock();
 
-	dsa_switch_for_each_cpu_port(dp, ds)
-		list_add(&dp->conduit->close_list, &close_list);
+	dsa_switch_for_each_cpu_port(dp, ds) {
+		if (!(dp->conduit->flags & IFF_UP))
+			continue;
+		list_add_tail(&dp->conduit->close_list, &close_list);
+		netdev_lock_ops(dp->conduit);
+	}
+
+	netif_close_many(&close_list, false);
 
-	netif_close_many(&close_list, true);
+	while (!list_empty(&close_list)) {
+		struct net_device *conduit;
+
+		conduit = list_first_entry(&close_list, struct net_device,
+					   close_list);
+		netdev_unlock_ops(conduit);
+		list_del_init(&conduit->close_list);
+	}
 
 	dsa_switch_for_each_user_port(dp, ds) {
 		conduit = dsa_port_to_conduit(dp);
diff --git a/net/dsa/user.c b/net/dsa/user.c
index 8704c1a3a5b7..8ea47444d6d5 100644
--- a/net/dsa/user.c
+++ b/net/dsa/user.c
@@ -13,6 +13,7 @@
 #include <linux/of_net.h>
 #include <linux/of_mdio.h>
 #include <linux/mdio.h>
+#include <net/netdev_lock.h>
 #include <net/rtnetlink.h>
 #include <net/pkt_cls.h>
 #include <net/selftests.h>
@@ -3600,10 +3601,24 @@ static int dsa_user_netdevice_event(struct notifier_block *nb,
 			if (dp->cpu_dp != cpu_dp)
 				continue;
 
-			list_add(&dp->user->close_list, &close_list);
+			if (!(dp->user->flags & IFF_UP))
+				continue;
+
+			list_add_tail(&dp->user->close_list, &close_list);
+			netdev_lock_ops(dp->user);
 		}
 
-		netif_close_many(&close_list, true);
+		netif_close_many(&close_list, false);
+
+		while (!list_empty(&close_list)) {
+			struct net_device *user_dev;
+
+			user_dev = list_first_entry(&close_list,
+						    struct net_device,
+						    close_list);
+			netdev_unlock_ops(user_dev);
+			list_del_init(&user_dev->close_list);
+		}
 
 		return NOTIFY_OK;
 	}
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH net-next 3/6] net: mtk_eth_soc: hold instance lock around DMA-device-swap close
From: Stanislav Fomichev @ 2026-06-30 18:21 UTC (permalink / raw)
  To: netdev; +Cc: davem, edumazet, kuba, pabeni
In-Reply-To: <20260630182129.1601784-1-sdf@fomichev.me>

netif_close_many will soon assert ops lock (for locked DOWN/GOING_DOWN).
Update mtk_eth_set_dma_device to manually grab and release the ops lock.

Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
---
 drivers/net/ethernet/mediatek/mtk_eth_soc.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
index 5d291e50a47b..fe7610c42e5d 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -26,6 +26,7 @@
 #include <linux/bitfield.h>
 #include <net/dsa.h>
 #include <net/dst_metadata.h>
+#include <net/netdev_lock.h>
 #include <net/page_pool/helpers.h>
 #include <linux/genalloc.h>
 
@@ -5030,10 +5031,14 @@ void mtk_eth_set_dma_device(struct mtk_eth *eth, struct device *dma_dev)
 			continue;
 
 		list_add_tail(&dev->close_list, &dev_list);
+		netdev_lock_ops(dev);
 	}
 
 	netif_close_many(&dev_list, false);
 
+	list_for_each_entry(dev, &dev_list, close_list)
+		netdev_unlock_ops(dev);
+
 	eth->dma_dev = dma_dev;
 
 	list_for_each_entry_safe(dev, tmp, &dev_list, close_list) {
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH net-next 4/6] net: rtnetlink: take instance lock inside rtnl_configure_link
From: Stanislav Fomichev @ 2026-06-30 18:21 UTC (permalink / raw)
  To: netdev; +Cc: davem, edumazet, kuba, pabeni
In-Reply-To: <20260630182129.1601784-1-sdf@fomichev.me>

rtnl_configure_link calls __dev_change_flags() and __dev_notify_flags(),
both of which need the instance lock. rtnl_newlink_create grabs it but
stacked devices do not. Move the lock inside rtnl_configure_link.

Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
---
 net/core/rtnetlink.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 12aa3aa1688b..1b7d6f6b8b68 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -3660,14 +3660,16 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm,
 			u32 portid, const struct nlmsghdr *nlh)
 {
 	unsigned int old_flags, changed;
-	int err;
+	int err = 0;
+
+	netdev_lock_ops(dev);
 
 	old_flags = dev->flags;
 	if (ifm && (ifm->ifi_flags || ifm->ifi_change)) {
 		err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm),
 					 NULL);
 		if (err < 0)
-			return err;
+			goto out;
 	}
 
 	changed = old_flags ^ dev->flags;
@@ -3677,7 +3679,10 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm,
 	}
 
 	__dev_notify_flags(dev, old_flags, changed, portid, nlh);
-	return 0;
+
+out:
+	netdev_unlock_ops(dev);
+	return err;
 }
 EXPORT_SYMBOL(rtnl_configure_link);
 
@@ -3918,22 +3923,20 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm,
 		goto out;
 	}
 
-	netdev_lock_ops(dev);
-
 	err = rtnl_configure_link(dev, ifm, portid, nlh);
 	if (err < 0)
 		goto out_unregister;
 	if (tb[IFLA_MASTER]) {
+		netdev_lock_ops(dev);
 		err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack);
+		netdev_unlock_ops(dev);
 		if (err)
 			goto out_unregister;
 	}
 
-	netdev_unlock_ops(dev);
 out:
 	return err;
 out_unregister:
-	netdev_unlock_ops(dev);
 	if (ops->newlink) {
 		LIST_HEAD(list_kill);
 
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH net-next 5/6] net: require instance lock for NETDEV_DOWN/GOING_DOWN notifiers
From: Stanislav Fomichev @ 2026-06-30 18:21 UTC (permalink / raw)
  To: netdev; +Cc: davem, edumazet, kuba, pabeni
In-Reply-To: <20260630182129.1601784-1-sdf@fomichev.me>

Sprinkle a few asserts about ops lock: netif_close_many and __dev_notify_flags
should now consistently run under the lock.

Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
---
 Documentation/networking/netdevices.rst | 2 ++
 net/core/dev.c                          | 3 +++
 net/core/lock_debug.c                   | 4 ++--
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/Documentation/networking/netdevices.rst b/Documentation/networking/netdevices.rst
index d2a238f8cc8b..1bb68a73bb67 100644
--- a/Documentation/networking/netdevices.rst
+++ b/Documentation/networking/netdevices.rst
@@ -421,6 +421,8 @@ For devices with locked ops, currently only the following notifiers are
 * ``NETDEV_CHANGENAME``
 * ``NETDEV_REGISTER``
 * ``NETDEV_UP``
+* ``NETDEV_DOWN``
+* ``NETDEV_GOING_DOWN``
 
 The following notifiers are running without the lock:
 * ``NETDEV_UNREGISTER``
diff --git a/net/core/dev.c b/net/core/dev.c
index 9d49493f4fb5..714d05283500 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1802,6 +1802,7 @@ void netif_close_many(struct list_head *head, bool unlink)
 	__dev_close_many(head);
 
 	list_for_each_entry_safe(dev, tmp, head, close_list) {
+		netdev_assert_locked_ops_compat(dev);
 		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL);
 		call_netdevice_notifiers(NETDEV_DOWN, dev);
 		if (unlink)
@@ -9787,6 +9788,8 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
 {
 	unsigned int changes = dev->flags ^ old_flags;
 
+	netdev_assert_locked_ops_compat(dev);
+
 	if (gchanges)
 		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC, portid, nlh);
 
diff --git a/net/core/lock_debug.c b/net/core/lock_debug.c
index 8a81c5430705..abc4c00728b1 100644
--- a/net/core/lock_debug.c
+++ b/net/core/lock_debug.c
@@ -24,15 +24,15 @@ int netdev_debug_event(struct notifier_block *nb, unsigned long event,
 	case NETDEV_CHANGE:
 	case NETDEV_REGISTER:
 	case NETDEV_UP:
+	case NETDEV_DOWN:
+	case NETDEV_GOING_DOWN:
 		netdev_assert_locked_ops_compat(dev);
 		fallthrough;
-	case NETDEV_DOWN:
 	case NETDEV_REBOOT:
 	case NETDEV_UNREGISTER:
 	case NETDEV_CHANGEMTU:
 	case NETDEV_CHANGEADDR:
 	case NETDEV_PRE_CHANGEADDR:
-	case NETDEV_GOING_DOWN:
 	case NETDEV_FEAT_CHANGE:
 	case NETDEV_BONDING_FAILOVER:
 	case NETDEV_PRE_UP:
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH net-next 6/6] net: document NETDEV_UNREGISTER unlocked rationale
From: Stanislav Fomichev @ 2026-06-30 18:21 UTC (permalink / raw)
  To: netdev; +Cc: davem, edumazet, kuba, pabeni
In-Reply-To: <20260630182129.1601784-1-sdf@fomichev.me>

The lock-state table marks UNREGISTER as unlocked without saying
why. Add a short note that many handlers release the lowers via
dev_close().

Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
---
 Documentation/networking/netdevices.rst | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/Documentation/networking/netdevices.rst b/Documentation/networking/netdevices.rst
index 1bb68a73bb67..761cdb08bf0f 100644
--- a/Documentation/networking/netdevices.rst
+++ b/Documentation/networking/netdevices.rst
@@ -427,6 +427,14 @@ For devices with locked ops, currently only the following notifiers are
 The following notifiers are running without the lock:
 * ``NETDEV_UNREGISTER``
 
+Many ``NETDEV_UNREGISTER`` handlers release their lowers with
+``dev_close()``, which takes the instance lock itself. Holding
+the lock across UNREGISTER would deadlock.
+
+Moving UNREGISTER under the lock is mechanical: switch those
+callers to the ``netif_*()`` lock-held variants. Deferred to
+limit churn.
+
 There are no clear expectations for the remaining notifiers. Notifiers not on
 the list may run with or without the instance lock, potentially even invoking
 the same notifier type with and without the lock from different code paths.
-- 
2.53.0-Meta


^ permalink raw reply related

* Re: [PATCH net-next v5 5/5] ionic: Add .get_fec_stats ethtool handler
From: Eric Joyner @ 2026-06-30 18:24 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: netdev, Brett Creeley, Andrew Lunn, David S. Miller, Eric Dumazet,
	Paolo Abeni, Nikhil P . Rao, Simon Horman
In-Reply-To: <20260615182732.7d28e31a@kernel.org>

On 6/15/2026 6:27 PM, Jakub Kicinski wrote:
> Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.
> 
> 
> On Sun, 14 Jun 2026 13:53:03 -0700 Eric Joyner wrote:
>> +             if (fec_cw_err_bin != IONIC_STAT_INVALID)
>> +                     hist->values[i].sum = le64_to_cpu(fec_cw_err_bin);
>> +             else
>> +                     hist->values[i].sum = 0;
> 
> Setting the sum to zero is very much against the API contract with
> ethtool, no? Since we are just digging ourselves out of a hole with
> the link down events maybe let's be more strict going forward. :S
> 
> I think mlx5 had a similar problem of not knowing bucket count.
> You can put the bucket table in some driver struct and populate it
> at runtime.
> 
> Looking at this scheme, tho, I think the ethtool core is buggy for
> mlx5 :/ We should make a copy of the hist table, because we write
> the Netlink attrs after releasing the locks. Another call may start
> already and make the driver write its table in parallel.
> 
> Please send a fix for that if you can, or LMK if not, I'll chase
> one of the people involved in adding the fec stats.
> 
> I'll apply the first 4 patches here in the meantime.

Yeah, I'll work on a fix.

(Just to say that I didn't forget about this!)

- Eric



^ permalink raw reply

* [PATCH net v2] net/smc: fix UAF in smc_cdc_rx_handler() by pinning the socket
From: Xiang Mei @ 2026-06-30 18:32 UTC (permalink / raw)
  To: Sidraya Jayagond, D . Wythe, Dust Li, Wenjia Zhang,
	Mahanta Jambigi, Tony Lu, Wen Gu, netdev
  Cc: David S . Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, Hans Wippel, linux-rdma, linux-s390, Weiming Shi,
	Xiang Mei

smc_cdc_rx_handler() looks up the connection by token under the link
group's conns_lock, drops the lock, and then dereferences conn and the
smc_sock derived from it, ending in sock_hold(&smc->sk) inside
smc_cdc_msg_recv(). No reference is held across the lock release.

The only reference pinning the socket while the connection is
discoverable in the link group is taken in smc_lgr_register_conn()
(sock_hold) and dropped in __smc_lgr_unregister_conn() (sock_put), both
under conns_lock. Once the handler drops conns_lock, a concurrent
close() -> smc_release() -> smc_conn_free() -> smc_lgr_unregister_conn()
can drop that reference and free the smc_sock, so the handler's later
sock_hold() runs on freed memory:

  WARNING: lib/refcount.c:25 at refcount_warn_saturate
  Workqueue: rxe_wq do_work
   refcount_warn_saturate (lib/refcount.c:25)
   smc_cdc_msg_recv (net/smc/smc_cdc.c:430)
   smc_cdc_rx_handler (net/smc/smc_cdc.c:502)
   smc_wr_rx_tasklet_fn (net/smc/smc_wr.c:445)
   tasklet_action_common (kernel/softirq.c:938)
   handle_softirqs (kernel/softirq.c:622)
  Kernel panic - not syncing: panic_on_warn set

Only SMC-R is affected. The SMC-D receive tasklet is stopped by
tasklet_kill(&conn->rx_tsklet) in smc_conn_free() before the connection
is unregistered, so it cannot run concurrently with the free.

Take the socket reference while still holding conns_lock, so the
registration reference can no longer be the last one, and drop it once
the handler is done.

Fixes: d7b0e37c1ac1 ("net/smc: restructure CDC message reception")
Reported-by: Weiming Shi <bestswngs@gmail.com>
Assisted-by: Claude:claude-opus-4-8
Signed-off-by: Xiang Mei <xmei5@asu.edu>
---
v2:
- Take the reference under conns_lock, and compute smc once
- Initialize smc = NULL at declaration

 net/smc/smc_cdc.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index 619b3bab3824..32d6d03df321 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -470,9 +470,9 @@ static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf)
 {
 	struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
 	struct smc_cdc_msg *cdc = buf;
+	struct smc_sock *smc = NULL;
 	struct smc_connection *conn;
 	struct smc_link_group *lgr;
-	struct smc_sock *smc;
 
 	if (wc->byte_len < offsetof(struct smc_cdc_msg, reserved))
 		return; /* short message */
@@ -483,21 +483,26 @@ static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf)
 	lgr = smc_get_lgr(link);
 	read_lock_bh(&lgr->conns_lock);
 	conn = smc_lgr_find_conn(ntohl(cdc->token), lgr);
-	read_unlock_bh(&lgr->conns_lock);
-	if (!conn || conn->out_of_sync)
+	if (!conn || conn->out_of_sync) {
+		read_unlock_bh(&lgr->conns_lock);
 		return;
+	}
 	smc = container_of(conn, struct smc_sock, conn);
+	sock_hold(&smc->sk);
+	read_unlock_bh(&lgr->conns_lock);
 
 	if (cdc->prod_flags.failover_validation) {
 		smc_cdc_msg_validate(smc, cdc, link);
-		return;
+		goto out;
 	}
 	if (smc_cdc_before(ntohs(cdc->seqno),
 			   conn->local_rx_ctrl.seqno))
 		/* received seqno is old */
-		return;
+		goto out;
 
 	smc_cdc_msg_recv(smc, cdc);
+out:
+	sock_put(&smc->sk);
 }
 
 static struct smc_wr_rx_handler smc_cdc_rx_handlers[] = {
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v3 0/2] net/sched: sch_fq_pie: add per-flow class statistics
From: Hemendra M. Naik @ 2026-06-30 18:37 UTC (permalink / raw)
  To: netdev
  Cc: davem, edumazet, kuba, pabeni, horms, jiri, jhs, shuah,
	linux-kernel, linux-kselftest, vishy0777, tahiliani,
	Hemendra M. Naik

 FQ-PIE runs an independent PIE controller per flow but exposes no
 per-flow statistics. This series wires up fq_pie_class_ops to expose
 per-flow AQM state (prob, delay, deficit, avg_dq_rate) 
 via 'tc -s class show', following a similar pattern as FQ-CoDel.
---
 Changelog:

 v3: 
  - No changes since v2.
  - Resending as the previous submission was deferred when the net-next tree closed during review.
  - Corresponding iproute2 patch updated in response to review comments; no changes required for this patch.

 v2: 
 - Addressed ABI backward compatibility issue for tc_fq_pie_xstats. (https://lore.kernel.org/netdev/20260614125000.6058-1-hemendranaik@gmail.com/)

 v1: 
 - https://lore.kernel.org/netdev/20260531125314.22492-1-hemendranaik@gmail.com/

Hemendra M. Naik (2):
  net/sched: sch_fq_pie: add per-flow statistics via class ops
  selftests: tc-testing: add fq_pie per-flow class stats test

 include/uapi/linux/pkt_sched.h                |  29 ++++-
 net/sched/sch_fq_pie.c                        | 118 +++++++++++++++++-
 tools/include/uapi/linux/pkt_sched.h          |   4 +-
 .../tc-testing/tc-tests/qdiscs/fq_pie.json    |  22 ++++
 4 files changed, 163 insertions(+), 10 deletions(-)

-- 
2.34.1


^ permalink raw reply

* [PATCH net-next v3 1/2]     net/sched: sch_fq_pie: add per-flow statistics via class ops
From: Hemendra M. Naik @ 2026-06-30 18:37 UTC (permalink / raw)
  To: netdev
  Cc: davem, edumazet, kuba, pabeni, horms, jiri, jhs, shuah,
	linux-kernel, linux-kselftest, vishy0777, tahiliani,
	Hemendra M. Naik
In-Reply-To: <20260630183702.170798-1-hemendranaik@gmail.com>

    FQ-PIE schedules independent PIE controllers per flow but exposes no
    per-flow AQM state. Without class-level statistics there is no way to
    observe the per-flow drop probability, queue delay, deficit or
    dequeue rate from userspace.

    Extend tc_fq_pie_xstats to support both qdisc and class-level
    extended statistics.

    - Add enum with QDISC and CLASS type discriminators.
    - Add struct tc_fq_pie_cl_stats for per-flow metrics (prob,
      delay, deficit, avg_dq_rate, dq_rate_estimating).
    - Add empty struct tc_fq_pie_xqd_stats placeholder.

    Wire up fq_pie_class_ops (.walk, .dump, .dump_stats) so that
    'tc -s class show' against an fq_pie qdisc reports per-flow state:

      prob               per-flow PIE drop probability
      delay              per-flow queue sojourn time (microseconds)
      deficit            remaining DRR byte credits (signed integer)
      avg_dq_rate        dequeue rate estimate in bytes/second
                         (dq_rate_estimator mode only)
      dq_rate_estimating flag indicating active delay estimation mode

    Fix the 'delay' field comment in struct tc_pie_xstats from "in ms" to
    "in microseconds" to match the kernel's
    PSCHED_TICKS2NS / NSEC_PER_USEC conversion.

    Also correct the avg_dq_rate comment in tc_pie_xstats from
    "bits/pie_time" to "bytes/second" to match the actual kernel
    conversion (avg_dq_rate * PSCHED_TICKS_PER_SEC >> PIE_SCALE).

Signed-off-by: Hemendra M. Naik <hemendranaik@gmail.com>
Signed-off-by: Vishal Kamath <vishy0777@gmail.com>
Signed-off-by: Mohit P. Tahiliani <tahiliani@nitk.edu.in>
    ---
    Changelog:

    v3: 
     - No changes since v2.
     - Resending as the previous submission was deferred when the net-next tree closed during review.
     - Corresponding iproute2 patch updated in response to review comments; no changes required for this patch.

    v2: 
     - Addressed ABI backward compatibility issue for tc_fq_pie_xstats. (https://lore.kernel.org/netdev/20260614125000.6058-2-hemendranaik@gmail.com/)

    v1:
     - https://lore.kernel.org/netdev/20260531125314.22492-2-hemendranaik@gmail.com
---
 include/uapi/linux/pkt_sched.h       |  29 ++++++-
 net/sched/sch_fq_pie.c               | 118 +++++++++++++++++++++++++--
 tools/include/uapi/linux/pkt_sched.h |   4 +-
 3 files changed, 141 insertions(+), 10 deletions(-)

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 490efd288526..b18f274b2ec5 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -920,9 +920,9 @@ enum {
 
 struct tc_pie_xstats {
 	__u64 prob;			/* current probability */
-	__u32 delay;			/* current delay in ms */
+	__u32 delay;			/* current delay in microseconds */
 	__u32 avg_dq_rate;		/* current average dq_rate in
-					 * bits/pie_time
+					 * bytes/second
 					 */
 	__u32 dq_rate_estimating;	/* is avg_dq_rate being calculated? */
 	__u32 packets_in;		/* total number of packets enqueued */
@@ -953,6 +953,25 @@ enum {
 };
 #define TCA_FQ_PIE_MAX   (__TCA_FQ_PIE_MAX - 1)
 
+enum {
+	TCA_FQ_PIE_XSTATS_QDISC,
+	TCA_FQ_PIE_XSTATS_CLASS,
+};
+
+struct tc_fq_pie_cl_stats {
+	__u64 prob;			/* current probability */
+	__u32 delay;			/* current delay in microseconds */
+	__s32 deficit;		/* number of remaining byte credits */
+	__u32 avg_dq_rate;		/* current average dq_rate in
+					 * bytes/second
+					 */
+	__u32 dq_rate_estimating;	/* is avg_dq_rate being calculated? */
+};
+
+struct tc_fq_pie_xqd_stats {
+	/* placeholder for new qdisc-level stats */
+};
+
 struct tc_fq_pie_xstats {
 	__u32 packets_in;	/* total number of packets enqueued */
 	__u32 dropped;		/* packets dropped due to fq_pie_action */
@@ -963,6 +982,12 @@ struct tc_fq_pie_xstats {
 	__u32 new_flows_len;	/* count of flows in new list */
 	__u32 old_flows_len;	/* count of flows in old list */
 	__u32 memory_usage;	/* total memory across all queues */
+	__u32 type;
+	union {
+		struct tc_fq_pie_cl_stats class_stats;
+		struct tc_fq_pie_xqd_stats xqdisc_stats;
+	};
+
 };
 
 /* CBS */
diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c
index 72f48fa4010b..60e85c002ae7 100644
--- a/net/sched/sch_fq_pie.c
+++ b/net/sched/sch_fq_pie.c
@@ -330,7 +330,7 @@ static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt,
 	/* tupdate is in jiffies */
 	if (tb[TCA_FQ_PIE_TUPDATE])
 		WRITE_ONCE(q->p_params.tupdate,
-			usecs_to_jiffies(nla_get_u32(tb[TCA_FQ_PIE_TUPDATE])));
+			   usecs_to_jiffies(nla_get_u32(tb[TCA_FQ_PIE_TUPDATE])));
 
 	if (tb[TCA_FQ_PIE_ALPHA])
 		WRITE_ONCE(q->p_params.alpha,
@@ -509,7 +509,9 @@ static int fq_pie_dump(struct Qdisc *sch, struct sk_buff *skb)
 static int fq_pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 {
 	struct fq_pie_sched_data *q = qdisc_priv(sch);
-	struct tc_fq_pie_xstats st = { 0 };
+	struct tc_fq_pie_xstats st = {
+		.type	= TCA_FQ_PIE_XSTATS_QDISC,
+	};
 	struct list_head *pos;
 
 	sch_tree_lock(sch);
@@ -517,10 +519,10 @@ static int fq_pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 	st.packets_in	= q->stats.packets_in;
 	st.overlimit	= q->stats.overlimit;
 	st.overmemory	= q->overmemory;
-	st.dropped	= q->stats.dropped;
-	st.ecn_mark	= q->stats.ecn_mark;
-	st.new_flow_count = q->new_flow_count;
-	st.memory_usage   = q->memory_usage;
+	st.dropped		= q->stats.dropped;
+	st.ecn_mark		= q->stats.ecn_mark;
+	st.new_flow_count	= q->new_flow_count;
+	st.memory_usage	= q->memory_usage;
 
 	list_for_each(pos, &q->new_flows)
 		st.new_flows_len++;
@@ -561,7 +563,111 @@ static void fq_pie_destroy(struct Qdisc *sch)
 	kvfree(q->flows);
 }
 
+static struct Qdisc *fq_pie_leaf(struct Qdisc *sch, unsigned long arg)
+{
+	return NULL;
+}
+
+static unsigned long fq_pie_find(struct Qdisc *sch, u32 classid)
+{
+	return 0;
+}
+
+static unsigned long fq_pie_bind(struct Qdisc *sch, unsigned long parent,
+				 u32 classid)
+{
+	return 0;
+}
+
+static void fq_pie_unbind(struct Qdisc *q, unsigned long cl)
+{
+}
+
+static struct tcf_block *fq_pie_tcf_block(struct Qdisc *sch, unsigned long cl,
+					  struct netlink_ext_ack *extack)
+{
+	struct fq_pie_sched_data *q = qdisc_priv(sch);
+
+	if (cl)
+		return NULL;
+	return q->block;
+}
+
+static int fq_pie_dump_class(struct Qdisc *sch, unsigned long cl,
+			     struct sk_buff *skb, struct tcmsg *tcm)
+{
+	tcm->tcm_handle |= TC_H_MIN(cl);
+	return 0;
+}
+
+static int fq_pie_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+				   struct gnet_dump *d)
+{
+	struct fq_pie_sched_data *q = qdisc_priv(sch);
+	struct gnet_stats_queue qs = { 0 };
+	struct tc_fq_pie_xstats xstats;
+	u32 idx = cl - 1;
+
+	if (idx < q->flows_cnt) {
+		const struct fq_pie_flow *flow = &q->flows[idx];
+
+		memset(&xstats, 0, sizeof(xstats));
+		xstats.type = TCA_FQ_PIE_XSTATS_CLASS;
+		xstats.class_stats.prob = READ_ONCE(flow->vars.prob) << BITS_PER_BYTE;
+		xstats.class_stats.delay =
+			((u32)PSCHED_TICKS2NS(READ_ONCE(flow->vars.qdelay))) /
+			NSEC_PER_USEC;
+		xstats.class_stats.deficit = READ_ONCE(flow->deficit);
+		xstats.class_stats.dq_rate_estimating =
+			READ_ONCE(q->p_params.dq_rate_estimator);
+
+		if (xstats.class_stats.dq_rate_estimating) {
+			xstats.class_stats.avg_dq_rate =
+				READ_ONCE(flow->vars.avg_dq_rate) *
+				(PSCHED_TICKS_PER_SEC) >> PIE_SCALE;
+		}
+
+		qs.qlen    = READ_ONCE(flow->qlen);
+		qs.backlog = READ_ONCE(flow->backlog);
+	}
+	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
+		return -1;
+	if (idx < q->flows_cnt)
+		return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
+	return 0;
+}
+
+static void fq_pie_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	struct fq_pie_sched_data *q = qdisc_priv(sch);
+	unsigned int i;
+
+	if (arg->stop)
+		return;
+
+	for (i = 0; i < q->flows_cnt; i++) {
+		if (list_empty(&q->flows[i].flowchain)) {
+			arg->count++;
+			continue;
+		}
+		if (!tc_qdisc_stats_dump(sch, i + 1, arg))
+			break;
+	}
+}
+
+static const struct Qdisc_class_ops fq_pie_class_ops = {
+	.leaf		=	fq_pie_leaf,
+	.find		=	fq_pie_find,
+	.tcf_block	=	fq_pie_tcf_block,
+	.bind_tcf	=	fq_pie_bind,
+	.unbind_tcf	=	fq_pie_unbind,
+	.dump		=	fq_pie_dump_class,
+	.dump_stats	=	fq_pie_dump_class_stats,
+	.walk		=	fq_pie_walk,
+};
+
 static struct Qdisc_ops fq_pie_qdisc_ops __read_mostly = {
+	.cl_ops		=	&fq_pie_class_ops,
 	.id		= "fq_pie",
 	.priv_size	= sizeof(struct fq_pie_sched_data),
 	.enqueue	= fq_pie_qdisc_enqueue,
diff --git a/tools/include/uapi/linux/pkt_sched.h b/tools/include/uapi/linux/pkt_sched.h
index 587481a19433..45ea10026742 100644
--- a/tools/include/uapi/linux/pkt_sched.h
+++ b/tools/include/uapi/linux/pkt_sched.h
@@ -847,8 +847,8 @@ enum {
 
 struct tc_pie_xstats {
 	__u32 prob;             /* current probability */
-	__u32 delay;            /* current delay in ms */
-	__u32 avg_dq_rate;      /* current average dq_rate in bits/pie_time */
+	__u32 delay;            /* current delay in microseconds */
+	__u32 avg_dq_rate;      /* current average dq_rate in bytes/second */
 	__u32 packets_in;       /* total number of packets enqueued */
 	__u32 dropped;          /* packets dropped due to pie_action */
 	__u32 overlimit;        /* dropped due to lack of space in queue */
-- 
2.34.1


^ permalink raw reply related

* [PATCH net-next v3 2/2] selftests: tc-testing: add fq_pie per-flow class stats test
From: Hemendra M. Naik @ 2026-06-30 18:37 UTC (permalink / raw)
  To: netdev
  Cc: davem, edumazet, kuba, pabeni, horms, jiri, jhs, shuah,
	linux-kernel, linux-kselftest, vishy0777, tahiliani,
	Hemendra M. Naik
In-Reply-To: <20260630183702.170798-1-hemendranaik@gmail.com>

Add a tc-testing entry (id: 83c0) to verify the fq_pie class ops
wired up in the previous patch do not crash and integrate cleanly
with the tc class show path.

The test creates an fq_pie root qdisc on a dummy interface and runs
'tc -s class show'.

Signed-off-by: Hemendra M. Naik <hemendranaik@gmail.com>
Signed-off-by: Vishal Kamath <vishy0777@gmail.com>
Signed-off-by: Mohit P. Tahiliani <tahiliani@nitk.edu.in>
---

Changelog:

v3: 
 - No changes since v2.
 - Resending as the previous submission was deferred when the net-next tree closed during review.
 - Corresponding iproute2 patch updated in response to review comments; no changes required for this patch.

v2: 
 - No changes since v1. (https://lore.kernel.org/netdev/20260614125000.6058-3-hemendranaik@gmail.com/)

v1: 
 - https://lore.kernel.org/netdev/20260531125314.22492-3-hemendranaik@gmail.com/
---
 .../tc-testing/tc-tests/qdiscs/fq_pie.json    | 22 +++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_pie.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_pie.json
index 229fe1bf4a90..88139f429430 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_pie.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_pie.json
@@ -40,5 +40,27 @@
         "matchPattern": "qdisc fq_pie 1: root refcnt [0-9]+ limit 1p",
         "matchCount": "1",
         "teardown": ["$TC qdisc del dev $DEV1 handle 1: root"]
+    },
+    {
+        "id": "83c0",
+        "name": "FQ-PIE class stats accessible via tc class show",
+        "category": [
+            "qdisc",
+            "fq_pie"
+        ],
+        "plugins": {
+            "requires": "nsPlugin"
+        },
+        "setup": [
+            "$TC qdisc add dev $DUMMY handle 1: root fq_pie"
+        ],
+        "cmdUnderTest": "$TC -s class show dev $DUMMY",
+        "expExitCode": "0",
+        "verifyCmd": "$TC -s class show dev $DUMMY",
+        "matchPattern": "class fq_pie",
+        "matchCount": "0",
+        "teardown": [
+            "$TC qdisc del dev $DUMMY handle 1: root"
+        ]
     }
 ]
-- 
2.34.1


^ permalink raw reply related

* [PATCH iproute2-next v3] Add support for printing per-flow PIE statistics exposed by the kernel via the new TCA_FQ_PIE_XSTATS_CLASS type in tc_fq_pie_xstats.
From: Hemendra M. Naik @ 2026-06-30 18:44 UTC (permalink / raw)
  To: netdev; +Cc: jiri, jhs, linux-kernel, vishy0777, tahiliani, Hemendra M. Naik

'tc -s class show' against an fq_pie qdisc now prints:

 prob           drop probability for the flow
 delay          per-flow queue sojourn time (microseconds)
 deficit        remaining DRR byte credits (signed integer)
 avg_dq_rate    dequeue rate estimate in bytes/second
             	(dq_rate_estimator mode only)

avg_dq_rate is formatted using tc_print_rate(), which converts the
kernel's bytes/second value to a human-readable bits/second string
(e.g. '3906Kbit'), consistent with how other tc schedulers display
rate fields. Apply the same fix to tc/q_pie.c, where avg_dq_rate was
also printed as a raw integer without a unit.

Update the UAPI header to mirror tc_fq_pie_cl_stats from the kernel.
Fix the 'delay' field comment in struct tc_pie_xstats from "in ms" to
"in microseconds" to match the kernel's
PSCHED_TICKS2NS / NSEC_PER_USEC conversion.

Add a 'tc -s class show' example to tc-fq_pie(8) with dq_rate_estimator
enabled, showing all per-flow fields (prob, delay, deficit, avg_dq_rate)
across multiple flows. Update tc-pie(8) avg_dq_rate example from a raw
integer to a formatted bits/second string.

The corresponding kernel patch can be viewed here:
https://lore.kernel.org/netdev/20260630183702.170798-1-hemendranaik@gmail.com/

Signed-off-by: Hemendra M. Naik <hemendranaik@gmail.com>
Signed-off-by: Vishal Kamath <vishy0777@gmail.com>
Signed-off-by: Mohit P. Tahiliani <tahiliani@nitk.edu.in>

---
Changelog:

v3: 
 - Moved print_nl() into each xstats type block to preserve output formatting.

v2:
 - Addressed ABI backward compatibility issue. (https://lore.kernel.org/netdev/20260614130729.10076-1-hemendranaik@gmail.com/)

v1:
 - https://lore.kernel.org/netdev/20260531131411.28213-1-hemendranaik@gmail.com/
---
 include/uapi/linux/pkt_sched.h | 29 +++++++++++++++--
 man/man8/tc-fq_pie.8           | 18 +++++++++++
 man/man8/tc-pie.8              |  2 +-
 tc/q_fq_pie.c                  | 59 +++++++++++++++++++++++-----------
 tc/q_pie.c                     |  4 +--
 5 files changed, 88 insertions(+), 24 deletions(-)

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index d1f67ef8..f56f24ad 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -920,9 +920,9 @@ enum {
 
 struct tc_pie_xstats {
 	__u64 prob;			/* current probability */
-	__u32 delay;			/* current delay in ms */
+	__u32 delay;			/* current delay in microseconds */
 	__u32 avg_dq_rate;		/* current average dq_rate in
-					 * bits/pie_time
+					 * bytes/second
 					 */
 	__u32 dq_rate_estimating;	/* is avg_dq_rate being calculated? */
 	__u32 packets_in;		/* total number of packets enqueued */
@@ -953,6 +953,25 @@ enum {
 };
 #define TCA_FQ_PIE_MAX   (__TCA_FQ_PIE_MAX - 1)
 
+enum {
+	TCA_FQ_PIE_XSTATS_QDISC,
+	TCA_FQ_PIE_XSTATS_CLASS,
+};
+
+struct tc_fq_pie_cl_stats {
+	__u64 prob;			/* current probability */
+	__u32 delay;			/* current delay in microseconds */
+	__s32 deficit;		/* number of remaining byte credits */
+	__u32 avg_dq_rate;		/* current average dq_rate in
+					 * bytes/second
+					 */
+	__u32 dq_rate_estimating;	/* is avg_dq_rate being calculated? */
+};
+
+struct tc_fq_pie_xqd_stats {
+	/* placeholder for new qdisc-level stats */
+};
+
 struct tc_fq_pie_xstats {
 	__u32 packets_in;	/* total number of packets enqueued */
 	__u32 dropped;		/* packets dropped due to fq_pie_action */
@@ -963,6 +982,12 @@ struct tc_fq_pie_xstats {
 	__u32 new_flows_len;	/* count of flows in new list */
 	__u32 old_flows_len;	/* count of flows in old list */
 	__u32 memory_usage;	/* total memory across all queues */
+	__u32 type;
+	union {
+		struct tc_fq_pie_cl_stats class_stats;
+		struct tc_fq_pie_xqd_stats xqdisc_stats;
+	};
+
 };
 
 /* CBS */
diff --git a/man/man8/tc-fq_pie.8 b/man/man8/tc-fq_pie.8
index 457a56bb..bf988f5f 100644
--- a/man/man8/tc-fq_pie.8
+++ b/man/man8/tc-fq_pie.8
@@ -153,6 +153,24 @@ dq_rate_estimator
   pkts_in 6082 overlimit 0 overmemory 0 dropped 4 ecn_mark 0
   new_flow_count 94 new_flows_len 0 old_flows_len 8 memory_used 1157632
 
+# tc qdisc add dev eth0 parent 100:1 handle 200: fq_pie target 2ms flows 3
+.br
+# tc -s class show dev eth0
+.br
+class fq_pie 200:2 parent 200:
+ Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
+ backlog 22800b 2p requeues 0
+  prob 0.57679 delay 2.38ms
+
+# tc qdisc add dev eth0 parent 100:1 handle 200: fq_pie target 2ms flows 3 dq_rate_estimator
+.br
+# tc -s class show dev eth0
+.br
+class fq_pie 200:2 parent 200:
+ Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
+ backlog 22800b 2p requeues 0
+  prob 0.57679 delay 2.38ms avg_dq_rate 10742Kbit
+
 .SH SEE ALSO
 .BR tc (8),
 .BR tc-pie (8),
diff --git a/man/man8/tc-pie.8 b/man/man8/tc-pie.8
index 5a8c7820..f3a09616 100644
--- a/man/man8/tc-pie.8
+++ b/man/man8/tc-pie.8
@@ -115,7 +115,7 @@ is turned off.
    qdisc pie 8036: dev eth0 root refcnt 2 limit 1000p target 15.0ms tupdate 16.0ms alpha 2 beta 20
     Sent 63947420 bytes 42414 pkt (dropped 41, overlimits 0 requeues 0)
     backlog 271006b 179p requeues 0
-     prob 0.000092 delay 22200us avg_dq_rate 12145996
+     prob 0.000092 delay 22200us avg_dq_rate 10742Kbit
      pkts_in 41 overlimit 343 dropped 0 maxq 50 ecn_mark 0
 
  # tc qdisc add dev eth0 root pie limit 100 target 20ms tupdate 30ms ecn
diff --git a/tc/q_fq_pie.c b/tc/q_fq_pie.c
index dc2710cd..1b89ee3a 100644
--- a/tc/q_fq_pie.c
+++ b/tc/q_fq_pie.c
@@ -274,6 +274,8 @@ static int fq_pie_print_xstats(const struct qdisc_util *qu, FILE *f,
 {
 	struct tc_fq_pie_xstats _st = {}, *st;
 
+	SPRINT_BUF(b1);
+
 	if (xstats == NULL)
 		return 0;
 
@@ -283,25 +285,44 @@ static int fq_pie_print_xstats(const struct qdisc_util *qu, FILE *f,
 		st = &_st;
 	}
 
-	print_uint(PRINT_ANY, "pkts_in", "  pkts_in %u",
-		   st->packets_in);
-	print_uint(PRINT_ANY, "overlimit", " overlimit %u",
-		   st->overlimit);
-	print_uint(PRINT_ANY, "overmemory", " overmemory %u",
-		   st->overmemory);
-	print_uint(PRINT_ANY, "dropped", " dropped %u",
-		   st->dropped);
-	print_uint(PRINT_ANY, "ecn_mark", " ecn_mark %u",
-		   st->ecn_mark);
-	print_nl();
-	print_uint(PRINT_ANY, "new_flow_count", "  new_flow_count %u",
-		   st->new_flow_count);
-	print_uint(PRINT_ANY, "new_flows_len", " new_flows_len %u",
-		   st->new_flows_len);
-	print_uint(PRINT_ANY, "old_flows_len", " old_flows_len %u",
-		   st->old_flows_len);
-	print_uint(PRINT_ANY, "memory_used", " memory_used %u",
-		   st->memory_usage);
+	if (!st->type || st->type == TCA_FQ_PIE_XSTATS_QDISC) {
+		print_uint(PRINT_ANY, "pkts_in", "  pkts_in %u",
+			   st->packets_in);
+		print_uint(PRINT_ANY, "overlimit", " overlimit %u",
+			   st->overlimit);
+		print_uint(PRINT_ANY, "overmemory", " overmemory %u",
+			   st->overmemory);
+		print_uint(PRINT_ANY, "dropped", " dropped %u",
+			   st->dropped);
+		print_uint(PRINT_ANY, "ecn_mark", " ecn_mark %u",
+			   st->ecn_mark);
+		print_nl();
+		print_uint(PRINT_ANY, "new_flow_count", "  new_flow_count %u",
+			   st->new_flow_count);
+		print_uint(PRINT_ANY, "new_flows_len", " new_flows_len %u",
+			   st->new_flows_len);
+		print_uint(PRINT_ANY, "old_flows_len", " old_flows_len %u",
+			   st->old_flows_len);
+		print_uint(PRINT_ANY, "memory_used", " memory_used %u",
+			   st->memory_usage);
+		print_nl();
+	}
+
+	if (st->type == TCA_FQ_PIE_XSTATS_CLASS) {
+		print_float(PRINT_ANY, "prob", " prob %lg",
+			    (double)st->class_stats.prob / (double)UINT64_MAX);
+		print_uint(PRINT_JSON, "delay", NULL, st->class_stats.delay);
+		print_string(PRINT_FP, NULL, " delay %s",
+			     sprint_time(st->class_stats.delay, b1));
+		print_int(PRINT_ANY, "deficit", " deficit %d",
+			  st->class_stats.deficit);
+
+		if (st->class_stats.dq_rate_estimating) {
+			tc_print_rate(PRINT_ANY, "avg_dq_rate", " avg_dq_rate %s",
+				      st->class_stats.avg_dq_rate);
+		}
+		print_nl();
+	}
 
 	return 0;
 
diff --git a/tc/q_pie.c b/tc/q_pie.c
index 04c9aa61..abae1ced 100644
--- a/tc/q_pie.c
+++ b/tc/q_pie.c
@@ -220,8 +220,8 @@ static int pie_print_xstats(const struct qdisc_util *qu, FILE *f,
 	print_string(PRINT_FP, NULL, " delay %s", sprint_time(st->delay, b1));
 
 	if (st->dq_rate_estimating)
-		print_uint(PRINT_ANY, "avg_dq_rate", " avg_dq_rate %u",
-			   st->avg_dq_rate);
+		tc_print_rate(PRINT_ANY, "avg_dq_rate", " avg_dq_rate %s",
+			      st->avg_dq_rate);
 
 	print_nl();
 	print_uint(PRINT_ANY, "pkts_in", "  pkts_in %u", st->packets_in);
-- 
2.34.1


^ permalink raw reply related

* Re: [PATCH net-next 0/9] Switch support
From: Jakub Kicinski @ 2026-06-30 18:52 UTC (permalink / raw)
  To: Ratheesh Kannoth
  Cc: linux-kernel, netdev, andrew+netdev, davem, edumazet, pabeni,
	sgoutham
In-Reply-To: <20260630024715.4124281-1-rkannoth@marvell.com>

On Tue, 30 Jun 2026 08:17:06 +0530 Ratheesh Kannoth wrote:
> Marvell OcteonTX2 switch hardware is capable of accelerating L2, L3, and
> flow. When representors are enabled through devlink, a logical port is
> created in switch hardware for each representor device.

There's a number of allocations here which are missing a NULL check.

I'm sure sashiko will also have a semi-infinite number of complaints,
so please check that yourself.
-- 
pw-bot: cr

^ permalink raw reply

* Re: [PATCH net-next v7 5/5] veth: time-based BQL completion coalescing via ethtool tx-usecs
From: Simon Schippers @ 2026-06-30 19:07 UTC (permalink / raw)
  To: Jonas Köppeler, hawk, netdev
  Cc: kernel-team, Andrew Lunn, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Alexei Starovoitov, Daniel Borkmann,
	John Fastabend, Stanislav Fomichev, linux-kernel, bpf
In-Reply-To: <c4154d9c-8930-436d-b8b8-01951e8bd9f4@tu-berlin.de>

On 6/30/26 16:00, Jonas Köppeler wrote:
> On 6/13/26 4:14 PM, Simon Schippers wrote:
>> On 6/12/26 10:35, hawk@kernel.org wrote:
>>> From: Simon Schippers <simon.schippers@tu-dortmund.de>
>>>
>>> Per-packet BQL completion forces DQL to converge on limit=2, causing
>>> excessive NAPI scheduling overhead and qdisc requeues.
>>>
>>> Accumulate BQL completions and flush them when a configurable time
>>> threshold (tx-usecs) is exceeded, letting DQL discover a limit that
>>> bounds actual queuing delay to the configured interval. Coalescing
>>> state persists across NAPI polls in struct veth_rq so completions can
>>> accumulate beyond a single budget=64 cycle.
>>>
>>> The flush condition is:
>>>
>>> state->time + bql_flush_ns <= current_time || state->n_bql > dql.limit
>>>
>>> Flushing when n_bql exceeds dql.limit handles BQL starvation.
>>>
>>> The comparison is strictly greater-than because netdev_tx_sent_queue()
>>> always lets the producer exceed the limit by one before it stops, so
>>> n_bql == dql.limit is a normal in-flight state. dql.limit lives in
>>> the same cacheline as the completion path, so the check is cheap.
>>>
>>> Add ethtool tx-usecs support for runtime tuning. Default is 100 us;
>>> setting tx-usecs to 0 disables coalescing and falls back to per-packet
>>> completion.
>>>
>>>    ethtool -C <veth-dev> tx-usecs 500  # 500us coalescing
>>>    ethtool -C <veth-dev> tx-usecs 0    # per-packet (no coalescing)
>>>
>>> Co-developed-by: Jesper Dangaard Brouer <hawk@kernel.org>
>>> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
>>> Co-developed-by: Jonas Köppeler <j.koeppeler@tu-berlin.de>
>>> Signed-off-by: Jonas Köppeler <j.koeppeler@tu-berlin.de>
>>> Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
>>> ---
>>>   drivers/net/veth.c | 123 ++++++++++++++++++++++++++++++++++++++++++---
>>>   1 file changed, 117 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/drivers/net/veth.c b/drivers/net/veth.c
>>> index 2473f730734b..c62d87a8402c 100644
>>> --- a/drivers/net/veth.c
>>> +++ b/drivers/net/veth.c
>>> @@ -28,6 +28,7 @@
>>>   #include <linux/bpf_trace.h>
>>>   #include <linux/net_tstamp.h>
>>>   #include <linux/skbuff_ref.h>
>>> +#include <linux/sched/clock.h>
>>>   #include <net/page_pool/helpers.h>
>>>     #define DRV_NAME    "veth"
>>> @@ -50,6 +51,7 @@
>>>    * delay => 64 * 250 ms = 16 s.
>>>    */
>>>   #define VETH_WATCHDOG_TIMEOUT_MS    (64 * 250)
>>> +#define VETH_BQL_COAL_TX_USECS    100 /* default tx-usecs for BQL batching*/
>>>     struct veth_stats {
>>>       u64    rx_drops;
>>> @@ -69,6 +71,11 @@ struct veth_rq_stats {
>>>       struct u64_stats_sync    syncp;
>>>   };
>>>   +struct veth_bql_state {
>>> +    u64    time;    /* sched_clock() when current coalescing window started */
>>> +    uint    n_bql;    /* BQL completions batched in the current window */
>>> +};
>>> +
>>>   struct veth_rq {
>>>       struct napi_struct    xdp_napi;
>>>       struct napi_struct __rcu *napi; /* points to xdp_napi when the latter is initialized */
>>> @@ -76,6 +83,7 @@ struct veth_rq {
>>>       struct bpf_prog __rcu    *xdp_prog;
>>>       struct xdp_mem_info    xdp_mem;
>>>       struct veth_rq_stats    stats;
>>> +    struct veth_bql_state    bql_state;
>>>       bool            rx_notify_masked;
>>>       struct ptr_ring        xdp_ring;
>>>       struct xdp_rxq_info    xdp_rxq;
>>> @@ -88,6 +96,7 @@ struct veth_priv {
>>>       struct bpf_prog        *_xdp_prog;
>>>       struct veth_rq        *rq;
>>>       unsigned int        requested_headroom;
>>> +    unsigned int        tx_coal_usecs;    /* BQL completion coalescing */
>>>   };
>>>     struct veth_xdp_tx_bq {
>>> @@ -272,7 +281,56 @@ static void veth_get_channels(struct net_device *dev,
>>>   static int veth_set_channels(struct net_device *dev,
>>>                    struct ethtool_channels *ch);
>>>   +static int veth_get_coalesce(struct net_device *dev,
>>> +                 struct ethtool_coalesce *ec,
>>> +                 struct kernel_ethtool_coalesce *kernel_coal,
>>> +                 struct netlink_ext_ack *extack)
>>> +{
>>> +    struct veth_priv *priv = netdev_priv(dev);
>>> +
>>> +    ec->tx_coalesce_usecs = priv->tx_coal_usecs;
>>> +    return 0;
>>> +}
>>> +
>>> +static int veth_set_coalesce(struct net_device *dev,
>>> +                 struct ethtool_coalesce *ec,
>>> +                 struct kernel_ethtool_coalesce *kernel_coal,
>>> +                 struct netlink_ext_ack *extack)
>>> +{
>>> +    struct veth_priv *priv = netdev_priv(dev);
>>> +    struct net_device *peer;
>>> +
>>> +    /* The coalescing window delays BQL completions, so keep tx-usecs well
>>> +     * below the tx_timeout watchdog; otherwise a large value could stall a
>>> +     * stopped queue long enough to trip a false watchdog timeout. Cap at
>>> +     * half the watchdog to leave a generous safety margin. tx-usecs is
>>> +     * microseconds, the watchdog is milliseconds.
>>> +     */
>>> +    if (ec->tx_coalesce_usecs > VETH_WATCHDOG_TIMEOUT_MS / 2 * USEC_PER_MSEC) {
>>> +        NL_SET_ERR_MSG_MOD(extack,
>>> +                   "tx-usecs must stay below half the tx_timeout watchdog");
>>> +        return -ERANGE;
>>> +    }
>>> +
>>> +    /* Paired with READ_ONCE in veth_xdp_rcv(). */
>>> +    WRITE_ONCE(priv->tx_coal_usecs, ec->tx_coalesce_usecs);
>>> +
>>> +    /* veth_xdp_rcv() reads each device's own value, so mirror it onto
>>> +     * the peer to keep the pair symmetric: both directions coalesce
>>> +     * with the same tx-usecs. Called under RTNL, rtnl_dereference() is safe.
>>> +     */
>>> +    peer = rtnl_dereference(priv->peer);
>>> +    if (peer) {
>>> +        struct veth_priv *peer_priv = netdev_priv(peer);
>>> +
>>> +        WRITE_ONCE(peer_priv->tx_coal_usecs, ec->tx_coalesce_usecs);
>>> +    }
>>> +
>>> +    return 0;
>>> +}
>>> +
>>>   static const struct ethtool_ops veth_ethtool_ops = {
>>> +    .supported_coalesce_params = ETHTOOL_COALESCE_TX_USECS,
>>>       .get_drvinfo        = veth_get_drvinfo,
>>>       .get_link        = ethtool_op_get_link,
>>>       .get_strings        = veth_get_strings,
>>> @@ -282,6 +340,8 @@ static const struct ethtool_ops veth_ethtool_ops = {
>>>       .get_ts_info        = ethtool_op_get_ts_info,
>>>       .get_channels        = veth_get_channels,
>>>       .set_channels        = veth_set_channels,
>>> +    .get_coalesce        = veth_get_coalesce,
>>> +    .set_coalesce        = veth_set_coalesce,
>>>   };
>>>     /* general routines */
>>> @@ -969,13 +1029,54 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
>>>       return NULL;
>>>   }
>>>   +static void veth_bql_maybe_complete(struct veth_bql_state *state,
>>> +                    struct netdev_queue *peer_txq,
>>> +                    u64 bql_flush_ns)
>>> +{
>>> +    u64 current_time;
>>> +
>>> +    /* There is no reason to complete with 0 and
>>> +     * peer_txq could go away.
>>> +     */
>>> +    if (!state->n_bql || !peer_txq)
>>> +        return;
>>> +
>>> +    current_time = sched_clock();
>>> +
>>> +    /* We complete if:
>>> +     * 1. We reach bql_flush_ns.
>>> +     * 2. We potentially have BQL starvation.
>>> +     */
>>> +    if (state->time + bql_flush_ns <= current_time ||
>>> +        state->n_bql > peer_txq->dql.limit) {
>>
> Indeed, this does not compile when CONFIG_BQL is not set. I think we should just bring the 'queue is empty + queue is stopped' check from v6 back at the end of the poll and remove the n_bql > dql.limit check.

We would put #ifdef CONFIG_BQL around that logic as well.

> It also does not feel obvious why this handles the starvation case. This only works because the producer previously went over the limit and was stopped. So more than 'limit' packets have been enqueued to the ring, and they are eventually drained when this check is true.

I think it just needs some comment tweaking:

/* We complete if:
 * 1. We reach bql_flush_ns.
 * 2. We have BQL starvation. This means that the queue was over-limit
 *    in the last interval, and there is no more data in the queue,
 *    which is equivalent to having consumed more than limit items.
 */ 

> By removing this we can also avoid accessing dql internal members, but if you don't think that's a problem we can leave as is.

I agree accessing dql internal variables is not perfect.

That is why I have locally implemented DQL for software interfaces in
a generic way inside dynamic_queue_limits.{h,c}.
I was able to squeeze the time and n_bql variables into the completion
cacheline of the dql struct by moving around variables.
The logic applies inside dql_completed() if enabled.
With this we just have to call netdev_completed_queue().
Also it allows for per-queue tweaking of tx_usecs via sysfs.
Works well for me, can share it if we want to use it.

> 
> Further, this only works if VETH_BQL_UNIT stays 1; otherwise it will never fire. Anyway, it is still necessary to check for CONFIG_BQL. But we could solve this by adding VETH_BQL_UNIT to n_bql instead of 1. This is also safe from any overflows, since limit is bound to limit_max, inflight is always less than limit + 1*VETH_BQL_UNIT and n_bql <= inflight.

You are right.

But I think there is no reason for VETH_BQL_UNIT anyway.
There should be no difference in the BQL algorithm, I personally
would replace VETH_BQL_UNIT with a hard-coded 1.

> 
> A version that brings back the 'queue-empty' check and keeps most of the current logic (so a mixture of v6 and v7) resulted in the same performance on an x86_64 architecture.
> 
>> Both Sashiko-Nipa and Sashiko-Gemini are right, this is missing a
>> #ifdef CONFIG_BQL. Not sure what is the best way to add them.
>> And for the struct we could maybe do:
>>
>> #ifdef CONFIG_BQL
>> struct veth_bql_state {
>>      u64    time;    /* sched_clock() when current coalescing window started */
>>      uint    n_bql;    /* BQL completions batched in the current window */
>> };
>> #else
>> struct veth_bql_state {};
>> #endif
> Regarding the configs: we can just do something along those lines.
> struct veth_rq {
> ...
> #ifdef CONFIG_BQL
>     struct veth_bql_state        dql;
> #endif
> ...
> }
> 
> and we put the rest of the code that accesses or performs an action regarding bql in some functions and do it like in netdev_* functions with
> 
> Function-Signature()
> {
> #ifdef CONFIG_BQL
> // Code
> #endif
> }
> 
> Wdyt?
> - Jonas

Yes, we have to. Unless we put it into dynamic_queue_limits.{h,c}
of course :^)

Thanks,
Simon

>>
>>> +        netdev_tx_completed_queue(peer_txq, state->n_bql,
>>> +                      state->n_bql * VETH_BQL_UNIT);
>>> +        state->time = current_time;
>>> +        state->n_bql = 0;
>>> +    }
>>> +}
>>> +
>>>   static int veth_xdp_rcv(struct veth_rq *rq, int budget,
>>>               struct veth_xdp_tx_bq *bq,
>>>               struct veth_stats *stats,
>>>               struct netdev_queue *peer_txq)
>>>   {
>>> +    struct veth_priv *priv = netdev_priv(rq->dev);
>>> +    struct veth_bql_state *state = &rq->bql_state;
>>>       int i, done = 0, n_xdpf = 0;
>>>       void *xdpf[VETH_XDP_BATCH];
>>> +    u64 bql_flush_ns;
>>> +
>>> +    /* Mirrored to both peers; paired with WRITE_ONCE() in veth_set_coalesce */
>>> +    bql_flush_ns = (u64)READ_ONCE(priv->tx_coal_usecs) * 1000;
>>> +
>>> +    /* Clamp stored timestamp in case we migrated to a CPU with a behind
>>> +     * sched_clock(); tries to reduce late BQL flushes.
>>> +     */
>>> +    state->time = min(state->time, sched_clock());
>>> +
>>> +    /* Flush completions that timed out since the previous NAPI poll. */
>>> +    veth_bql_maybe_complete(state, peer_txq, bql_flush_ns);
>>
>>>       for (i = 0; i < budget; i++) {
>>>           void *ptr = __ptr_ring_consume(&rq->xdp_ring);
>>> @@ -1000,12 +1101,11 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
>>>               }
>>>           } else {
>>>               /* ndo_start_xmit */
>>> -            bool bql_charged = veth_ptr_is_bql(ptr);
>>>               struct sk_buff *skb = veth_ptr_to_skb(ptr);
>>>   +            if (veth_ptr_is_bql(ptr))
>>> +                state->n_bql++;
>>>               stats->xdp_bytes += skb->len;
>>> -            if (peer_txq && bql_charged)
>>> -                netdev_tx_completed_queue(peer_txq, 1, VETH_BQL_UNIT);
>>>                 skb = veth_xdp_rcv_skb(rq, skb, bq, stats);
>>>               if (skb) {
>>> @@ -1015,6 +1115,7 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
>>>                       napi_gro_receive(&rq->xdp_napi, skb);
>>>               }
>>>           }
>>> +        veth_bql_maybe_complete(state, peer_txq, bql_flush_ns);
>>>           done++;
>>
>> Sashiko-Nipa reports:
>>
>> "If veth_xdp_rcv() finishes and returns a done count less than the budget,
>> NAPI will go to sleep in veth_poll(). Do we need to unconditionally flush
>> any stranded BQL completions in veth_poll() before sleeping?
>> If completions are left in rq->bql_state indefinitely across NAPI idle
>> periods, it might present an artificially massive delay to DQL. This could
>> cause DQL to mistakenly conclude the hardware is extremely slow and
>> aggressively shrink dql.limit to its minimum, crippling throughput on
>> subsequent bursts."
>>
>> Again the issue that I found to be non-problematic in [1] and can be
>> seen by a BQL inflight > 0 when for example pktgen suddenly stops.
>>
>> If we would "unconditionally flush any stranded BQL completions in
>> veth_poll() before sleeping" we would *not* accumulate BQL completions
>> across NAPI polls but we want to do that.
>>
>> Do you agree?
>>
>> [1] https://lore.kernel.org/netdev/c8650d3a-e488-4279-b28f-549d766c23a1@tu-dortmund.de/
> 

^ permalink raw reply

* [PATCH net] net/tls: Consume empty data records in tls_sw_read_sock()
From: Chuck Lever @ 2026-06-30 19:15 UTC (permalink / raw)
  To: john.fastabend, kuba, sd; +Cc: davem, edumazet, pabeni, horms, netdev

A peer may send a zero-length TLS application_data record; TLS 1.3
explicitly permits these as a traffic-analysis countermeasure (RFC
8446, Section 5.1). After decryption such a record has full_len ==
0. tls_sw_read_sock() hands it to the read_actor, which has no
payload to consume and returns zero. The loop treats a zero return
as backpressure (used <= 0), requeues the skb at the head of
rx_list, and stops. rx_list is serviced head-first on the next
call, so the empty record is dequeued, fails the same way, and is
requeued again; every later record on the connection is blocked
behind it.

tls_sw_recvmsg() does not stall on this: a zero-length data record
copies nothing and falls through to consume_skb(). Mirror that in
the read_sock() path by recognizing an empty data record before
the actor runs, consuming it, and continuing.

Fixes: 662fbcec32f4 ("net/tls: implement ->read_sock()")
Signed-off-by: Chuck Lever <cel@kernel.org>
---
 net/tls/tls_sw.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 9324e4ed20a3..d4afc90fd796 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -2115,6 +2115,17 @@ int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc,
 			goto read_sock_requeue;
 		}
 
+		/* An empty data record (legal in TLS 1.3) gives a zero
+		 * read_actor return, indistinguishable from the consumer
+		 * stalling; the used <= 0 path would requeue it at the
+		 * head of rx_list and block all later records. Consume it
+		 * here instead.
+		 */
+		if (rxm->full_len == 0) {
+			consume_skb(skb);
+			continue;
+		}
+
 		used = read_actor(desc, skb, rxm->offset, rxm->full_len);
 		if (used <= 0) {
 			if (!copied)
-- 
2.54.0


^ permalink raw reply related

* [RFC PATCH bpf-next v1 0/7] xdp: RX checksum metadata hint and checksum assertion over redirect
From: Vladimir Vdovin @ 2026-06-30 19:15 UTC (permalink / raw)
  To: bpf, netdev
  Cc: ast, daniel, andrii, martin.lau, sdf, hawk, john.fastabend, kuba,
	Vladimir Vdovin

This series lets XDP programs work with the hardware RX checksum verdict:
read what the NIC concluded about a packet, and carry a "the L4 checksum
is correct" assertion across a redirect so the stack does not revalidate
it in software.

When an XDP program redirects a frame to a cpumap (or any other path that
rebuilds an skb from an xdp_frame via __xdp_build_skb_from_frame()), the
HW RX checksum status is lost and the stack revalidates the L4 checksum in
software.

Two kfuncs are added:

 - bpf_xdp_metadata_rx_csum(): a device-bound RX-metadata hint, like the
   existing rx_hash / rx_vlan_tag ones.  It reports enum xdp_csum_status
   (XDP_CSUM_NONE / XDP_CSUM_VERIFIED) and is implemented for mlx5e, ice
   and veth.

 - bpf_xdp_assert_rx_csum(): a generic, non-device-bound kfunc that lets
   the program assert the L4 checksum is correct.  It sets a buff flag
   that rides into the xdp_frame, and __xdp_build_skb_from_frame() turns
   it into skb->ip_summed = CHECKSUM_UNNECESSARY.  The kernel cannot
   verify the assertion; the program takes responsibility, as it already
   does when rewriting packet contents.

Posted as RFC to get feedback on:

 - whether the read hint (bpf_xdp_metadata_rx_csum() and its driver
   support) belongs in this series at all.  bpf_xdp_assert_rx_csum() is
   self-contained and already covers the main use case: a program that
   computes or fixes the L4 checksum itself, or trusts the source, and
   wants the rebuilt skb to skip software revalidation.  The read hint is
   an optimization for programs that did not touch the payload and only
   want to relay the hardware verdict.  These could just as well be two
   independent series (assert-only first);
 - the kfunc naming, bpf_xdp_assert_rx_csum() in particular.

Testing:

 - new selftest xdp_cpumap_rx_csum drives a frame through a native-XDP
   veth into a cpumap redirect and checks, via fexit on
   __xdp_build_skb_from_frame(), that the rebuilt skb is
   CHECKSUM_UNNECESSARY iff the program called bpf_xdp_assert_rx_csum();
 - xdp_metadata calls bpf_xdp_metadata_rx_csum() over veth and checks both
   verdicts: XDP_CSUM_NONE for an AF_XDP-injected frame and
   XDP_CSUM_VERIFIED for one sent through the stack.

Vladimir Vdovin (7):
  xdp: let XDP programs assert the RX checksum over redirect
  selftests/bpf: add test for bpf_xdp_assert_rx_csum over cpumap
  xdp: add bpf_xdp_metadata_rx_csum() RX metadata kfunc
  net/mlx5e: support the rx_csum XDP metadata hint
  ice: support the rx_csum XDP metadata hint
  veth: support the rx_csum XDP metadata hint
  selftests/bpf: cover bpf_xdp_metadata_rx_csum in xdp_metadata

 Documentation/netlink/specs/netdev.yaml       |   5 +
 drivers/net/ethernet/intel/ice/ice_txrx_lib.c |  32 ++++
 .../net/ethernet/mellanox/mlx5/core/en/xdp.c  |  23 +++
 drivers/net/veth.c                            |  23 +++
 include/net/xdp.h                             |  23 +++
 include/uapi/linux/netdev.h                   |   3 +
 net/core/xdp.c                                |  73 ++++++++-
 tools/include/uapi/linux/netdev.h             |   3 +
 .../bpf/prog_tests/xdp_cpumap_rx_csum.c       | 150 ++++++++++++++++++
 .../selftests/bpf/prog_tests/xdp_metadata.c   |  10 ++
 .../selftests/bpf/progs/bpf_tracing_net.h     |   1 +
 .../bpf/progs/test_xdp_cpumap_rx_csum.c       |  51 ++++++
 .../selftests/bpf/progs/xdp_metadata.c        |   9 ++
 tools/testing/selftests/bpf/xdp_metadata.h    |   8 +
 14 files changed, 412 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/xdp_cpumap_rx_csum.c
 create mode 100644 tools/testing/selftests/bpf/progs/test_xdp_cpumap_rx_csum.c


base-commit: f456c1922c49e6be5ce407ddb74a6e61af5b65cf
-- 
2.47.0


^ permalink raw reply

* [RFC PATCH bpf-next v1 1/7] xdp: let XDP programs assert the RX checksum over redirect
From: Vladimir Vdovin @ 2026-06-30 19:15 UTC (permalink / raw)
  To: bpf, netdev
  Cc: ast, daniel, andrii, martin.lau, sdf, hawk, john.fastabend, kuba,
	Vladimir Vdovin
In-Reply-To: <20260630191510.81402-1-deliran@verdict.gg>

When an XDP program redirects a frame to a cpumap (or any other path
that rebuilds an skb from an xdp_frame via __xdp_build_skb_from_frame()),
the HW RX checksum status is lost and the stack revalidates the L4
checksum in software.

Add a non-dev-bound kfunc, bpf_xdp_assert_rx_csum(), that lets the program
assert the L4 checksum is correct.  It sets XDP_FLAGS_RX_CSUM_UNNECESSARY
on the buffer; the flag rides into the xdp_frame and
__xdp_build_skb_from_frame() turns it into skb->ip_summed =
CHECKSUM_UNNECESSARY.  The kernel cannot verify the assertion, the program
takes responsibility, the same way it is already trusted to rewrite
arbitrary packet contents.

Signed-off-by: Vladimir Vdovin <deliran@verdict.gg>
---
 include/net/xdp.h | 11 +++++++++++
 net/core/xdp.c    | 50 +++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 59 insertions(+), 2 deletions(-)

diff --git a/include/net/xdp.h b/include/net/xdp.h
index aa742f413c35..5a1e2cc9c312 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -81,6 +81,11 @@ enum xdp_buff_flags {
 	 * XDP program is not attached.
 	 */
 	XDP_FLAGS_FRAGS_UNREADABLE	= BIT(2),
+	/* XDP program asserts the L4 checksum is correct, so the skb built
+	 * out of this frame (e.g. on the cpumap redirect path) can be marked
+	 * CHECKSUM_UNNECESSARY instead of being validated in software.
+	 */
+	XDP_FLAGS_RX_CSUM_UNNECESSARY	= BIT(3),
 };
 
 struct xdp_buff {
@@ -316,6 +321,12 @@ xdp_frame_get_skb_flags(const struct xdp_frame *frame)
 	return frame->flags;
 }
 
+static __always_inline bool
+xdp_frame_rx_csum_unnecessary(const struct xdp_frame *frame)
+{
+	return !!(frame->flags & XDP_FLAGS_RX_CSUM_UNNECESSARY);
+}
+
 #define XDP_BULK_QUEUE_SIZE	16
 struct xdp_frame_bulk {
 	int count;
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 9890a30584ba..63ee36ec93de 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -830,8 +830,11 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
 	/* Essential SKB info: protocol and skb->dev */
 	skb->protocol = eth_type_trans(skb, dev);
 
+	/* HW checksum info, if the XDP program asserted it */
+	if (xdp_frame_rx_csum_unnecessary(xdpf))
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+
 	/* Optional SKB info, currently missing:
-	 * - HW checksum info		(skb->ip_summed)
 	 * - HW RX hash			(skb_set_hash)
 	 * - RX ring dev queue index	(skb_record_rx_queue)
 	 */
@@ -961,6 +964,31 @@ __bpf_kfunc int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx,
 	return -EOPNOTSUPP;
 }
 
+/**
+ * bpf_xdp_assert_rx_csum - Assert the packet's L4 checksum is correct.
+ * @ctx: XDP context pointer.
+ *
+ * Mark the frame so that an skb later built out of it (e.g. on the cpumap
+ * redirect path, see __xdp_build_skb_from_frame()) is set to
+ * CHECKSUM_UNNECESSARY instead of being validated in software when it enters
+ * the stack.
+ *
+ * This is an assertion made by the XDP program: the kernel cannot verify it.
+ * The program takes responsibility for the checksum being correct, the same
+ * way it is already trusted to rewrite arbitrary packet contents. If the
+ * program modifies L4 data after calling this kfunc the assertion may no
+ * longer hold.
+ *
+ * Return: 0.
+ */
+__bpf_kfunc int bpf_xdp_assert_rx_csum(struct xdp_md *ctx)
+{
+	struct xdp_buff *xdp = (struct xdp_buff *)ctx;
+
+	xdp->flags |= XDP_FLAGS_RX_CSUM_UNNECESSARY;
+	return 0;
+}
+
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(xdp_metadata_kfunc_ids)
@@ -974,6 +1002,18 @@ static const struct btf_kfunc_id_set xdp_metadata_kfunc_set = {
 	.set   = &xdp_metadata_kfunc_ids,
 };
 
+/* Generic XDP kfuncs that need no driver support and are therefore not
+ * dev-bound (unlike the rx-metadata kfuncs above).
+ */
+BTF_KFUNCS_START(xdp_kfunc_ids)
+BTF_ID_FLAGS(func, bpf_xdp_assert_rx_csum)
+BTF_KFUNCS_END(xdp_kfunc_ids)
+
+static const struct btf_kfunc_id_set xdp_kfunc_set = {
+	.owner = THIS_MODULE,
+	.set   = &xdp_kfunc_ids,
+};
+
 BTF_ID_LIST(xdp_metadata_kfunc_ids_unsorted)
 #define XDP_METADATA_KFUNC(name, _, str, __) BTF_ID(func, str)
 XDP_METADATA_KFUNC_xxx
@@ -992,7 +1032,13 @@ bool bpf_dev_bound_kfunc_id(u32 btf_id)
 
 static int __init xdp_metadata_init(void)
 {
-	return register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &xdp_metadata_kfunc_set);
+	int ret;
+
+	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &xdp_metadata_kfunc_set);
+	if (ret)
+		return ret;
+
+	return register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &xdp_kfunc_set);
 }
 late_initcall(xdp_metadata_init);
 
-- 
2.47.0


^ permalink raw reply related

* [RFC PATCH bpf-next v1 2/7] selftests/bpf: add test for bpf_xdp_assert_rx_csum over cpumap
From: Vladimir Vdovin @ 2026-06-30 19:15 UTC (permalink / raw)
  To: bpf, netdev
  Cc: ast, daniel, andrii, martin.lau, sdf, hawk, john.fastabend, kuba,
	Vladimir Vdovin
In-Reply-To: <20260630191510.81402-1-deliran@verdict.gg>

Drive a frame through a native-XDP veth into a cpumap redirect and
observe, via fexit on __xdp_build_skb_from_frame(), that the rebuilt skb
is CHECKSUM_UNNECESSARY when the program called bpf_xdp_assert_rx_csum()
and CHECKSUM_NONE otherwise.  fexit is used because cpumap GRO would
otherwise normalize ip_summed before any later hook can observe it.

Signed-off-by: Vladimir Vdovin <deliran@verdict.gg>
---
 .../bpf/prog_tests/xdp_cpumap_rx_csum.c       | 150 ++++++++++++++++++
 .../selftests/bpf/progs/bpf_tracing_net.h     |   1 +
 .../bpf/progs/test_xdp_cpumap_rx_csum.c       |  51 ++++++
 3 files changed, 202 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/xdp_cpumap_rx_csum.c
 create mode 100644 tools/testing/selftests/bpf/progs/test_xdp_cpumap_rx_csum.c

diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_rx_csum.c b/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_rx_csum.c
new file mode 100644
index 000000000000..2def92fe1111
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_rx_csum.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <net/if.h>
+#include <linux/if_ether.h>
+#include <linux/if_link.h>
+#include <linux/if_packet.h>
+#include <linux/ipv6.h>
+#include <netinet/in.h>
+#include <netinet/udp.h>
+#include <sys/socket.h>
+
+#include "test_progs.h"
+#include "network_helpers.h"
+#include <bpf/bpf_endian.h>
+#include "test_xdp_cpumap_rx_csum.skel.h"
+
+#define TEST_NS		"xdp_cm_csum_ns"
+#define UDP_TEST_PORT	7777
+
+/* Kernel skb->ip_summed values, not exported to userspace headers. */
+#define CHECKSUM_NONE		0
+#define CHECKSUM_UNNECESSARY	1
+
+struct udp_pkt {
+	struct ethhdr eth;
+	struct ipv6hdr iph;
+	struct udphdr udp;
+	__u8 payload[16];
+} __packed;
+
+static struct udp_pkt pkt = {
+	.eth.h_proto = __bpf_constant_htons(ETH_P_IPV6),
+	.eth.h_dest = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+	.eth.h_source = {0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb},
+	.iph.version = 6,
+	.iph.nexthdr = IPPROTO_UDP,
+	.iph.payload_len = __bpf_constant_htons(sizeof(struct udphdr) + 16),
+	.iph.hop_limit = 64,
+	.udp.source = __bpf_constant_htons(1),
+	.udp.dest = __bpf_constant_htons(UDP_TEST_PORT),
+	.udp.len = __bpf_constant_htons(sizeof(struct udphdr) + 16),
+};
+
+/* Inject one frame on veth0; it is received on veth1 where native XDP
+ * redirects it into the cpumap. Report the ip_summed the rebuilt skb carried.
+ */
+static int inject_and_observe(struct test_xdp_cpumap_rx_csum *skel, int sfd,
+			      int ifindex_src, bool assert_csum, int *ip_summed)
+{
+	struct sockaddr_ll sll = {
+		.sll_family = AF_PACKET,
+		.sll_ifindex = ifindex_src,
+		.sll_halen = 0,
+	};
+	int i, n;
+
+	skel->bss->assert_csum = assert_csum;
+	skel->bss->seen = false;
+	skel->data->observed_ip_summed = -1;
+
+	n = sendto(sfd, &pkt, sizeof(pkt), 0, (void *)&sll, sizeof(sll));
+	if (!ASSERT_EQ(n, sizeof(pkt), "sendto"))
+		return -1;
+
+	/* The skb is built asynchronously by the cpumap kthread. */
+	for (i = 0; i < 20 && !skel->bss->seen; i++)
+		usleep(50000);
+
+	if (!ASSERT_TRUE(skel->bss->seen, "skb built from frame"))
+		return -1;
+
+	*ip_summed = skel->data->observed_ip_summed;
+	return 0;
+}
+
+void test_xdp_cpumap_rx_csum(void)
+{
+	struct test_xdp_cpumap_rx_csum *skel = NULL;
+	struct bpf_cpumap_val val = { .qsize = 192 };
+	struct bpf_link *fexit_link = NULL;
+	struct nstoken *nstoken = NULL;
+	int err, map_fd, ifindex_dst = 0, ifindex_src, sfd = -1, ip_summed;
+	bool xdp_attached = false;
+	__u32 idx = 0;
+
+	SYS(out, "ip netns add %s", TEST_NS);
+	nstoken = open_netns(TEST_NS);
+	if (!ASSERT_OK_PTR(nstoken, "open_netns"))
+		goto out;
+
+	/* veth pair: a frame TX'd on veth0 is RX'd on veth1. */
+	SYS(out, "ip link add veth0 type veth peer name veth1");
+	SYS(out, "ip link set veth0 up");
+	SYS(out, "ip link set veth1 up");
+
+	skel = test_xdp_cpumap_rx_csum__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "skel open_and_load"))
+		goto out;
+
+	/* cpumap entry without a program: a plain redirect that forces the
+	 * frame->skb conversion in __xdp_build_skb_from_frame().
+	 */
+	map_fd = bpf_map__fd(skel->maps.cpu_map);
+	err = bpf_map_update_elem(map_fd, &idx, &val, 0);
+	if (!ASSERT_OK(err, "cpumap update"))
+		goto out;
+
+	ifindex_dst = if_nametoindex("veth1");
+	ifindex_src = if_nametoindex("veth0");
+	if (!ASSERT_GT(ifindex_dst, 0, "veth1 ifindex") ||
+	    !ASSERT_GT(ifindex_src, 0, "veth0 ifindex"))
+		goto out;
+
+	/* Native XDP so the redirect goes through xdp_convert_buff_to_frame(),
+	 * which propagates the rx-csum flag into the frame. Generic mode would
+	 * redirect a ready-made skb and never hit our code path.
+	 */
+	err = bpf_xdp_attach(ifindex_dst, bpf_program__fd(skel->progs.xdp_redir),
+			     XDP_FLAGS_DRV_MODE, NULL);
+	if (!ASSERT_OK(err, "attach native xdp"))
+		goto out;
+	xdp_attached = true;
+
+	fexit_link = bpf_program__attach(skel->progs.on_build);
+	if (!ASSERT_OK_PTR(fexit_link, "attach fexit"))
+		goto out;
+
+	sfd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
+	if (!ASSERT_GE(sfd, 0, "AF_PACKET socket"))
+		goto out;
+
+	/* Program asserts the checksum -> CHECKSUM_UNNECESSARY. */
+	if (!inject_and_observe(skel, sfd, ifindex_src, true, &ip_summed))
+		ASSERT_EQ(ip_summed, CHECKSUM_UNNECESSARY,
+			  "ip_summed marked unnecessary");
+
+	/* No assertion -> skb is left CHECKSUM_NONE for the stack to validate. */
+	if (!inject_and_observe(skel, sfd, ifindex_src, false, &ip_summed))
+		ASSERT_EQ(ip_summed, CHECKSUM_NONE, "ip_summed left none");
+
+out:
+	if (sfd >= 0)
+		close(sfd);
+	bpf_link__destroy(fexit_link);
+	if (xdp_attached)
+		bpf_xdp_detach(ifindex_dst, XDP_FLAGS_DRV_MODE, NULL);
+	test_xdp_cpumap_rx_csum__destroy(skel);
+	if (nstoken)
+		close_netns(nstoken);
+	SYS_NOFAIL("ip netns del %s", TEST_NS);
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h
index d8dacef37c16..c3a0b2696035 100644
--- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h
+++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h
@@ -87,6 +87,7 @@
 #define TCPOLEN_SACK_PERM	2
 
 #define CHECKSUM_NONE		0
+#define CHECKSUM_UNNECESSARY	1
 #define CHECKSUM_PARTIAL	3
 
 #define IFNAMSIZ		16
diff --git a/tools/testing/selftests/bpf/progs/test_xdp_cpumap_rx_csum.c b/tools/testing/selftests/bpf/progs/test_xdp_cpumap_rx_csum.c
new file mode 100644
index 000000000000..86c691887d25
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_xdp_cpumap_rx_csum.c
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "vmlinux.h"
+#include "bpf_tracing_net.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_endian.h>
+
+extern int bpf_xdp_assert_rx_csum(struct xdp_md *ctx) __ksym;
+
+struct {
+	__uint(type, BPF_MAP_TYPE_CPUMAP);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(struct bpf_cpumap_val));
+	__uint(max_entries, 1);
+} cpu_map SEC(".maps");
+
+/* Set from userspace before injecting each packet. */
+bool assert_csum = false;
+
+/* Filled in by the fexit program when the cpumap skb is built. */
+bool seen = false;
+int observed_ip_summed = -1;
+
+SEC("xdp")
+int xdp_redir(struct xdp_md *ctx)
+{
+	/* Assert the L4 checksum so the skb built on the cpumap redirect
+	 * path is marked CHECKSUM_UNNECESSARY instead of validated in software.
+	 */
+	if (assert_csum)
+		bpf_xdp_assert_rx_csum(ctx);
+
+	return bpf_redirect_map(&cpu_map, 0, 0);
+}
+
+/* Observe ip_summed exactly as __xdp_build_skb_from_frame() leaves it, before
+ * GRO in the cpumap kthread can normalize it. tc-ingress would be too late:
+ * GRO software-validates a CHECKSUM_NONE skb and marks it UNNECESSARY anyway.
+ */
+SEC("fexit/__xdp_build_skb_from_frame")
+int BPF_PROG(on_build, struct xdp_frame *xdpf, struct sk_buff *skb,
+	     struct net_device *dev, struct sk_buff *ret)
+{
+	if (ret && ret->protocol == bpf_htons(ETH_P_IPV6)) {
+		observed_ip_summed = ret->ip_summed;
+		seen = true;
+	}
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
-- 
2.47.0


^ permalink raw reply related

* [RFC PATCH bpf-next v1 3/7] xdp: add bpf_xdp_metadata_rx_csum() RX metadata kfunc
From: Vladimir Vdovin @ 2026-06-30 19:15 UTC (permalink / raw)
  To: bpf, netdev
  Cc: ast, daniel, andrii, martin.lau, sdf, hawk, john.fastabend, kuba,
	Vladimir Vdovin
In-Reply-To: <20260630191510.81402-1-deliran@verdict.gg>

Add a device-bound RX-metadata kfunc that reports the hardware
checksum verdict (enum xdp_csum_status: XDP_CSUM_NONE / XDP_CSUM_VERIFIED)
through a new xmo_rx_csum operation, so an XDP program can make an
informed decision (e.g. call bpf_xdp_assert_rx_csum()) instead of trusting
blindly.  Wire it into the XDP_METADATA_KFUNC machinery and advertise it
via NETDEV_XDP_RX_METADATA_CSUM.

Signed-off-by: Vladimir Vdovin <deliran@verdict.gg>
---
 Documentation/netlink/specs/netdev.yaml |  5 +++++
 include/net/xdp.h                       | 12 ++++++++++++
 include/uapi/linux/netdev.h             |  3 +++
 net/core/xdp.c                          | 23 +++++++++++++++++++++++
 tools/include/uapi/linux/netdev.h       |  3 +++
 5 files changed, 46 insertions(+)

diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
index 5f143da7458c..86017f7402d9 100644
--- a/Documentation/netlink/specs/netdev.yaml
+++ b/Documentation/netlink/specs/netdev.yaml
@@ -61,6 +61,11 @@ definitions:
         doc: |
           Device is capable of exposing receive packet VLAN tag via
           bpf_xdp_metadata_rx_vlan_tag().
+      -
+        name: csum
+        doc: |
+          Device is capable of exposing receive packet checksum status via
+          bpf_xdp_metadata_rx_csum().
   -
     type: flags
     name: xsk-flags
diff --git a/include/net/xdp.h b/include/net/xdp.h
index 5a1e2cc9c312..40f6fba41962 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -597,6 +597,10 @@ void xdp_attachment_setup(struct xdp_attachment_info *info,
 			   NETDEV_XDP_RX_METADATA_VLAN_TAG, \
 			   bpf_xdp_metadata_rx_vlan_tag, \
 			   xmo_rx_vlan_tag) \
+	XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_CSUM, \
+			   NETDEV_XDP_RX_METADATA_CSUM, \
+			   bpf_xdp_metadata_rx_csum, \
+			   xmo_rx_csum) \
 
 enum xdp_rx_metadata {
 #define XDP_METADATA_KFUNC(name, _, __, ___) name,
@@ -654,12 +658,20 @@ enum xdp_rss_hash_type {
 	XDP_RSS_TYPE_L4_IPV6_SCTP_EX = XDP_RSS_TYPE_L4_IPV6_SCTP | XDP_RSS_L3_DYNHDR,
 };
 
+/* Checksum status reported by bpf_xdp_metadata_rx_csum(). */
+enum xdp_csum_status {
+	XDP_CSUM_NONE = 0,	/* HW did not validate the checksum */
+	XDP_CSUM_VERIFIED,	/* HW validated the L4 checksum; it is correct */
+};
+
 struct xdp_metadata_ops {
 	int	(*xmo_rx_timestamp)(const struct xdp_md *ctx, u64 *timestamp);
 	int	(*xmo_rx_hash)(const struct xdp_md *ctx, u32 *hash,
 			       enum xdp_rss_hash_type *rss_type);
 	int	(*xmo_rx_vlan_tag)(const struct xdp_md *ctx, __be16 *vlan_proto,
 				   u16 *vlan_tci);
+	int	(*xmo_rx_csum)(const struct xdp_md *ctx,
+			       enum xdp_csum_status *csum_status);
 };
 
 #ifdef CONFIG_NET
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index 2f3ab75e8cc0..99cda716f0ee 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -47,11 +47,14 @@ enum netdev_xdp_act {
  *   hash via bpf_xdp_metadata_rx_hash().
  * @NETDEV_XDP_RX_METADATA_VLAN_TAG: Device is capable of exposing receive
  *   packet VLAN tag via bpf_xdp_metadata_rx_vlan_tag().
+ * @NETDEV_XDP_RX_METADATA_CSUM: Device is capable of exposing receive packet
+ *   checksum status via bpf_xdp_metadata_rx_csum().
  */
 enum netdev_xdp_rx_metadata {
 	NETDEV_XDP_RX_METADATA_TIMESTAMP = 1,
 	NETDEV_XDP_RX_METADATA_HASH = 2,
 	NETDEV_XDP_RX_METADATA_VLAN_TAG = 4,
+	NETDEV_XDP_RX_METADATA_CSUM = 8,
 };
 
 /**
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 63ee36ec93de..7f4b5c6f7c87 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -964,6 +964,29 @@ __bpf_kfunc int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx,
 	return -EOPNOTSUPP;
 }
 
+/**
+ * bpf_xdp_metadata_rx_csum - Read the device's RX checksum verdict.
+ * @ctx: XDP context pointer.
+ * @csum_status: Destination pointer for the checksum status.
+ *
+ * Report what the hardware concluded about the packet's checksum, so the
+ * program can decide whether to assert it (e.g. via bpf_xdp_assert_rx_csum()
+ * before a cpumap redirect) instead of having the stack validate it again.
+ *
+ * On ``XDP_CSUM_VERIFIED`` the device has checked the L4 checksum and it is
+ * correct. ``XDP_CSUM_NONE`` means the device did not validate it.
+ *
+ * Return:
+ * * Returns 0 on success or ``-errno`` on error.
+ * * ``-EOPNOTSUPP`` : device driver doesn't implement kfunc
+ * * ``-ENODATA``    : checksum information is not available
+ */
+__bpf_kfunc int bpf_xdp_metadata_rx_csum(const struct xdp_md *ctx,
+					 enum xdp_csum_status *csum_status)
+{
+	return -EOPNOTSUPP;
+}
+
 /**
  * bpf_xdp_assert_rx_csum - Assert the packet's L4 checksum is correct.
  * @ctx: XDP context pointer.
diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
index 2f3ab75e8cc0..99cda716f0ee 100644
--- a/tools/include/uapi/linux/netdev.h
+++ b/tools/include/uapi/linux/netdev.h
@@ -47,11 +47,14 @@ enum netdev_xdp_act {
  *   hash via bpf_xdp_metadata_rx_hash().
  * @NETDEV_XDP_RX_METADATA_VLAN_TAG: Device is capable of exposing receive
  *   packet VLAN tag via bpf_xdp_metadata_rx_vlan_tag().
+ * @NETDEV_XDP_RX_METADATA_CSUM: Device is capable of exposing receive packet
+ *   checksum status via bpf_xdp_metadata_rx_csum().
  */
 enum netdev_xdp_rx_metadata {
 	NETDEV_XDP_RX_METADATA_TIMESTAMP = 1,
 	NETDEV_XDP_RX_METADATA_HASH = 2,
 	NETDEV_XDP_RX_METADATA_VLAN_TAG = 4,
+	NETDEV_XDP_RX_METADATA_CSUM = 8,
 };
 
 /**
-- 
2.47.0


^ permalink raw reply related

* [RFC PATCH bpf-next v1 4/7] net/mlx5e: support the rx_csum XDP metadata hint
From: Vladimir Vdovin @ 2026-06-30 19:15 UTC (permalink / raw)
  To: bpf, netdev
  Cc: ast, daniel, andrii, martin.lau, sdf, hawk, john.fastabend, kuba,
	Vladimir Vdovin
In-Reply-To: <20260630191510.81402-1-deliran@verdict.gg>

Implement xmo_rx_csum by reading CQE_L3_OK/CQE_L4_OK, mirroring the
verdict mlx5e_handle_csum() uses for CHECKSUM_UNNECESSARY.
CHECKSUM_COMPLETE is intentionally not surfaced: it is already
disabled while an XDP program is loaded.

Signed-off-by: Vladimir Vdovin <deliran@verdict.gg>
---
 .../net/ethernet/mellanox/mlx5/core/en/xdp.c  | 23 +++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
index d8c7cb8837d7..6ac06bd24c79 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
@@ -277,10 +277,33 @@ static int mlx5e_xdp_rx_vlan_tag(const struct xdp_md *ctx, __be16 *vlan_proto,
 	return 0;
 }
 
+static int mlx5e_xdp_rx_csum(const struct xdp_md *ctx,
+			     enum xdp_csum_status *csum_status)
+{
+	const struct mlx5e_xdp_buff *_ctx = (void *)ctx;
+	const struct mlx5_cqe64 *cqe = _ctx->cqe;
+
+	if (unlikely(!(_ctx->xdp.rxq->dev->features & NETIF_F_RXCSUM)))
+		return -ENODATA;
+
+	/* Same verdict the normal RX path uses for CHECKSUM_UNNECESSARY.
+	 * CHECKSUM_COMPLETE is deliberately not surfaced here: it is disabled
+	 * while an XDP program is loaded (see mlx5e_handle_csum()).
+	 */
+	if (likely((cqe->hds_ip_ext & CQE_L3_OK) &&
+		   (cqe->hds_ip_ext & CQE_L4_OK)))
+		*csum_status = XDP_CSUM_VERIFIED;
+	else
+		*csum_status = XDP_CSUM_NONE;
+
+	return 0;
+}
+
 const struct xdp_metadata_ops mlx5e_xdp_metadata_ops = {
 	.xmo_rx_timestamp		= mlx5e_xdp_rx_timestamp,
 	.xmo_rx_hash			= mlx5e_xdp_rx_hash,
 	.xmo_rx_vlan_tag		= mlx5e_xdp_rx_vlan_tag,
+	.xmo_rx_csum			= mlx5e_xdp_rx_csum,
 };
 
 struct mlx5e_xsk_tx_complete {
-- 
2.47.0


^ permalink raw reply related

* [RFC PATCH bpf-next v1 5/7] ice: support the rx_csum XDP metadata hint
From: Vladimir Vdovin @ 2026-06-30 19:15 UTC (permalink / raw)
  To: bpf, netdev
  Cc: ast, daniel, andrii, martin.lau, sdf, hawk, john.fastabend, kuba,
	Vladimir Vdovin
In-Reply-To: <20260630191510.81402-1-deliran@verdict.gg>

Implement xmo_rx_csum from the Rx flex descriptor status0 bits
(L3L4P set and no XSUM_L4E), mirroring ice_rx_csum().  Return -ENODATA
when RX checksum offload (NETIF_F_RXCSUM) is disabled, since the status
bits are not meaningful then.

Signed-off-by: Vladimir Vdovin <deliran@verdict.gg>
---
 drivers/net/ethernet/intel/ice/ice_txrx_lib.c | 32 +++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
index e695a664e53d..d13c5e76bc13 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
@@ -594,8 +594,40 @@ static int ice_xdp_rx_vlan_tag(const struct xdp_md *ctx, __be16 *vlan_proto,
 	return 0;
 }
 
+/**
+ * ice_xdp_rx_csum - RX checksum XDP hint handler
+ * @ctx: XDP buff pointer
+ * @csum_status: destination for the checksum verdict
+ *
+ * Report whether the hardware validated the packet's L4 checksum, mirroring
+ * the verdict ice_rx_csum() uses for CHECKSUM_UNNECESSARY.  Return -ENODATA
+ * when RX checksum offload is disabled, since the status bits are not
+ * meaningful then.
+ */
+static int ice_xdp_rx_csum(const struct xdp_md *ctx,
+			   enum xdp_csum_status *csum_status)
+{
+	const struct libeth_xdp_buff *xdp_ext = (void *)ctx;
+	struct ice_rx_ring *rx_ring;
+	u16 status0;
+
+	rx_ring = libeth_xdp_buff_to_rq(xdp_ext, typeof(*rx_ring), xdp_rxq);
+	if (!(rx_ring->netdev->features & NETIF_F_RXCSUM))
+		return -ENODATA;
+
+	status0 = le16_to_cpu(xdp_ext->desc->wb.status_error0);
+	if ((status0 & BIT(ICE_RX_FLEX_DESC_STATUS0_L3L4P_S)) &&
+	    !(status0 & BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_L4E_S)))
+		*csum_status = XDP_CSUM_VERIFIED;
+	else
+		*csum_status = XDP_CSUM_NONE;
+
+	return 0;
+}
+
 const struct xdp_metadata_ops ice_xdp_md_ops = {
 	.xmo_rx_timestamp		= ice_xdp_rx_hw_ts,
 	.xmo_rx_hash			= ice_xdp_rx_hash,
 	.xmo_rx_vlan_tag		= ice_xdp_rx_vlan_tag,
+	.xmo_rx_csum			= ice_xdp_rx_csum,
 };
-- 
2.47.0


^ permalink raw reply related

* [RFC PATCH bpf-next v1 6/7] veth: support the rx_csum XDP metadata hint
From: Vladimir Vdovin @ 2026-06-30 19:15 UTC (permalink / raw)
  To: bpf, netdev
  Cc: ast, daniel, andrii, martin.lau, sdf, hawk, john.fastabend, kuba,
	Vladimir Vdovin
In-Reply-To: <20260630191510.81402-1-deliran@verdict.gg>

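Implement xmo_rx_csum from skb->ip_summed.  veth has no real hardware;
this surfaces whatever checksum verdict the skb already carries and makes
the metadata kfunc testable without a NIC.  CHECKSUM_UNNECESSARY and
CHECKSUM_PARTIAL are reported as XDP_CSUM_VERIFIED, any other value as
XDP_CSUM_NONE; -ENODATA is returned when no skb is attached to the XDP
buffer.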

Signed-off-by: Vladimir Vdovin <deliran@verdict.gg>
---
 drivers/net/veth.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 1c5142149175..b7bc5a3b07e5 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -1700,6 +1700,28 @@ static int veth_xdp_rx_vlan_tag(const struct xdp_md *ctx, __be16 *vlan_proto,
 	return err;
 }
 
+static int veth_xdp_rx_csum(const struct xdp_md *ctx,
+			    enum xdp_csum_status *csum_status)
+{
+	const struct veth_xdp_buff *_ctx = (void *)ctx;
+	const struct sk_buff *skb = _ctx->skb;
+
+	if (!skb)
+		return -ENODATA;
+
+	/* veth has no real hardware; surface whatever checksum verdict the
+	 * skb already carries (e.g. CHECKSUM_PARTIAL/UNNECESSARY from a local
+	 * sender or a previous validation).
+	 */
+	if (skb->ip_summed == CHECKSUM_UNNECESSARY ||
+	    skb->ip_summed == CHECKSUM_PARTIAL)
+		*csum_status = XDP_CSUM_VERIFIED;
+	else
+		*csum_status = XDP_CSUM_NONE;
+
+	return 0;
+}
+
 static const struct net_device_ops veth_netdev_ops = {
 	.ndo_init            = veth_dev_init,
 	.ndo_open            = veth_open,
@@ -1725,6 +1747,7 @@ static const struct xdp_metadata_ops veth_xdp_metadata_ops = {
 	.xmo_rx_timestamp		= veth_xdp_rx_timestamp,
 	.xmo_rx_hash			= veth_xdp_rx_hash,
 	.xmo_rx_vlan_tag		= veth_xdp_rx_vlan_tag,
+	.xmo_rx_csum			= veth_xdp_rx_csum,
 };
 
 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
-- 
2.47.0


^ permalink raw reply related

* [RFC PATCH bpf-next v1 7/7] selftests/bpf: cover bpf_xdp_metadata_rx_csum in xdp_metadata
From: Vladimir Vdovin @ 2026-06-30 19:15 UTC (permalink / raw)
  To: bpf, netdev
  Cc: ast, daniel, andrii, martin.lau, sdf, hawk, john.fastabend, kuba,
	Vladimir Vdovin
In-Reply-To: <20260630191510.81402-1-deliran@verdict.gg>

Call bpf_xdp_metadata_rx_csum() in the xdp_metadata program and export the
status to userspace.  veth surfaces skb->ip_summed: a frame injected via
AF_XDP carries no checksum context (XDP_CSUM_NONE), while one sent through
the stack is CHECKSUM_PARTIAL (XDP_CSUM_VERIFIED).  Assert both cases in
verify_xsk_metadata().

Signed-off-by: Vladimir Vdovin <deliran@verdict.gg>
---
 tools/testing/selftests/bpf/prog_tests/xdp_metadata.c | 10 ++++++++++
 tools/testing/selftests/bpf/progs/xdp_metadata.c      |  9 +++++++++
 tools/testing/selftests/bpf/xdp_metadata.h            |  8 ++++++++
 3 files changed, 27 insertions(+)

diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c
index 5c31054ad4a4..77f55696eb78 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c
@@ -310,6 +310,16 @@ static int verify_xsk_metadata(struct xsk *xsk, bool sent_from_af_xdp)
 	if (!ASSERT_NEQ(meta->rx_hash, 0, "rx_hash"))
 		return -1;
 
+	/* veth surfaces the checksum verdict from skb->ip_summed.  A packet
+	 * injected via AF_XDP carries no checksum context and is CHECKSUM_NONE,
+	 * while one sent through the stack is CHECKSUM_PARTIAL and reads back as
+	 * verified.
+	 */
+	if (!ASSERT_EQ(meta->rx_csum_status,
+		       sent_from_af_xdp ? XDP_META_CSUM_NONE : XDP_META_CSUM_VERIFIED,
+		       "rx_csum_status"))
+		return -1;
+
 	if (!sent_from_af_xdp) {
 		if (!ASSERT_NEQ(meta->rx_hash_type & XDP_RSS_TYPE_L4, 0, "rx_hash_type"))
 			return -1;
diff --git a/tools/testing/selftests/bpf/progs/xdp_metadata.c b/tools/testing/selftests/bpf/progs/xdp_metadata.c
index 09bb8a038d52..0089c6c5a2e4 100644
--- a/tools/testing/selftests/bpf/progs/xdp_metadata.c
+++ b/tools/testing/selftests/bpf/progs/xdp_metadata.c
@@ -33,6 +33,8 @@ extern int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, __u32 *hash,
 extern int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx,
 					__be16 *vlan_proto,
 					__u16 *vlan_tci) __ksym;
+extern int bpf_xdp_metadata_rx_csum(const struct xdp_md *ctx,
+				    enum xdp_csum_status *csum_status) __ksym;
 
 SEC("xdp")
 int rx(struct xdp_md *ctx)
@@ -43,6 +45,7 @@ int rx(struct xdp_md *ctx)
 	struct udphdr *udp = NULL;
 	struct iphdr *iph = NULL;
 	struct xdp_meta *meta;
+	enum xdp_csum_status csum_status;
 	u64 timestamp = -1;
 	int ret;
 
@@ -99,6 +102,12 @@ int rx(struct xdp_md *ctx)
 	bpf_xdp_metadata_rx_vlan_tag(ctx, &meta->rx_vlan_proto,
 				     &meta->rx_vlan_tci);
 
+	ret = bpf_xdp_metadata_rx_csum(ctx, &csum_status);
+	if (ret < 0)
+		meta->rx_csum_err = ret;
+	else
+		meta->rx_csum_status = csum_status;
+
 	return bpf_redirect_map(&xsk, ctx->rx_queue_index, XDP_PASS);
 }
 
diff --git a/tools/testing/selftests/bpf/xdp_metadata.h b/tools/testing/selftests/bpf/xdp_metadata.h
index 87318ad1117a..ba1b2902b371 100644
--- a/tools/testing/selftests/bpf/xdp_metadata.h
+++ b/tools/testing/selftests/bpf/xdp_metadata.h
@@ -30,6 +30,10 @@ enum xdp_meta_field {
 	XDP_META_FIELD_VLAN_TAG	= BIT(2),
 };
 
+/* Mirror of enum xdp_csum_status (include/net/xdp.h) for userspace asserts. */
+#define XDP_META_CSUM_NONE	0
+#define XDP_META_CSUM_VERIFIED	1
+
 struct xdp_meta {
 	union {
 		__u64 rx_timestamp;
@@ -48,5 +52,9 @@ struct xdp_meta {
 		};
 		__s32 rx_vlan_tag_err;
 	};
+	union {
+		__u32 rx_csum_status;
+		__s32 rx_csum_err;
+	};
 	enum xdp_meta_field hint_valid;
 };
-- 
2.47.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox