Netdev List
 help / color / mirror / Atom feed
* [syzbot] [wpan?] general protection fault in ieee802154_release_queue
From: syzbot @ 2026-06-29  9:53 UTC (permalink / raw)
  To: alex.aring, davem, edumazet, horms, kuba, linux-kernel,
	linux-wpan, miquel.raynal, netdev, pabeni, stefan, syzkaller-bugs

Hello,

syzbot found the following issue on:

HEAD commit:    b85966adbf5d Merge tag 'net-next-7.2' of git://git.kernel...
git tree:       net-next
console output: https://syzkaller.appspot.com/x/log.txt?x=17ac7046580000
kernel config:  https://syzkaller.appspot.com/x/.config?x=9a9f723a32776544
dashboard link: https://syzkaller.appspot.com/bug?extid=36256deb69a588e9290e
compiler:       Debian clang version 22.1.6 (++20260514074242+fc4aad7b5db3-1~exp1~20260514074407.73), Debian LLD 22.1.6

Unfortunately, I don't have any reproducer for this issue yet.

Downloadable assets:
disk image: https://storage.googleapis.com/syzbot-assets/d65306d96573/disk-b85966ad.raw.xz
vmlinux: https://storage.googleapis.com/syzbot-assets/ef43139aab0e/vmlinux-b85966ad.xz
kernel image: https://storage.googleapis.com/syzbot-assets/26d4d1ab67c3/bzImage-b85966ad.xz

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+36256deb69a588e9290e@syzkaller.appspotmail.com

Oops: general protection fault, probably for non-canonical address 0xfbd59c0000000043: 0000 [#1] SMP KASAN PTI
KASAN: maybe wild-memory-access in range [0xdead000000000218-0xdead00000000021f]
CPU: 1 UID: 0 PID: 15064 Comm: syz.4.2289 Not tainted syzkaller #0 PREEMPT(full) 
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/09/2026
RIP: 0010:ieee802154_wake_queue net/mac802154/util.c:34 [inline]
RIP: 0010:ieee802154_release_queue+0x1b0/0x380 net/mac802154/util.c:83
Code: 42 80 3c 30 00 74 08 4c 89 e7 e8 8b f4 d0 f6 4d 8b 2c 24 4d 39 e5 0f 84 d6 00 00 00 4d 8d bd 18 01 00 00 4c 89 f8 48 c1 e8 03 <42> 80 3c 30 00 74 08 4c 89 ff e8 61 f4 d0 f6 49 8b 2f 48 85 ed 74
RSP: 0018:ffffc90005f3f0d0 EFLAGS: 00010802
RAX: 1bd5a00000000043 RBX: ffff88802a41a760 RCX: 0000000000080000
RDX: ffffc90007f8c000 RSI: 000000000001e208 RDI: 000000000001e209
RBP: ffff88802a43c018 R08: ffffffff903116f7 R09: 1ffffffff20622de
R10: dffffc0000000000 R11: fffffbfff20622df R12: ffff88802a41a770
R13: dead000000000100 R14: dffffc0000000000 R15: dead000000000218
FS:  00007f6a783b06c0(0000) GS:ffff88812537c000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f8924187cc0 CR3: 00000000a6c7a000 CR4: 00000000003526f0
Call Trace:
 <TASK>
 ieee802154_xmit_complete+0x11d/0x290 net/mac802154/util.c:140
 hwsim_hw_xmit+0x1571/0x1620 drivers/net/ieee802154/mac802154_hwsim.c:288
 drv_xmit_async net/mac802154/driver-ops.h:16 [inline]
 ieee802154_tx+0x26d/0x510 net/mac802154/tx.c:89
 ieee802154_hot_tx net/mac802154/tx.c:207 [inline]
 ieee802154_subif_start_xmit+0x110/0x190 net/mac802154/tx.c:239
 __netdev_start_xmit include/linux/netdevice.h:5387 [inline]
 netdev_start_xmit include/linux/netdevice.h:5396 [inline]
 xmit_one net/core/dev.c:3889 [inline]
 dev_hard_start_xmit+0x2cd/0x830 net/core/dev.c:3905
 sch_direct_xmit+0x257/0x4c0 net/sched/sch_generic.c:372
 __dev_xmit_skb net/core/dev.c:4211 [inline]
 __dev_queue_xmit+0x1754/0x37f0 net/core/dev.c:4833
 dev_queue_xmit include/linux/netdevice.h:3436 [inline]
 dgram_sendmsg+0x709/0xe30 net/ieee802154/socket.c:689
 sock_sendmsg_nosec net/socket.c:775 [inline]
 __sock_sendmsg net/socket.c:790 [inline]
 ____sys_sendmsg+0x9b9/0xa20 net/socket.c:2684
 ___sys_sendmsg+0x2a5/0x360 net/socket.c:2738
 __sys_sendmmsg+0x273/0x4d0 net/socket.c:2827
 __do_sys_sendmmsg net/socket.c:2854 [inline]
 __se_sys_sendmmsg net/socket.c:2851 [inline]
 __x64_sys_sendmmsg+0xa0/0xc0 net/socket.c:2851
 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
 do_syscall_64+0x174/0x580 arch/x86/entry/syscall_64.c:94
 entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7f6a7759ce59
Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 e8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007f6a783b0028 EFLAGS: 00000246 ORIG_RAX: 0000000000000133
RAX: ffffffffffffffda RBX: 00007f6a77815fa0 RCX: 00007f6a7759ce59
RDX: 0000000004000050 RSI: 00002000000196c0 RDI: 000000000000000d
RBP: 00007f6a77632e6f R08: 0000000000000000 R09: 0000000000000000
R10: 000000000400c010 R11: 0000000000000246 R12: 0000000000000000
R13: 00007f6a77816038 R14: 00007f6a77815fa0 R15: 00007ffe3e47cbf8
 </TASK>
Modules linked in:
---[ end trace 0000000000000000 ]---
RIP: 0010:ieee802154_wake_queue net/mac802154/util.c:34 [inline]
RIP: 0010:ieee802154_release_queue+0x1b0/0x380 net/mac802154/util.c:83
Code: 42 80 3c 30 00 74 08 4c 89 e7 e8 8b f4 d0 f6 4d 8b 2c 24 4d 39 e5 0f 84 d6 00 00 00 4d 8d bd 18 01 00 00 4c 89 f8 48 c1 e8 03 <42> 80 3c 30 00 74 08 4c 89 ff e8 61 f4 d0 f6 49 8b 2f 48 85 ed 74
RSP: 0018:ffffc90005f3f0d0 EFLAGS: 00010802
RAX: 1bd5a00000000043 RBX: ffff88802a41a760 RCX: 0000000000080000
RDX: ffffc90007f8c000 RSI: 000000000001e208 RDI: 000000000001e209
RBP: ffff88802a43c018 R08: ffffffff903116f7 R09: 1ffffffff20622de
R10: dffffc0000000000 R11: fffffbfff20622df R12: ffff88802a41a770
R13: dead000000000100 R14: dffffc0000000000 R15: dead000000000218
FS:  00007f6a783b06c0(0000) GS:ffff88812537c000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f8924187cc0 CR3: 00000000a6c7a000 CR4: 00000000003526f0
----------------
Code disassembly (best guess):
   0:	42 80 3c 30 00       	cmpb   $0x0,(%rax,%r14,1)
   5:	74 08                	je     0xf
   7:	4c 89 e7             	mov    %r12,%rdi
   a:	e8 8b f4 d0 f6       	call   0xf6d0f49a
   f:	4d 8b 2c 24          	mov    (%r12),%r13
  13:	4d 39 e5             	cmp    %r12,%r13
  16:	0f 84 d6 00 00 00    	je     0xf2
  1c:	4d 8d bd 18 01 00 00 	lea    0x118(%r13),%r15
  23:	4c 89 f8             	mov    %r15,%rax
  26:	48 c1 e8 03          	shr    $0x3,%rax
* 2a:	42 80 3c 30 00       	cmpb   $0x0,(%rax,%r14,1) <-- trapping instruction
  2f:	74 08                	je     0x39
  31:	4c 89 ff             	mov    %r15,%rdi
  34:	e8 61 f4 d0 f6       	call   0xf6d0f49a
  39:	49 8b 2f             	mov    (%r15),%rbp
  3c:	48 85 ed             	test   %rbp,%rbp
  3f:	74                   	.byte 0x74


---
This report is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this issue. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.

If the report is already addressed, let syzbot know by replying with:
#syz fix: exact-commit-title

If you want to overwrite report's subsystems, reply with:
#syz set subsystems: new-subsystem
(See the list of subsystem names on the web dashboard)

If the report is a duplicate of another one, reply with:
#syz dup: exact-subject-of-another-report

If you want to undo deduplication, reply with:
#syz undup

^ permalink raw reply

* [PATCH net 0/3 v2] Fix broken TC_ACT_REDIRECT
From: Jamal Hadi Salim @ 2026-06-29 10:21 UTC (permalink / raw)
  To: netdev
  Cc: jiri, davem, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, toke, Steven Rostedt, Petr Machata,
	Alexei Starovoitov, Daniel Borkmann, John Fastabend,
	Jesper Dangaard Brouer, linux-rt-devel, bpf, security, stable,
	Jamal Hadi Salim

When sashiko-gemini[1] reviewed commit a8a02897f2b4
("net/sched: cls_api: Handle TC_ACT_CONSUMED in tcf_qevent_handle") it
 correctly pointed out the following:

"
This is a pre-existing issue, but does executing a redirect via a qevent
filter cause a NULL pointer dereference?
When tcf_qevent_handle() processes a TC_ACT_REDIRECT, it calls
skb_do_redirect(). This eventually calls bpf_net_ctx_get_ri() which
dereferences the task bpf_net_context:
include/linux/filter.h:bpf_net_ctx_get_ri() {
    ...
    struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get();
    if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_RI_INIT)) {
    ...
}
Since qevents are evaluated during enqueue, which runs within
__dev_queue_xmit() after sch_handle_egress() has already executed and
cleared the bpf_net_context pointer, will this dereference a NULL pointer?
"

That issue is fixed in patch 1. See the commit log for details.

Upon further investigation it turns out that TC_ACT_REDIRECT being returned
from the egress qdiscs never actually worked. When an action returns that
code we would silently lose it and the packet will never be redirected.
After all those years, if nobody complained, my gut feel is it was never
used by anyone with serious need for it.
Patch 2 fixes it by 1) putting a warning out when someone does and 2) asking
the core to drop the packet. At least this would help whoever is
misconfiguring to diagnose the issue much faster.
I had initially attempted to "fix" this and make it work, but unfortunately
it's a bit ugly, so I left it as is; I didn't think it was worth fixing.

Apologies for the shotgun Cc - it's what get_maintainer.pl told me to use.


[1] https://sashiko.dev/#/patchset/20260620130749.226642-1-jhs%40mojatatu.com

---
Changes since v1 (address 3 sashiko comments):
1)Patch 1: Address pre-existing issue to cover asynchronous qdisc enqueue
  operations in particular if bpf_redirect() is invoked from an attached
   ebpf program (the helper invokes bpf_net_ctx_get_ri())
https://sashiko.dev/#/patchset/20260620130749.226642-1-jhs%40mojatatu.com

2)Patch 2: Explain in the commit message that it is actually design intent to
  remove TC_ACT_REDIRECT from tcf_qevent_handle().
https://sashiko.dev/#/patchset/20260626165156.169012-1-jhs@mojatatu.com?part=2

3) Patch 3: be explicit with $EBPFDIR
https://sashiko.dev/#/patchset/20260626165156.169012-1-jhs@mojatatu.com?part=3
---
 net/core/dev.c                                  | 31 +++++++++++++++----
 include/net/pkt_cls.h                            | 13 +++++++
 net/sched/cls_api.c                              |  6 +---
 net/sched/sch_cake.c                             |  2 +-
 net/sched/sch_drr.c                              |  2 +-
 net/sched/sch_dualpi2.c                          |  2 +-
 net/sched/sch_ets.c                              |  2 +-
 net/sched/sch_fq_codel.c                         |  2 +-
 net/sched/sch_fq_pie.c                           |  2 +-
 net/sched/sch_hfsc.c                             |  2 +-
 net/sched/sch_htb.c                              |  2 +-
 net/sched/sch_multiq.c                           |  2 +-
 net/sched/sch_prio.c                             |  2 +-
 net/sched/sch_qfq.c                              |  2 +-
 net/sched/sch_sfb.c                              |  2 +-
 net/sched/sch_sfq.c                              |  2 +-
 tools/testing/selftests/tc-testing/action-ebpf   | Bin 856 -> 9072 bytes
 tools/testing/selftests/tc-testing/action.c      |   5 +++
 .../tc-testing/tc-tests/infra/qdiscs.json        |  32 ++++++++++++++
 19 files changed, 87 insertions(+), 26 deletions(-)


^ permalink raw reply

* [PATCH net 2/3 v2] net/sched: Handle TC_ACT_REDIRECT from qdisc filter chains
From: Jamal Hadi Salim @ 2026-06-29 10:21 UTC (permalink / raw)
  To: netdev
  Cc: jiri, davem, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, toke, Steven Rostedt, Petr Machata,
	Alexei Starovoitov, Daniel Borkmann, John Fastabend,
	Jesper Dangaard Brouer, linux-rt-devel, bpf, security, stable,
	Jamal Hadi Salim, Victor Nogueira
In-Reply-To: <20260629102157.737306-1-jhs@mojatatu.com>

When a TC filter attached to a qdisc filter chain returns
TC_ACT_REDIRECT (ex: via an eBPF program calling bpf_redirect() or an
act_bpf action), the redirect was silently lost, i.e. no qdisc classify
function handled TC_ACT_REDIRECT, so the packet fell through the
switch and was enqueued normally instead of being redirected.

This has been broken since bpf_redirect() was introduced for TC in
commit 27b29f63058d ("bpf: add bpf_redirect() helper"). We got lucky
for a long time because bpf_net_context was a per-CPU variable that
was always available.
commit 401cb7dae813 ("net: Reference bpf_redirect_info via task_struct on PREEMPT_RT.")
turned bpf_net_context into a task_struct member that is only set up by
explicit callers. Without a caller setting it up, bpf_redirect() itself
crashes with a NULL pointer dereference in bpf_net_ctx_get_ri().

The NULL deref is fixed separately by extending the bpf_net_context
lifetime to cover qdisc enqueue. However, even with bpf_net_context
available, TC_ACT_REDIRECT from qdisc filter chains cannot be honored
without adding skb_do_redirect() calls to every qdisc classify
function, which would require changes across net/sched/. Isolate it
to ebpf core where it belongs.

Instead, add a tcf_classify_qdisc() inline helper in pkt_cls.h, as a
wrapper around tcf_classify() for use by qdisc classify functions and
tcf_qevent_handle(). When the classify verdict is TC_ACT_REDIRECT,
the wrapper converts it to TC_ACT_SHOT, dropping the packet rather
than letting it continue silently. Dropping is preferred over
letting the packet through because the user immediately sees packet
loss and, with the help of the one-time warning in the log
("use eBPF with clsact or mirred redirect instead"), can quickly
identify and fix the misconfiguration. Silently passing the packet
through would hide the problem and leave the user wondering why their
redirect is not working.

The clsact fast path, tc_run() continues to call tcf_classify() directly
and is unaffected: TC_ACT_REDIRECT is returned as-is and handled by
sch_handle_egress/ingress() calling skb_do_redirect() as before.

Also (to emphasize again to Sashiko):
Remove the TC_ACT_REDIRECT case from tcf_qevent_handle() as well.
skb_do_redirect() belongs in the BPF plumbing layer (net/core/), not
in net/sched/. The case was never consistent with the rest of the qdisc
classification infrastructure, where no classify function handles
TC_ACT_REDIRECT. It appears to have been a cut-and-paste artifact from
the qevent introduction rather than a deliberate design decision.

Fixes: 27b29f63058d ("bpf: add bpf_redirect() helper")
Fixes: 401cb7dae813 ("net: Reference bpf_redirect_info via task_struct on PREEMPT_RT.")
Tested-by: Victor Nogueira <victor@mojatatu.com>
Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
---
 include/net/pkt_cls.h    | 13 +++++++++++++
 net/sched/cls_api.c      |  6 +-----
 net/sched/sch_cake.c     |  2 +-
 net/sched/sch_drr.c      |  2 +-
 net/sched/sch_dualpi2.c  |  2 +-
 net/sched/sch_ets.c      |  2 +-
 net/sched/sch_fq_codel.c |  2 +-
 net/sched/sch_fq_pie.c   |  2 +-
 net/sched/sch_hfsc.c     |  2 +-
 net/sched/sch_htb.c      |  2 +-
 net/sched/sch_multiq.c   |  2 +-
 net/sched/sch_prio.c     |  2 +-
 net/sched/sch_qfq.c      |  2 +-
 net/sched/sch_sfb.c      |  2 +-
 net/sched/sch_sfq.c      |  2 +-
 15 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 3bd08d7f39c1..3a542a72e9a5 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -159,6 +159,19 @@ static inline int tcf_classify(struct sk_buff *skb,
 
 #endif
 
+static inline int tcf_classify_qdisc(struct sk_buff *skb,
+				     const struct tcf_proto *tp,
+				     struct tcf_result *res, bool compat_mode)
+{
+	int ret = tcf_classify(skb, NULL, tp, res, compat_mode);
+
+	if (unlikely(ret == TC_ACT_REDIRECT)) {
+		pr_warn_once("TC_ACT_REDIRECT from qdisc filter chains is not supported; use eBPF with clsact or mirred redirect instead\n");
+		ret = TC_ACT_SHOT;
+	}
+	return ret;
+}
+
 static inline unsigned long
 __cls_set_class(unsigned long *clp, unsigned long cl)
 {
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 3e67600a4a1a..3ca56d060e28 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -4033,7 +4033,7 @@ struct sk_buff *tcf_qevent_handle(struct tcf_qevent *qe, struct Qdisc *sch, stru
 
 	fl = rcu_dereference_bh(qe->filter_chain);
 
-	switch (tcf_classify(skb, NULL, fl, &cl_res, false)) {
+	switch (tcf_classify_qdisc(skb, fl, &cl_res, false)) {
 	case TC_ACT_SHOT:
 		qdisc_qstats_drop(sch);
 		__qdisc_drop(skb, to_free);
@@ -4045,10 +4045,6 @@ struct sk_buff *tcf_qevent_handle(struct tcf_qevent *qe, struct Qdisc *sch, stru
 		__qdisc_drop(skb, to_free);
 		*ret = __NET_XMIT_STOLEN;
 		return NULL;
-	case TC_ACT_REDIRECT:
-		skb_do_redirect(skb);
-		*ret = __NET_XMIT_STOLEN;
-		return NULL;
 	case TC_ACT_CONSUMED:
 		*ret = __NET_XMIT_STOLEN;
 		return NULL;
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index a3c185505afc..94eb47ac54ee 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -1730,7 +1730,7 @@ static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
 		goto hash;
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-	result = tcf_classify(skb, NULL, filter, &res, false);
+	result = tcf_classify_qdisc(skb, filter, &res, false);
 
 	if (result >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index 020657f959b5..91b1ef824afa 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -312,7 +312,7 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch,
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 	fl = rcu_dereference_bh(q->filter_list);
-	result = tcf_classify(skb, NULL, fl, &res, false);
+	result = tcf_classify_qdisc(skb, fl, &res, false);
 	if (result >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c
index 5434df6ca8ef..98364f74211e 100644
--- a/net/sched/sch_dualpi2.c
+++ b/net/sched/sch_dualpi2.c
@@ -364,7 +364,7 @@ static int dualpi2_skb_classify(struct dualpi2_sched_data *q,
 		return NET_XMIT_SUCCESS;
 	}
 
-	result = tcf_classify(skb, NULL, fl, &res, false);
+	result = tcf_classify_qdisc(skb, fl, &res, false);
 	if (result >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c
index cb8cf437ce87..25fcf4079fec 100644
--- a/net/sched/sch_ets.c
+++ b/net/sched/sch_ets.c
@@ -391,7 +391,7 @@ static struct ets_class *ets_classify(struct sk_buff *skb, struct Qdisc *sch,
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 	if (TC_H_MAJ(skb->priority) != sch->handle) {
 		fl = rcu_dereference_bh(q->filter_list);
-		err = tcf_classify(skb, NULL, fl, &res, false);
+		err = tcf_classify_qdisc(skb, fl, &res, false);
 #ifdef CONFIG_NET_CLS_ACT
 		switch (err) {
 		case TC_ACT_STOLEN:
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index cafd1f943d99..6cce86ba383c 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -91,7 +91,7 @@ static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch,
 		return fq_codel_hash(q, skb) + 1;
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-	result = tcf_classify(skb, NULL, filter, &res, false);
+	result = tcf_classify_qdisc(skb, filter, &res, false);
 	if (result >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c
index 72f48fa4010b..069e1facd413 100644
--- a/net/sched/sch_fq_pie.c
+++ b/net/sched/sch_fq_pie.c
@@ -96,7 +96,7 @@ static unsigned int fq_pie_classify(struct sk_buff *skb, struct Qdisc *sch,
 		return fq_pie_hash(q, skb) + 1;
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-	result = tcf_classify(skb, NULL, filter, &res, false);
+	result = tcf_classify_qdisc(skb, filter, &res, false);
 	if (result >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 7e537295b8b6..e87f5021a199 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1143,7 +1143,7 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 	head = &q->root;
 	tcf = rcu_dereference_bh(q->root.filter_list);
-	while (tcf && (result = tcf_classify(skb, NULL, tcf, &res, false)) >= 0) {
+	while (tcf && (result = tcf_classify_qdisc(skb, tcf, &res, false)) >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
 		case TC_ACT_QUEUED:
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 908b9ba9ba2e..fdac0dc8f35a 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -243,7 +243,7 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
 	}
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-	while (tcf && (result = tcf_classify(skb, NULL, tcf, &res, false)) >= 0) {
+	while (tcf && (result = tcf_classify_qdisc(skb, tcf, &res, false)) >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
 		case TC_ACT_QUEUED:
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 4e465d11e3d7..004f0d275caf 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -36,7 +36,7 @@ multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 	int err;
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-	err = tcf_classify(skb, NULL, fl, &res, false);
+	err = tcf_classify_qdisc(skb, fl, &res, false);
 #ifdef CONFIG_NET_CLS_ACT
 	switch (err) {
 	case TC_ACT_STOLEN:
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index e4dd56a89072..79437c587e7e 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -39,7 +39,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 	if (TC_H_MAJ(skb->priority) != sch->handle) {
 		fl = rcu_dereference_bh(q->filter_list);
-		err = tcf_classify(skb, NULL, fl, &res, false);
+		err = tcf_classify_qdisc(skb, fl, &res, false);
 #ifdef CONFIG_NET_CLS_ACT
 		switch (err) {
 		case TC_ACT_STOLEN:
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index cb56787e1d25..6f3b7273cb16 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -709,7 +709,7 @@ static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch,
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 	fl = rcu_dereference_bh(q->filter_list);
-	result = tcf_classify(skb, NULL, fl, &res, false);
+	result = tcf_classify_qdisc(skb, fl, &res, false);
 	if (result >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index b1d465094276..ed39869199c0 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -260,7 +260,7 @@ static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl,
 	struct tcf_result res;
 	int result;
 
-	result = tcf_classify(skb, NULL, fl, &res, false);
+	result = tcf_classify_qdisc(skb, fl, &res, false);
 	if (result >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 758b88f21865..77675f9a4c46 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -171,7 +171,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
 		return sfq_hash(q, skb) + 1;
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-	result = tcf_classify(skb, NULL, fl, &res, false);
+	result = tcf_classify_qdisc(skb, fl, &res, false);
 	if (result >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
-- 
2.54.0


^ permalink raw reply related

* [PATCH net 3/3 v2] selftests/tc-testing: Verify bpf redirect on RED block with preceding clsact (egress) classifier
From: Jamal Hadi Salim @ 2026-06-29 10:21 UTC (permalink / raw)
  To: netdev
  Cc: jiri, davem, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, toke, Steven Rostedt, Petr Machata,
	Alexei Starovoitov, Daniel Borkmann, John Fastabend,
	Jesper Dangaard Brouer, linux-rt-devel, bpf, security, stable,
	Victor Nogueira, Jamal Hadi Salim
In-Reply-To: <20260629102157.737306-1-jhs@mojatatu.com>

From: Victor Nogueira <victor@mojatatu.com>

The bpf_net_context used by sch_handle_egress() is stack-allocated and torn
down in that function. By the time tcf_qevent_handle() runs,
current->bpf_net_context is NULL.

When a filter attached to a qevent block (e.g. RED's early_drop or mark
qevents, which always use shared blocks) returns TC_ACT_REDIRECT,
tcf_qevent_handle() calls skb_do_redirect(), which in turn calls bpf helper
bpf_net_ctx_get_ri(). That helper unconditionally dereferences
current->bpf_net_context resulting in a NULL pointer dereference.

Add a test case that reproduces this scenario by attaching a filter to
clsact (egress) and a bpf filter to a block attached to RED. Use TBF as
RED's parent, so that a traffic burst builds backlog and RED's early drop
triggers the block filter.

Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Victor Nogueira <victor@mojatatu.com>
---
 .../testing/selftests/tc-testing/action-ebpf  | Bin 856 -> 9072 bytes
 tools/testing/selftests/tc-testing/action.c   |   5 +++
 .../tc-testing/tc-tests/infra/qdiscs.json     |  32 ++++++++++++++++++
 3 files changed, 37 insertions(+)

diff --git a/tools/testing/selftests/tc-testing/action-ebpf b/tools/testing/selftests/tc-testing/action-ebpf
index 4879479b2ee5c046279be0fe8f9ca313dfb7e618..52c47e42bf0af024a073cacc823c8270f906a8df 100644
GIT binary patch
literal 9072
zcmb<-^>JfjWMqH=MuzVU2p&w7fnkFTg6#liIxt8xFfwchvl$qsLh0>H5Js}1514@=
z&%nUI&VW$w9^k{kD9EU)D$L5PS|lzYF0Cra7^++>ULwxGz+}R}tm-LjFKNYX&CMji
zz`)GN=qb#=z@o_DDQwQoz`&})z^rP=&CSigzy@M+bK7w<FtF<}3Q7yHIY?AVGOL2L
zs!M_lVPN23Wq=5P4B=#DV3I&^x%e4CqTIra%&OenR@~OC3=BNHVEaKF3vLDmUS0-I
zVGyT-ksrk86K8~|>|o?)VBi;H@Dzra$G{)}QwmZi2vf(vAS4Xc!oa}rf|-GVm4T51
z6i$o`vQQQS0|PV&85kHqay%drW;2i~8K#8{%uWmp3@mOSf`OHVjggI&gPomGfPsO5
zF^Y|WX9`Fc7X!}>ka~6|1+X{=gCIzplQEEsK@cLt4AH^KAP$n@;9?L5i?gz`vT)61
zU|`@5Il#13f`?m*iGhJ>nFIq5ADFdVf`x}4%vvGA!6N`>t(4&55d^bVNeJ)=fmy31
zM0kY3tThr6JR)G$S_v5*Q7~(rgaVHkn6+L)g-0CB+9099BLQY@l+fXkR0G+&Ny30f
z3M{r+!i7f~%-SO1!6O4^ZI$rhkp;81Nd)l7fmz!nLU`oCtX&cjJPKgeZiyHkMKEiR
zL;{bJ5y<4d5-B{&VAei~5*`(>?0$(B9#t^wfJ6t68kluhqK9`QBLf4|5ebe7d>kN(
zN8Ju&!Vw7u1|EBRW(EePqY^WCoWRPDNi5)T2D6S!Ea80x(s)9GV+9`v(+LR<9v5$r
z>JuQ1fnY@^B{uK`DT4%0No?T>1{-!pVh01i5)%UhFQYUo4?7DpNF_MFSs4&)76vY7
zCI$v>I}4^~GCUgMATyrJFz{%DSubRmcyz$5moh9ox?me#$*}PlfLX6)*m(@WtT!?o
zJVs#FTNzFsV||b*?_{`mOu?-8GCVwHVAcm2K7nl@Pk)pV5L96LC?jwL#QP+}AjHA+
zNruPV9HjHJ3<HlPnDs@5g+bssNXa)D1|bEeZ!$bq;K2Sa!@y$=X8n*6U|`^}0eOz;
zw~PUgEm-3p850J6d1eL%Ek+4eO?D=JZDs}reMV7MJq|{GkcUi|6~M{Qf?0{*otc5b
zkx`!2ft`aZfSG}TJ0O6GSCYpSY$l&112iG<OS15|fyD$QIiLwuP?86ljD;ixpovmg
zQiR7HtWH!?g2w~wN-;?p9#62CxTFq`7dS8^Bn^1H!D3R9COkf1b<&a+JicHt8A%Tw
zzsI29kd^dd;0I+ce?}<=F$Pdx=KvS2EDWH6On{k5ftgu=A%YPk1In!o45ADS3~~$%
z44_=A$-uy%$H2e<%I=_|G=PDDA&P;4A&Y^5A%}s1p@4ybp_YMxp_ze!p@)HiVIl(q
z!+Zt?h7}A93|ko(81^wRFq~vyV7SD<!0?EHf#DSc1H)$q28M483=F>+7#P?X85p=3
z85l$u85m?385ooq85r~!85k@W85o=y85n#S85klN85mL+85r^y85k-U85rsq85nvQ
z85kxrGBC_!WMEjr$iT3Mk%3_sBLl;JMh1qnj0_Cd7#SGuGcqtdXJlY_$H>6&g^_{b
z7b61$GZO;?7ZU@6FcSlV3=;!`DiZ^PHWLGb850A83ljr_HxmOx91{aW3KIiEE)xSo
z8509TB@+Wf8xsRVHxmQHWF`iN*-Q)!OPClK)-o|LY-M6#*vrJgaF~gK;R+K2!!0HT
zhQ~|{3~!hi7=AD@FfcMRFeK+B=A|o?r4|)u=I1FG8R;47nKC3Mmt^MW=_NDhF~sL&
zCa2~Vr!pjGBo;Bm$2$fEIY!0@dq%m&heQUr#>Yby$LD7=WagE?c-i?dR#9q7W>IQ#
z2}3bMPHG-QX<l(=dR}UZ0!VRue5tV!LqT>)d`V?NDno8!Q8q(iX=-U|d~RYvL1tb$
zLqSn~Nq%yE4ntW^VqSbfQEG8&UI~O#lAH-)fYmS*6lLZYWtLPjWagz8r4|>*XQpN5
zrKDCc!03|Xc!)r95<^B}aRx(4a(r@5VsUY13PVa_Ng|ktPt8kV$V)89jL%GANK4Gk
z%&BB3O3lqLNsZ4eFk#5aPfpAMv*3bea6vPe%7Xl&5~wJc2{JuCH?<^@AuT7rJU%<M
zvX~(+BR?$-5gNrAAU*N%rG{n<C19z<l$4@)h}SZU<I{=~(-EqnaZzf)0FufqDlUO2
z$SjUe%}Y)!V8|?hY6XQ^en~z<e0)->p&3Il#64g#v!Ki*zPKnEEN5)Q0OqF@mw*^%
zV2R9vGP8J)NLo%}dNIWDIf+TBISfe!Y4HfZloXdF<`y8Fmy@5Dt^gt!;^RxrOc=^D
zi&Eo3k)K|iA77lBUd&LO&5)E|nwJuvl3Es@nZ^K){^Fu!aL__%GX@Y1c4<m+Nj#hZ
ziUyECW`P+)aY<rHDnn64JZhqek1sYh0=uy|KRKHLY-?s!Dg(rwkhGRj4&gDx#}{YE
zCzYn9F{nTbAV@)jo1FiekwF3~ox;q(0K&@=4H7006VyIbVqjo70BR?I3NxsBekdDM
ze1XhhW?*0dwH0Nd;t3244BAk30|Ntt36#Bnfq}sh$_AO~1!X^AU|<M^vOy(h9F(m9
zDw3dV2Sx^lGAKKNk%6He>`w-U21W*kCaAaq69Yp#l<mO8z%U8QPGDkSm=0w(FflNI
z+yQd)0wzc?3Su8%VqjPgRr7#}fngVv&A`mSaD)NUmQ`S8U^owDFJNY1xCvFGz{0@r
z6v}pBVPN<GWha1Y08l%Pfq|icg@NH00|NsW0|UbW76t}TQygS311kdq7pT|-RR^q)
z00y-L9atF{grVXMtdOFckAZ<<0V@N81|!rQRt5$`P$V%haDc2bhp-tzY*5@lOau9V
z0Zgz#!_0*Ubs#f9`a!i8sC5dezBoYw2+SnSz`&3MF^hwnfq@|d!Unf=LFoac6sEtL
zk%561B!Iw7psog}ssRNNC`~m(^@Avoogk$kZGxa`kC6eCG{997sI3cfp8^8|<8+8P
z52%@oP5nuzyf8=-q>+Jv7uh^(P!+<!zyqq1t3Zh!5;dR}Imj6xUEpXE2UVru#yA5*
zJwzQJDD8t3bwJDjd4WL^qywsM0z@6C8^kaViH($9b5iq0Qq=)1t}x3|Ld6@Xibbn+
zF)MLIVGb#>;Tk~2IHW8u&IT3d7KmaVTniN=*ZTR{&{|(NKbt{MAKr}MEJ`gYEy_~}
zagKL%4vF{owuY*Uhqn`Svq6<qVo6C+W>RTMYJ9wgMsX^*8KR*CF-JE$UrAG^v^X_I
zQxnvBP=E->XXk4amlTyImnguCas{noO$N?lT{}?Ct6-~OP+<VK5#AnwXxD}F6(9iB
zsX?wTo<Xk8A=c3L53Dr=qfskD5D#4CFo5a^SUCbJ!$BC-Bn6e9pkxlpqYEMV3&dp*
zVqjpnjKmjVfHe0&c?6^Y)Hnv^Q)E4>85tP1f%;Dt5WV2c#=yV;62A-5=mO~-gZQ9k
zB$p2Z1IP>zAFM9`)eew&G!p{@M+(SB1_n@B`v3p`|11m~3>-Dw3?Mf_WFRESED#G_
z9OiFyahQ5gbDNccgP{&ocrh~m=Hq2xWCRuJpi+vFkrC9nf%Nb}r6e<>JQF{w91EzS
z&)_R@sm`yAaZAD<73VVx=lwJX6-zq=J{1((2_#EytNSW+OeGK09bh`wvnwt9_@k2y
z0!I>LJ>^b$fQASTtbMp}G3T|;oM#+dfr~D(vM{hRaWQa0$`?@0!^_CT#J~uu1&{;<
z8Ckiw6j_-Rp>nJoD0(Cydh{6dON)#2GxL&jN>ftx6N__o(^K<Oi!zf@C2}(JN-Lnr
zUoRPydvtSh%uMt$Kn)hX3~*DZST6$<usARaBLf2q!^i*Fq?y6B6{ZX`1E|!;CJwDQ
z(WIEcBT#69pb!Gr!q~(?wHP*Ww3-X5gqZ<UBSSez6f;IDgGe%iyN3`qGRX{Yi6KOH
zFfcF(A%z1h-GMrqLP+5NOQ%X;^@wnQslUa*z#zx~s`(*i!$J_0cR?bc{07nmsuN*+
zP=y7m4`F;zs3?Ot;ILz00JQ_uk@z6fk<A0KLHQ1(1Y|yl4|5NU56f3DKBz85Ru5_$
zAoF4VgsBI$bCKmWKpd$1K=$dN@eR=Upt1}k1T)V9#6i+;gT{xoXJG1G(Bwf`6C?za
z2X*C<`Jiq+GCu^wL9#CbjUR)?2eqj|LNN28#Tz(<fcy)~YcP3GIgP9yG>m}EF9C6o
z?5{xM*P!u1?Rbz7%={J*2T6Yi8Xq)N0TP0#p8(<@sRyNFkPu8BJSvGKKL;d<#D~>6
zF!f8&<UzwBAR(Ci8W0Cb{{}Sv7BoI6zk!5c`uBi1Ncutj6_5~29@Hm5=AQs@kkp?+
z<6l7IUqR#FK;z#*<AeH0AR(CjPe2?b`(B{&-=Oh9eG-rmO#c@U2TA`AH2xnnKB!s)
z3BmM(%5P*o2dJ(_GLHw1FM!4e4M!pCmq3#T4OJn_gZhHVd=)hHpgs|@ybhYY0UF-~
zjSm_QL)LGDCJ*X=BFjVjQ1JZcfu`OEjURx<4?*LD`d7&2gYq<p531HdWhV&3_#iP@
zc?n{J;sjPcg7}~^J_aNR?T3K$!pcLC97qkUd;{@8N@3*{h!4WB@&?2PsfU#hAU;Sv
zEWg9}u>1?-!}1%555mahJ*W?i?0%5>F!zDX2Fb(9YmgjB47t1o$-~N9kUYqIQ2hrA
zACOv5-J=HLK+_|LuZ6}3&$A%Od!xz2{0}p)6ips9wg3_W*$=|YK^#!?3~J^vBtFP}
zynOru40<W4Nu}xWiAhOCsbvg$C8-r940=VWIeJbZZh9aNq&XiCZ_Y#bh~_=ifTFzg
zoXp~qVu)slp~WRd@%d?K#i<}+xDd?BoXot`_~McxWF4TvIcOUWwF?7w62yqiyfpYA
zC~C(jC#Nho9%MvuW;$Yk6-g_|N@VNOiV|~Eq4t4BWs6ISN)nS8^olEU!89}+py5U-
z#S0xfK{uxb)EsAEU{HhRKbSlkmjTq72Z@2&HZV0H8rBW~v5_$-j*<C?K#h9nm;tgl
zOg|`3B8$WFJ4|dI4*jrp1T4S9)T8S+!l5754j^RzZK!@w`iI#8qG9a-bpL|F3M3E1
zAU+7g_%IsQZukxpfYRt{LG?Gf7)U8-90^3fL30?oT2Olc)J}zkH%Jc%3qT7ZSU5_7
z`fpGSmIo2g=@@h~K~m|U#xer~14ti;55wqc!=T{@lZVlbQ2jAz8ql~5K@5;~HoE&0
zVD5*S19AtbeGZF%SiELJ?T3}4F#GR96EruD@PoMnRKCO5Fufr91t?G%7#LvfZIBoU
zqpJnQGe{}A`!_%b$YA0i_k(&-=<ZiU8b<(`55oeW!k>YGK^f`~m^gZRlx4&oejlLr
z!@>z<KZuQPKS&w6pFnn*fE0tqa6ud-jBY=u|Afu|u!IB4M<Dw_<0k0#gYpM9`+q>~
zhxHd=PJro$@j+97*z9irS;)Y^0IT0XTu_pN>4zH6@En?cVCKQ-0BA??8cYC6qr3Gn
zj`aHgWFcrA8>$b)1&s@#+YjoWfXqZsziObsVqjo^<zJ{Vu;wY82DQJLeg5ZzhTdUn
sKy(AN{D;LC$bOJG$Sx2K!=j)uDHsQdu7KJ<1F8W;fkp>l?uWH&07wxN(*OVf

delta 317
zcmez1c7tt#hG+yM0~|PjSq=;w6K#!|-2;3kf8>;vbYoy(U}5<9A1sGNNKf7<Ebhb3
zz`!8HzycRnfU;~E7#IW@SfM<S2@oa|GYf-WNoqw2Lt=7CW`16Lc0QD)n?2b|MpptN
zte4E7S6ot5l9<GxS6rD}l9)7kqm2C|E)G_I1_lP^$saj|Cx1|60h=E`IZ#f0vW2V#
zqw3^BS$jso$s1+u8SN$;%84@;O!kyhXVjnkQAu3%1H=Uk%upKSbcV@=a>A_P3=9lR
YATu>8pmH!86gW%_3=AAlaS1350I>8ljQ{`u

diff --git a/tools/testing/selftests/tc-testing/action.c b/tools/testing/selftests/tc-testing/action.c
index c32b99b80e19..350f2d36a773 100644
--- a/tools/testing/selftests/tc-testing/action.c
+++ b/tools/testing/selftests/tc-testing/action.c
@@ -20,4 +20,9 @@ __attribute__((section("action-ko"),used)) int action_ko(struct __sk_buff *s)
 	return TC_ACT_OK;
 }
 
+__attribute__((section("action-redirect"), used)) int action_redirect(struct __sk_buff *s)
+{
+	return TC_ACT_REDIRECT;
+}
+
 char _license[] __attribute__((section("license"),used)) = "GPL";
diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json
index a1f97a4b606e..762f86ceab1c 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json
@@ -1540,5 +1540,37 @@
             "$TC qdisc del dev $DUMMY root",
             "$IP addr del 10.10.10.10/24 dev $DUMMY || true"
         ]
+    },
+    {
+        "id": "fb8d",
+        "name": "Verify bpf redirect on RED block with preceding clsact (egress) classifier",
+        "category": [
+            "qdisc",
+            "red",
+            "qevent",
+            "clsact"
+        ],
+        "plugins": {
+            "requires": "nsPlugin"
+        },
+        "setup": [
+            "$IP addr add 10.10.10.1/24 dev $DUMMY",
+            "$IP neigh add 10.10.10.2 lladdr 02:00:00:00:00:01 dev $DUMMY nud permanent",
+            "$TC qdisc add dev $DUMMY handle 1: root tbf rate 1Mbit burst 10K limit 1M",
+            "$TC qdisc add dev $DUMMY parent 1:1 handle 11: red limit 1M avpkt 1400 probability 1 burst 38 harddrop min 30000 max 30001 qevent early_drop block 10",
+            "$TC qdisc add dev $DUMMY clsact",
+            "$TC filter add dev $DUMMY egress protocol ip prio 1 matchall action gact pass",
+            "$TC filter add block 10 protocol ip prio 1 matchall action bpf obj $EBPFDIR/action-ebpf sec action-redirect"
+        ],
+        "cmdUnderTest": "bash -c 'data=$(head -c 1400 /dev/zero | tr \"\\0\" \"x\"); exec 3>/dev/udp/10.10.10.2/12345; for i in $(seq 1 8000); do printf \"%s\" \"$data\" >&3; done; exit 0'",
+        "expExitCode": "0",
+        "verifyCmd": "$TC -s filter show block 10",
+        "matchPattern": "Sent [1-9][0-9]* bytes [1-9][0-9]* pkt",
+        "matchCount": "1",
+        "teardown": [
+            "$TC qdisc del dev $DUMMY clsact",
+            "$TC qdisc del dev $DUMMY handle 1: root",
+            "$IP addr del 10.10.10.1/24 dev $DUMMY"
+        ]
     }
 ]
-- 
2.54.0


^ permalink raw reply related

* [PATCH net 1/3 v2] net: Extend bpf_net_context lifetime to cover qdisc enqueue
From: Jamal Hadi Salim @ 2026-06-29 10:21 UTC (permalink / raw)
  To: netdev
  Cc: jiri, davem, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, toke, Steven Rostedt, Petr Machata,
	Alexei Starovoitov, Daniel Borkmann, John Fastabend,
	Jesper Dangaard Brouer, linux-rt-devel, bpf, security, stable,
	Jamal Hadi Salim, Victor Nogueira
In-Reply-To: <20260629102157.737306-1-jhs@mojatatu.com>

The bpf_net_context used by sch_handle_egress() is stack-allocated and torn
down when that function returns. By the time tcf_qevent_handle() runs,
current->bpf_net_context is NULL.

When a filter attached to a qevent block (e.g. RED's early_drop or mark
qevents, which always use shared blocks) returns TC_ACT_REDIRECT,
tcf_qevent_handle() calls skb_do_redirect(), which in turn calls bpf helper
bpf_net_ctx_get_ri().  That helper unconditionally dereferences
current->bpf_net_context resulting in a NULL pointer dereference.

Note: The same holds for actions that invoke BPF redirect helpers
(e.g. act_bpf running a program that calls bpf_redirect()) during qevent
classification itself.

Fix:
Move the bpf_net_context lifecycle out of sch_handle_egress() into
__dev_queue_xmit(), so that it spans both the egress TC fast path and the
qdisc enqueue.
Note: The call is placed outside the egress_needed_key static branch
to cover the case where clsact static key is disabled. Unfortunately this
adds a small unconditional penalty to the code path _per packet_ only
guarded by CONFIG_NET_XGRESS (two writes and one read).

As pointed out by sashiko [1]:
The same context must also be set up in net_tx_action()'s qdisc drain
path, since qdisc_run() -> netem_dequeue() -> qdisc_enqueue( RED child)
can trigger qevent classification asynchronously from softirq context.

This keeps all bpf_net_context management in net/core/dev.c, i.e. at the
existing boundary between tc core and BPF, without requiring any net/sched/
code to know about BPF plumbing.

Reproducer:

  tc qdisc add dev eth0 root handle 1: red limit 1MB min 10KB max 20KB \
      avpkt 1000 burst 100 qevent early_drop block 10
  tc filter add block 10 pref 1 bpf obj redirect.o

  traffic through eth0 triggers red_enqueue() -> tcf_qevent_handle() and,
  on a redirect verdict, a NULL deref in skb_do_redirect().

Fixes: 3625750f05ec ("net: sched: Introduce helpers for qevent blocks")
Tested-by: Victor Nogueira <victor@mojatatu.com>
Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
---
 net/core/dev.c | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 4b3d5cfdf6e0..b95a8b153c76 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4527,14 +4527,11 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 {
 	struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
 	enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS;
-	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
 	int sch_ret;
 
 	if (!entry)
 		return skb;
 
-	bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
-
 	/* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was
 	 * already set by the caller.
 	 */
@@ -4550,12 +4547,10 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 		/* No need to push/pop skb's mac_header here on egress! */
 		skb_do_redirect(skb);
 		*ret = NET_XMIT_SUCCESS;
-		bpf_net_ctx_clear(bpf_net_ctx);
 		return NULL;
 	case TC_ACT_SHOT:
 		kfree_skb_reason(skb, drop_reason);
 		*ret = NET_XMIT_DROP;
-		bpf_net_ctx_clear(bpf_net_ctx);
 		return NULL;
 	/* used by tc_run */
 	case TC_ACT_STOLEN:
@@ -4565,10 +4560,8 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 		fallthrough;
 	case TC_ACT_CONSUMED:
 		*ret = NET_XMIT_SUCCESS;
-		bpf_net_ctx_clear(bpf_net_ctx);
 		return NULL;
 	}
-	bpf_net_ctx_clear(bpf_net_ctx);
 
 	return skb;
 }
@@ -4767,6 +4760,9 @@ struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
  */
 int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
 {
+#ifdef CONFIG_NET_XGRESS
+	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx = NULL;
+#endif
 	struct net_device *dev = skb->dev;
 	struct netdev_queue *txq = NULL;
 	enum skb_drop_reason reason;
@@ -4795,6 +4791,9 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
 	skb_update_prio(skb);
 
 	tcx_set_ingress(skb, false);
+#ifdef CONFIG_NET_XGRESS
+	bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+#endif
 #ifdef CONFIG_NET_EGRESS
 	if (static_branch_unlikely(&egress_needed_key)) {
 		if (nf_hook_egress_active()) {
@@ -4898,12 +4897,18 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
 
 	reason = SKB_DROP_REASON_RECURSION_LIMIT;
 drop:
+#ifdef CONFIG_NET_XGRESS
+	bpf_net_ctx_clear(bpf_net_ctx);
+#endif
 	rcu_read_unlock_bh();
 
 	dev_core_stats_tx_dropped_inc(dev);
 	kfree_skb_list_reason(skb, reason);
 	return rc;
 out:
+#ifdef CONFIG_NET_XGRESS
+	bpf_net_ctx_clear(bpf_net_ctx);
+#endif
 	rcu_read_unlock_bh();
 	return rc;
 }
@@ -5815,6 +5820,9 @@ static __latent_entropy void net_tx_action(void)
 
 	if (sd->output_queue) {
 		struct Qdisc *head;
+#ifdef CONFIG_NET_XGRESS
+		struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
+#endif
 
 		local_irq_disable();
 		head = sd->output_queue;
@@ -5824,6 +5832,10 @@ static __latent_entropy void net_tx_action(void)
 
 		rcu_read_lock();
 
+#ifdef CONFIG_NET_XGRESS
+		bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+#endif
+
 		while (head) {
 			spinlock_t *root_lock = NULL;
 			struct sk_buff *to_free;
@@ -5860,6 +5872,10 @@ static __latent_entropy void net_tx_action(void)
 			tcf_kfree_skb_list(to_free, q, NULL, qdisc_dev(q));
 		}
 
+#ifdef CONFIG_NET_XGRESS
+		bpf_net_ctx_clear(bpf_net_ctx);
+#endif
+
 		rcu_read_unlock();
 	}
 
-- 
2.54.0


^ permalink raw reply related

* Re: [PATCH net 1/3] net: Extend bpf_net_context lifetime to cover qdisc enqueue
From: Sebastian Andrzej Siewior @ 2026-06-29 10:29 UTC (permalink / raw)
  To: Jamal Hadi Salim
  Cc: netdev, bpf, davem, edumazet, kuba, pabeni, horms, toke, jiri,
	clrkwllms, rostedt, kuniyu, sdf.kernel, skhawaja, liuhangbin,
	krikku, mkarsten, victor, ast, hawk, john.fastabend, daniel,
	Sashiko
In-Reply-To: <20260626165156.169012-2-jhs@mojatatu.com>

On 2026-06-26 12:51:54 [-0400], Jamal Hadi Salim wrote:
> The bpf_net_context used by sch_handle_egress() is stack-allocated and torn
> down in that function returned. By the time tcf_qevent_handle() runs
> current->bpf_net_context is NULL.
> 
> When a filter attached to a qevent block (e.g. RED's early_drop or mark
> qevents, which always use shared blocks) returns TC_ACT_REDIRECT,
> tcf_qevent_handle() calls skb_do_redirect(), which in turn calls bpf helper
> bpf_net_ctx_get_ri(). That helper unconditionally dereferences
> current->bpf_net_context resulting in a NULL pointer dereference.
> 
> Note: The same holds for actions that invoke BPF redirect helpers
> (e.g. act_bpf running a program that calls bpf_redirect()) during qevent
> classification itself. And as a matter of fact the same assumption is
> made in the code outside of tc.
> 
> Fix:
> Move the bpf_net_context lifecycle out of sch_handle_egress() into
> __dev_queue_xmit(), so that it spans both the egress TC fast path and the
> qdisc enqueue. The setup is placed outside the egress_needed_key static
> branch because qevents are independent of clsact/NF egress hooks and
> that key may stay disabled when only a qevent-bearing qdisc is
> configured. Unfortunately this adds a small unconditional penalty to the
> code path _per packet_ only guarded by CONFIG_NET_XGRESS (two writes and
> one read for bpf_net_ctx_set, plus one write for bpf_net_ctx_clear).

I fail to understand this but you and sashiko have an understanding...
If there is TC_ACT_REDIRECT returned by tc_run(), then the skb is NULL
and as such upon return from sch_handle_egress() the control flow goes
to the out label.
As a fix you move the bpf_net_ctx assignment to before CONFIG_NET_EGRESS
and clear it on exit. What do I miss here?

> This keeps all bpf_net_context management in net/core/dev.c i.e the
> existing boundary between tc core and BPF without requiring any net/sched/
> code to know about BPF plumbing.
> 
> Reproducer (see the accompanying tdc test):
> 
>   tc qdisc add dev eth0 root handle 1: red limit 1MB min 10KB max 20KB \
>       avpkt 1000 burst 100 qevent early_drop block 10
>   tc qdisc add dev eth0 clsact
>   tc filter add block 10 pref 1 bpf obj redirect.o

stupid question: how do I get this redirect.o? Just a simple thing to
reproduce this…

>   tc filter add dev eth0 egress protocol ip prio 1 matchall \
>       action gact pass
> 
>   traffic through eth0 triggers red_enqueue() -> tcf_qevent_handle() and,
>   on a redirect verdict, a NULL deref in skb_do_redirect().

Sebastian

^ permalink raw reply

* RE: [PATCH v3] xsk: fix memory corruptions in net/core/xdp.c
From: Fijalkowski, Maciej @ 2026-06-29 10:34 UTC (permalink / raw)
  To: Clement Lecigne, Lobakin, Aleksander, edumazet@google.com,
	netdev@vger.kernel.org
  Cc: bpf@vger.kernel.org, linux-kernel@vger.kernel.org,
	kuba@kernel.org, sdf@fomichev.me, horms@kernel.org,
	john.fastabend@gmail.com, ast@kernel.org, daniel@iogearbox.net
In-Reply-To: <20260629072300.1664622-1-clecigne@google.com>

> 
> From: Clément Lecigne <clecigne@google.com>
> 
> Commit 560d958c6c68 ("xsk: add generic XSk &xdp_buff -> skb conversion")
> introduced a vulnerability in the handling of XDP_PASS for AF_XDP zero-copy
> frames.
> 
> Note: Currently, this specific AF_XDP zero-copy conversion path is only
> reachable from the drivers/net/ethernet/intel/ice and
> drivers/net/ethernet/intel/idpf drivers.
> 
> When building an skb, xdp_build_skb_from_zc() uses the chunk size
> (xdp->frame_sz) for the allocation. However, napi_build_skb() automatically
> reserves space at the end of the allocation for the skb_shared_info
> structure.
> 
> Most high performance UMEM applications use 4K chunks, where the
> corruption cannot happen. However, if the UMEM is configured with 2KB
> chunks (a very common configuration to maximize packet density in memory),
> a standard 1500 MTU packet will trigger the corruption because the required
> space exceeds the 2048 byte chunk size:
> 
> Headroom (256) + Packet (1514) + skb_shared_info (320) = 2090 bytes
> 
> Because 2090 bytes > 2048 bytes and __skb_put() does not perform bounds
> checking, the memcpy() writes past the available linear data area and
> corrupts the skb_shared_info structure. This can lead to arbitrary code
> execution if pointers like destructor_arg are overwritten.
> 
> Additionally, in xdp_copy_frags_from_zc(), the allocation size is set
> strictly to the fragment size (len), but the subsequent memcpy() uses
> LARGEST_ALIGN(len). This mismatch results in an out-of-bounds write of
> up to 7 bytes, which triggers KASAN warnings and is unsafe despite typical
> page pool allocator padding.
> 
> Fix the skb allocation in xdp_build_skb_from_zc() by dynamically
> calculating the exact truesize required using SKB_HEAD_ALIGN() to
> properly account for the headroom, the LARGEST_ALIGN(len), and the
> skb_shared_info overhead.
> 
> Fix the out-of-bounds write in xdp_copy_frags_from_zc() by rounding up
> the allocation request using LARGEST_ALIGN(len) to match the copy
> operation.
> 
> Fixes: 560d958c6c68 ("xsk: add generic XSk &xdp_buff -> skb conversion")
> CC: Alexander Lobakin <aleksander.lobakin@intel.com>
> CC: Eric Dumazet <edumazet@google.com>
> Signed-off-by: Clément Lecigne <clecigne@google.com>

Hi Clement,

Do you have a reproducer for mentioned issue or is it only a fix from
theoretical POV?

To be clear, we were addressing headroom issues in this series:
https://lore.kernel.org/bpf/20260402154958.562179-1-maciej.fijalkowski@intel.com/

so I wanted to ask if you are able to have this malformed setup for
2k chunk size. That series should not allow for that.

I think this is the second time someone is trying to fix this area of code,
so it is not a nack or something, let us fix this, but I wanted to have
us on the same page.

Thanks,
Maciej

> ---
> Changes since v2:
>  - Used LARGEST_ALIGN to calculate the len to account for the aligned
> memcpy.
>  - Fixed the commit message to include the idpf driver.
> 
> Changes since v1:
>  - Used SKB_HEAD_ALIGN to properly calculate the required allocation size
>    including the skb_shared_info overhead.
>  - Re-ordered variable declarations.
> 
> ---
> diff --git a/net/core/xdp.c b/net/core/xdp.c
> index 9890a30584ba..7e39f17ad407 100644
> --- a/net/core/xdp.c
> +++ b/net/core/xdp.c
> @@ -698,8 +698,8 @@ static noinline bool xdp_copy_frags_from_zc(struct
> sk_buff *skb,
> 
>  	for (u32 i = 0; i < nr_frags; i++) {
>  		const skb_frag_t *frag = &xinfo->frags[i];
> -		u32 len = skb_frag_size(frag);
> -		u32 offset, truesize = len;
> +		u32 offset, len = skb_frag_size(frag);
> +		u32 truesize = LARGEST_ALIGN(len);
>  		struct page *page;
> 
>  		page = page_pool_dev_alloc(pp, &offset, &truesize);
> @@ -738,9 +738,10 @@ static noinline bool xdp_copy_frags_from_zc(struct
> sk_buff *skb,
>   */
>  struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp)
>  {
> +	u32 headroom = xdp->data_meta - xdp->data_hard_start;
>  	const struct xdp_rxq_info *rxq = xdp->rxq;
> -	u32 len = xdp->data_end - xdp->data_meta;
> -	u32 truesize = xdp->frame_sz;
> +	u32 len = LARGEST_ALIGN(xdp->data_end - xdp->data_meta);
> +	u32 truesize = SKB_HEAD_ALIGN(headroom + len);
>  	struct sk_buff *skb = NULL;
>  	struct page_pool *pp;
>  	int metalen;
> @@ -762,7 +763,7 @@ struct sk_buff *xdp_build_skb_from_zc(struct
> xdp_buff *xdp)
>  	}
> 
>  	skb_mark_for_recycle(skb);
> -	skb_reserve(skb, xdp->data_meta - xdp->data_hard_start);
> +	skb_reserve(skb, headroom);
> 
>  	memcpy(__skb_put(skb, len), xdp->data_meta,
> LARGEST_ALIGN(len));
> 


^ permalink raw reply

* Re: [PATCH bpf-next v10 1/5] bpf: add bpf_icmp_send kfunc
From: Mahe Tardy @ 2026-06-29 10:35 UTC (permalink / raw)
  To: Stanislav Fomichev
  Cc: bpf, andrii, ast, daniel, john.fastabend, jordan, martin.lau,
	yonghong.song, emil, netdev, edumazet, kuba, pabeni, davem, horms
In-Reply-To: <aj6kdnfAB0LJKDcR@devvm7509.cco0.facebook.com>

On Fri, Jun 26, 2026 at 09:18:39AM -0700, Stanislav Fomichev wrote:
> On 06/25, Mahe Tardy wrote:
> > On Thu, Jun 25, 2026 at 09:24:59AM -0700, Stanislav Fomichev wrote:
> > > On 06/25, Mahe Tardy wrote:
> > 
> > [...]
> > 
> > > > +__bpf_kfunc int bpf_icmp_send(struct __sk_buff *skb_ctx, int type, int code)
> > > > +{
> > > > +	struct sk_buff *skb = (struct sk_buff *)skb_ctx;
> > > > +	struct sk_buff *nskb;
> > > > +	struct sock *sk;
> > > > +
> > > > +	sk = skb_to_full_sk(skb);
> > > > +	if (sk && sk->sk_kern_sock &&
> > > > +	    (sk->sk_protocol == IPPROTO_ICMP || sk->sk_protocol == IPPROTO_ICMPV6))
> > > > +		return -EBUSY;
> > > > +
> > > > +	switch (skb->protocol) {
> > > > +#if IS_ENABLED(CONFIG_INET)
> > > > +	case htons(ETH_P_IP): {
> > > > +		if (type != ICMP_DEST_UNREACH)
> > > > +			return -EOPNOTSUPP;
> > > > +		if (code < 0 || code > NR_ICMP_UNREACH ||
> > > > +		    code == ICMP_FRAG_NEEDED) /* needs a valid next-hop MTU */
> > > > +			return -EINVAL;
> > > > +
> > > > +		/* icmp_send expects skb_dst to be a real rtable. */
> > > > +		if (!skb_valid_dst(skb))
> > > > +			return -ENETUNREACH;
> > > > +
> > > > +		nskb = skb_clone(skb, GFP_ATOMIC);
> > > > +		if (!nskb)
> > > > +			return -ENOMEM;
> > > > +
> > > > +		memset(IPCB(nskb), 0, sizeof(*IPCB(nskb)));
> > > > +		icmp_send(nskb, type, code, 0);
> > > > +		consume_skb(nskb);
> > > > +		break;
> > > > +	}
> > > > +#endif
> > > > +#if IS_ENABLED(CONFIG_IPV6)
> > > > +	case htons(ETH_P_IPV6):
> > > > +		if (type != ICMPV6_DEST_UNREACH)
> > > > +			return -EOPNOTSUPP;
> > > > +		if (code < 0 || code > ICMPV6_REJECT_ROUTE)
> > > > +			return -EINVAL;
> > > 
> > > [..]
> > > 
> > > > +		/* icmpv6_send may treat skb_dst as rt6_info. */
> > > > +		if (skb_metadata_dst(skb))
> > > > +			return -ENETUNREACH;
> > > 
> > > A bit confused about this. Which part of icmpv6_send treats skb_dst as rt6_info?
> > > (I see the original sashiko report about dst, but icmp6 seems to be not
> > > requiring it)
> > 
> > Yeah I was also a bit confused because this came out of nowhere as soon
> > as I put the skb_valid_dst only on the IPv4 path (for different
> > reasons), but there is actually a potential trace in which we have type
> > confusion indeed:
> > 
> > - icmp6_send() checks scoped source addresses and calls icmp6_iif() at net/ipv6/icmp.c:702
> > - icmp6_iif() calls icmp6_dev() at net/ipv6/icmp.c:441
> > - icmp6_dev() does skb_rt6_info(skb) for loopback/L3 master devices at net/ipv6/icmp.c:428
> > - skb_rt6_info() casts any non-NULL dst to struct rt6_info at include/net/ip6_route.h:233
> > - rt6->rt6i_idev is then dereferenced at net/ipv6/icmp.c:434
> > 
> > When checking with pahole, we can find this on my local kernel:
> > 
> > struct rt6_info {
> > 	struct dst_entry           dst;                  /*     0   136 */
> > 	/* --- cacheline 2 boundary (128 bytes) was 8 bytes ago --- */
> > 	struct fib6_info *         from;                 /*   136     8 */
> > 	int                        sernum;               /*   144     4 */
> > 	struct rt6key              rt6i_dst;             /*   148    20 */
> > 	struct rt6key              rt6i_src;             /*   168    20 */
> > 	struct in6_addr            rt6i_gateway;         /*   188    16 */
> > 
> > 	/* XXX 4 bytes hole, try to pack */
> > 
> > 	/* --- cacheline 3 boundary (192 bytes) was 16 bytes ago --- */
> > 	struct inet6_dev *         rt6i_idev;            /*   208     8 */  <--- we dereference this
> > 	u32                        rt6i_flags;           /*   216     4 */
> > 	short unsigned int         rt6i_nfheader_len;    /*   220     2 */
> > 
> > 	/* size: 224, cachelines: 4, members: 9 */
> > 	/* sum members: 218, holes: 1, sum holes: 4 */
> > 	/* padding: 2 */
> > 	/* last cacheline: 32 bytes */
> > };
> > 
> > And the metadata_dst would look like this:
> > 
> > struct metadata_dst {
> > 	struct dst_entry           dst;                  /*     0   136 */
> > 	/* --- cacheline 2 boundary (128 bytes) was 8 bytes ago --- */
> > 	enum metadata_type         type;                 /*   136     4 */
> > 
> > 	/* XXX 4 bytes hole, try to pack */
> > 
> > 	union {
> > 		struct ip_tunnel_info tun_info;          /*   144    96 */
> > 		struct hw_port_info port_info;           /*   144    16 */
> > 		struct macsec_info macsec_info;          /*   144     8 */
> > 		struct xfrm_md_info xfrm_info;           /*   144    16 */
> > 	} u;                                             /*   144    96 */  <--- we land on this union
> > 
> > 	/* size: 240, cachelines: 4, members: 3 */
> > 	/* sum members: 236, holes: 1, sum holes: 4 */
> > 	/* last cacheline: 48 bytes */
> > };
> > 
> > Let's say it's a struct ip_tunnel_info:
> > 
> > struct ip_tunnel_info {
> > 	struct ip_tunnel_key       key;                  /*     0    64 */
> > 
> > 	/* XXX last struct has 7 bytes of padding */
> > 
> > 	/* --- cacheline 1 boundary (64 bytes) --- */
> > 	struct ip_tunnel_encap     encap;                /*    64     8 */  <--- 144 + 64 = 208 we land here
> > 	struct dst_cache           dst_cache;            /*    72    16 */
> > 	u8                         options_len;          /*    88     1 */
> > 	u8                         mode;                 /*    89     1 */
> > 
> > 	/* size: 96, cachelines: 2, members: 5 */
> > 	/* padding: 6 */
> > 	/* paddings: 1, sum paddings: 7 */
> > 	/* last cacheline: 32 bytes */
> > };
> > 
> > So I imagine this is fairly tricky to trigger but still a case of type
> > confusion. I have actually no idea how likely this can happen from my
> > call but the trace makes sense at least.
> 
> That logic seems to exist for the icmp6_send to find the input device
> (since the expected use-case for calling icmp6_send is to the incoming
> skb). And since you're mainly doing egress, I don't think this path will
> ever trigger (iow the check is not needed)?
> 
> Maybe you can add cgroup_ingress test case? Looks like this rt6_info
> path might trigger for ipv6 lo? I don't see any ingress test in your
> series, so might be good to have one regardless?

The initial reason I added only egress is because the use case of this
makes more sense if that's your local kernel giving you feedback about a
connection you are trying to establish, as a process, but is prevented.

But indeed, I could extend the test to ingress as well; ideally I'd just
like to get an ack from the networking maintainers before making new
changes, since this is already v10.


^ permalink raw reply

* Re: [PATCH net 1/1] tcp: Require init_net CAP_NET_ADMIN for tcp_child_ehash_entries
From: Eric Dumazet @ 2026-06-29 10:41 UTC (permalink / raw)
  To: Ren Wei
  Cc: netdev, davem, pabeni, horms, chia-yu.chang, ij, idosch, fmancera,
	bronzed_45_vested, yuuchihsu, kuniyu, yuantan098, yifanwucs,
	tomapufckgml, bird, roxy520tt
In-Reply-To: <012fba43272abc560acfc0fa37ae22182a60b457.1782641525.git.roxy520tt@gmail.com>

On Sun, Jun 28, 2026 at 4:38 AM Ren Wei <n05ec@lzu.edu.cn> wrote:
>
> From: Zhiling Zou <roxy520tt@gmail.com>
>
> tcp_child_ehash_entries controls the size of the private TCP established
> hash table allocated for subsequently created child network namespaces.
> The value is consumed during child netns creation by tcp_set_hashinfo()
> and passed to inet_pernet_hashinfo_alloc(), which can allocate a large
> per-netns ehash.
>
> The sysctl is writable in each network namespace, and net sysctl
> permissions allow a task with CAP_NET_ADMIN in the namespace's owning
> user namespace to write it.  An unprivileged user can therefore create a
> user and network namespace, set tcp_child_ehash_entries to its maximum
> value, and repeatedly create nested network namespaces to force large
> kernel allocations and exhaust host memory.
>
> Require CAP_NET_ADMIN in the initial user namespace before accepting
> writes to tcp_child_ehash_entries.  This keeps the tuning knob available
> to the host administrator while preventing unprivileged user namespaces
> from using it to drive host-wide memory consumption.

I do not think this patch is desirable.
It breaks nested container use cases.
A container runtime running inside a container (with namespace-local
CAP_NET_ADMIN but not global)
would no longer be able to tune tcp_child_ehash_entries for its own
nested child namespaces.

inet_pernet_hashinfo_alloc() uses GFP_KERNEL_ACCOUNT for a reason, I
suggest you start using memcg :)

Keep in mind the sysctl could be set (by root) in init_net for some
reason, so only memcg will protect against OOM.

^ permalink raw reply

* Re: [PATCH net 1/3] net: Extend bpf_net_context lifetime to cover qdisc enqueue
From: Jamal Hadi Salim @ 2026-06-29 10:47 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: netdev, bpf, davem, edumazet, kuba, pabeni, horms, toke, jiri,
	clrkwllms, rostedt, kuniyu, sdf.kernel, skhawaja, liuhangbin,
	krikku, mkarsten, victor, ast, hawk, john.fastabend, daniel,
	Sashiko
In-Reply-To: <20260629102917.Ag2Vd7LR@linutronix.de>

 -

On Mon, Jun 29, 2026 at 6:29 AM Sebastian Andrzej Siewior
<bigeasy@linutronix.de> wrote:
>
> On 2026-06-26 12:51:54 [-0400], Jamal Hadi Salim wrote:
> > The bpf_net_context used by sch_handle_egress() is stack-allocated and torn
> > down in that function returned. By the time tcf_qevent_handle() runs
> > current->bpf_net_context is NULL.
> >
> > When a filter attached to a qevent block (e.g. RED's early_drop or mark
> > qevents, which always use shared blocks) returns TC_ACT_REDIRECT,
> > tcf_qevent_handle() calls skb_do_redirect(), which in turn calls bpf helper
> > bpf_net_ctx_get_ri(). That helper unconditionally dereferences
> > current->bpf_net_context resulting in a NULL pointer dereference.
> >
> > Note: The same holds for actions that invoke BPF redirect helpers
> > (e.g. act_bpf running a program that calls bpf_redirect()) during qevent
> > classification itself. And as a matter of fact the same assumption is
> > made in the code outside of tc.
> >
> > Fix:
> > Move the bpf_net_context lifecycle out of sch_handle_egress() into
> > __dev_queue_xmit(), so that it spans both the egress TC fast path and the
> > qdisc enqueue. The setup is placed outside the egress_needed_key static
> > branch because qevents are independent of clsact/NF egress hooks and
> > that key may stay disabled when only a qevent-bearing qdisc is
> > configured. Unfortunately this adds a small unconditional penalty to the
> > code path _per packet_ only guarded by CONFIG_NET_XGRESS (two writes and
> > one read for bpf_net_ctx_set, plus one write for bpf_net_ctx_clear).
>
> I fail to understand this but you and sashiko have an understanding...
> If there is TC_ACT_REDIRECT returned by tc_run(), then the skb is NULL
> and as such uppon return from sch_handle_egress() the control flow goes
> to the out label.
> As a fix you move the bpf_net_ctx assigned to before CONFIG_NET_EGRESS
> and clear it on exit. What do I miss here?
>

There are 2 separate filters.
IIUC, you are thinking of the first one, which is the clsact egress
classifier (which runs in sch_handle_egress()) - its redirect would
indeed return NULL and skip qdisc enqueue.
The second one is the qevent redirect, which happens in
tcf_qevent_handle() during qdisc enqueue (block 10 in the reproducer).


> > This keeps all bpf_net_context management in net/core/dev.c i.e the
> > existing boundary between tc core and BPF without requiring any net/sched/
> > code to know about BPF plumbing.
> >
> > Reproducer (see the accompanying tdc test):
> >
> >   tc qdisc add dev eth0 root handle 1: red limit 1MB min 10KB max 20KB \
> >       avpkt 1000 burst 100 qevent early_drop block 10
> >   tc qdisc add dev eth0 clsact
> >   tc filter add block 10 pref 1 bpf obj redirect.o
>
> stupid question: how do I get this redirect.o? Just a simply thing to
> reproduce this…
>

It's just pseudo code for a bpf prog that redirects (so you can probably
write it as a few-line bpf prog).
Take a look at patch 3 which uses a prebuilt action-ebpf binary with
the action-redirect section (added by patch 3 to action.c).
If it's still not clear, I can craft one and send it to you.

cheers,
jamal

> >   tc filter add dev eth0 egress protocol ip prio 1 matchall \
> >       action gact pass
> >
> >   traffic through eth0 triggers red_enqueue() -> tcf_qevent_handle() and,
> >   on a redirect verdict, a NULL deref in skb_do_redirect().
>
> Sebastian

^ permalink raw reply

* Re: [PATCH net 1/1] tcp: Require init_net CAP_NET_ADMIN for tcp_child_ehash_entries
From: tt roxy @ 2026-06-29 10:51 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Ren Wei, netdev, davem, pabeni, horms, chia-yu.chang, ij, idosch,
	fmancera, bronzed_45_vested, yuuchihsu, kuniyu, yuantan098,
	yifanwucs, tomapufckgml, bird
In-Reply-To: <CANn89i+sq29-PbxDrCdUJk405Bs0759wXLBNhpGqE7TNTEPraQ@mail.gmail.com>

On Mon, Jun 29, 2026 at 6:41 PM Eric Dumazet <edumazet@google.com> wrote:
>
> On Sun, Jun 28, 2026 at 4:38 AM Ren Wei <n05ec@lzu.edu.cn> wrote:
> >
> > From: Zhiling Zou <roxy520tt@gmail.com>
> >
> > tcp_child_ehash_entries controls the size of the private TCP established
> > hash table allocated for subsequently created child network namespaces.
> > The value is consumed during child netns creation by tcp_set_hashinfo()
> > and passed to inet_pernet_hashinfo_alloc(), which can allocate a large
> > per-netns ehash.
> >
> > The sysctl is writable in each network namespace, and net sysctl
> > permissions allow a task with CAP_NET_ADMIN in the namespace's owning
> > user namespace to write it.  An unprivileged user can therefore create a
> > user and network namespace, set tcp_child_ehash_entries to its maximum
> > value, and repeatedly create nested network namespaces to force large
> > kernel allocations and exhaust host memory.
> >
> > Require CAP_NET_ADMIN in the initial user namespace before accepting
> > writes to tcp_child_ehash_entries.  This keeps the tuning knob available
> > to the host administrator while preventing unprivileged user namespaces
> > from using it to drive host-wide memory consumption.
>
> I do not think this patch is desirable.
> It breaks nested container use cases.
> A container runtime running inside a container (with namespace-local
> CAP_NET_ADMIN but not global)
> would no longer be able to tune tcp_child_ehash_entries for its own
> nested child namespaces.
>
> inet_pernet_hashinfo_alloc() uses GFP_KERNEL_ACCOUNT for a reason, I
> suggest you start using memcg :)
>
> Keep in mind the sysctl could be set (by root) in init_net for some
> reason, so only memcg will protect against OOM.

Thanks for the review.

Agreed, restricting writes to init_user_ns CAP_NET_ADMIN is too broad and
would break nested container use cases. We will drop this approach.

We will re-check the memcg accounting/fallback behavior for the per-netns
ehash allocation and prepare a different fix if there is still a path to
host-wide OOM outside the intended memcg limits.

^ permalink raw reply

* Re: [PATCH net] net: clear transport header during tunnel decapsulation
From: Paolo Abeni @ 2026-06-29 10:53 UTC (permalink / raw)
  To: Eric Dumazet, David S . Miller, Jakub Kicinski
  Cc: Simon Horman, Ido Schimmel, David Ahern, netdev, eric.dumazet,
	syzbot+d5d0d598a4cfdfafdc3b
In-Reply-To: <20260624073209.3703492-1-edumazet@google.com>

On 6/24/26 9:32 AM, Eric Dumazet wrote:
> diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
> index d3c677e9bff2080e4760347a3d873da4e83ac3ca..59192f58da2e3aae19d00505cc3bb04b083b77c5 100644
> --- a/net/ipv4/ip_tunnel_core.c
> +++ b/net/ipv4/ip_tunnel_core.c
> @@ -134,6 +134,7 @@ int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len,
>  	__vlan_hwaccel_clear_tag(skb);
>  	skb_set_queue_mapping(skb, 0);
>  	skb_scrub_packet(skb, xnet);
> +	skb_unset_transport_header(skb);


In geneve_udp_encap_recv() the above is called a few lines before:

	geneveh = geneve_hdr(skb);

which in turn accesses indirectly the transport header via udp_hdr().

Also AFAICS even vxlan uses __iptunnel_pull_header() in the receive
path, possibly no additional unset needed in such driver.

Side note: it would be helpful if the syzbot CI reports could include
the fully decoded stack trace.

/P


^ permalink raw reply

* Re: [PATCH net-next v5 1/4] dpll: add DPLL_PIN_TYPE_INT_NCO pin type
From: Vadim Fedorenko @ 2026-06-29 10:53 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Ivan Vecera, Jakub Kicinski, Arkadiusz Kubalewski, netdev,
	Jiri Pirko, David S. Miller, Donald Hunter, Eric Dumazet,
	Michal Schmidt, Paolo Abeni, Pasi Vaananen, Petr Oros,
	Prathosh Satish, Simon Horman, linux-kernel, Grzegorz Nitka
In-Reply-To: <aiftnkuT9IP31qUm@FV6GYCPJ69>

On 09/06/2026 12:06, Jiri Pirko wrote:
> Mon, Jun 08, 2026 at 01:33:56PM +0200, vadim.fedorenko@linux.dev wrote:
>> On 04/06/2026 17:42, Ivan Vecera wrote:
>>> On 6/4/26 5:16 PM, Jakub Kicinski wrote:
>>>> On Thu, 4 Jun 2026 17:01:36 +0200 Ivan Vecera wrote:
>>>>>> Purely going on intuition here but feels like NCO should be a mode
>>>>>> (enum dpll_mode) rather than one of the input pins?
>>>>>>
>>>>>> More acks here would be great, Vadim, Arkadiusz, Grzegorz... ?
>>>>>
>>>>> I had a long discussion with Jiri about this and we agreed finally
>>>>> that dpll_mode represents a reference (input pin) selection strategy
>>>>> mode and not a DPLL device running mode.
>>>>
>>>> Long discussion? I see 2 emails ;) Let's hear from others.
>>>> (thanks for the link BTW, _if_ there's a v6 please put it in the cover
>>>> letter)
>>>
>>> I called him... he explained me 'why?' in detail.
>>> I also appreciate others' opinion.
>>
>> Well, NCO mode means manual operation of frequency tuning. Does it mean
>> that different tunings may be applied to different out pins of DPLL
>> device? My assumption that it's not possible, and in this case NCO is
>> property/mode of DPLL device rather than single pin.
>>
>> @Jiri could you please share your detailed explanation on "why"?
> 
> Since the "why a pin and not a new dpll_mode?" question keeps coming up,
> let me try to describe why I believe that modelling NCO as an input pin
> (DPLL_PIN_TYPE_INT_NCO) is the right thing to do.
> 
> In the DPLL UAPI, 'mode' only describes the *input selection policy*:
> MANUAL means userspace picks which input the loop locks to, AUTOMATIC
> means the DPLL auto-selects the highest-priority input. I know there was
> some fuzz about this semantics in the early stages of upstreaming DPLL
> subsystem, but eventually this became very clear both in code and in
> kdoc:
> 
> <quote>
>   * enum dpll_mode - working modes a dpll can support, differentiates if and how
>   *   dpll selects one of its inputs to syntonize with it, valid values for
>   *   DPLL_A_MODE attribute
> </quote>
> 
> NCO *is not* a third selection policy - it is just another *source* the
> loop is disciplined from. Except the source is steered by the host
> (via the PHC .adjfine() path) instead of being an external reference.
> Think of it as a virtual pin of some sort.
> 
> The object we already use for "a source the DPLL can lock to" is a pin,
> so an internal NCO belongs right next to DPLL_PIN_TYPE_INT_OSCILLATOR,
> which is already existing example of a similar virtual pin.
> 
> By having NCO as an input pin we reuse the existing model instead of
> inventing a parallel one. "Run as NCO" becomes "connect the NCO input"
> using the same connect/disconnect, pin state and pin-dump infrastructure
> as any other input. No new control surface, and it stays orthogonal to
> mode: we don't have to define what AUTOMATIC+NCO or pin priorities
> mean, and we don't grow enum dpll_mode and the supported-modes
> bitmask that every mode-aware consumer would then have to relearn.
> 
> For the pin info uAPI exposure we reuse the attributes pins already have
> - the output frequency offset from nominal is reported via the pin's
> fractional-frequency-offset / -ppt. A new device mode would need
> brand new device-level attributes for the same information.
> 
> Having said that, I think it's a perfect fit. The only "real" pull
> towards a new mode is that vendor datasheets call this NCO/DCO a "mode".
> But that is HW register terminology and we learned many times in
> the past that may be more or less misleading/incorrect wrt the uAPI.
> 
> Therefore my strong preference is DPLL_PIN_TYPE_INT_NCO, no new mode.
> Honestly, I don't really understand why it would make even little sense
> to have this as new mode. Perhaps I'm missing something, if you can
> describe it, that would be awesome.

Ok, I see your point. Even though the pin UAPI fits the model, I still
have some concerns:

1. I cannot really imagine AUTOMATIC mode selecting NCO pin by priority
in case other pins are gone somehow. It doesn't make sense without
steering SW running on the host. And the other way around - switching to
a higher priority pin while SW keeps "steering" the DPLL. But looks like
we have discussed it in the other thread. Adding DPLL mode restrictions
based on pin selection/connection breaks the model, I think...

2. SW steering cannot be pure SW. Every disciplining algorithm relies on
measurements, the product of phase comparators. That technically means
the device has to have other inputs configured as monitors, which can be
configured in AUTO mode with priorities. How will we model it then?

Thanks,
Vadim


^ permalink raw reply

* Re: [PATCH net 1/1] tcp: bound SYN-ACK timers to reqsk timeout range
From: Eric Dumazet @ 2026-06-29 11:00 UTC (permalink / raw)
  To: Ren Wei
  Cc: netdev, ncardwell, kuniyu, davem, pabeni, horms, chia-yu.chang,
	ij, bronzed_45_vested, yuuchihsu, idosch, yuantan098, yifanwucs,
	tomapufckgml, bird, roxy520tt
In-Reply-To: <02e24eb83639e9d7ecc623f000c60254bb5c40a5.1782643946.git.roxy520tt@gmail.com>

On Sun, Jun 28, 2026 at 4:43 AM Ren Wei <n05ec@lzu.edu.cn> wrote:
>
> From: Zhiling Zou <roxy520tt@gmail.com>
>

...

> -       max_retries = READ_ONCE(icsk->icsk_syn_retries) ? :
> -               READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_synack_retries) + 1;
> +       max_retries = READ_ONCE(icsk->icsk_syn_retries);
> +       if (!max_retries) {
> +               max_retries = READ_ONCE(net->ipv4.sysctl_tcp_synack_retries);
> +               max_retries++;
> +       }

Please do not change this part which looks good, let's avoid code churn.

^ permalink raw reply

* Re: [PATCH] mptcp: only honor zero-length DATA_FIN when a mapping is present
From: Michael Bommarito @ 2026-06-29 11:00 UTC (permalink / raw)
  To: Paolo Abeni
  Cc: Matthieu Baerts, Mat Martineau, Geliang Tang, Eric Dumazet,
	Jakub Kicinski, mptcp, netdev, linux-kernel
In-Reply-To: <3ad5bba8-18b9-48a6-94e0-99d958f23984@redhat.com>

On Mon, Jun 29, 2026 at 5:50 AM Paolo Abeni <pabeni@redhat.com> wrote:
> Isn't this fixed by commit 5e939544f9d2 ("mptcp: fix uninit-value in
> mptcp_established_options") ?

I did the reproduction ~10 days ago on linus's latest, so definitely
still reproducing.  I think 5e939544f9d2 was on the TX side and this
is about the RX option path, so they don't overlap on flows either.

Thanks,
Mike

^ permalink raw reply

* Re: [PATCH net v4] net/mlx5e: macsec: fix use-after-free of metadata_dst on RX SC delete
From: Tariq Toukan @ 2026-06-29 11:04 UTC (permalink / raw)
  To: Doruk Tan Ozturk, saeedm, leon, tariqt, mbloch, sd, andrew+netdev,
	davem, edumazet, kuba, pabeni
  Cc: horms, borisp, raeds, ehakim, netdev, linux-rdma, linux-kernel,
	stable
In-Reply-To: <20260627223059.29917-1-doruk@0sec.ai>



On 28/06/2026 1:30, Doruk Tan Ozturk wrote:
> When an offloaded MACsec RX SC is deleted, macsec_del_rxsc_ctx() freed
> the per-SC metadata_dst with metadata_dst_free(), which kfree()s the
> object unconditionally and ignores the dst reference count. The RX
> datapath in mlx5e_macsec_offload_handle_rx_skb() looks up the SC under
> rcu_read_lock() via xa_load(), takes a reference with dst_hold() and
> attaches the dst to the skb with skb_dst_set(). A reader that already
> obtained the rx_sc pointer can race with the delete path and operate on
> freed memory.
> 
> Fix the owner side by dropping the reference with dst_release() instead
> of freeing unconditionally, and convert the RX datapath to
> dst_hold_safe() so a reader racing the SC delete cannot attach a dst
> whose last reference was just dropped; only attach it when a reference
> was actually taken.
> 
> mlx5e_macsec_add_rxsc() also published sc_xarray_element via xa_alloc()
> before rx_sc->md_dst was allocated and initialised, so a datapath reader
> that looked the SC up by fs_id could observe rx_sc with md_dst still
> NULL or, on weakly-ordered architectures, a non-NULL md_dst pointer
> whose contents were not yet visible. NULL-check the xa_load() result and
> md_dst on the datapath, and reorder add_rxsc() so the xa_alloc() publish
> happens only after md_dst is fully initialised; the xarray RCU publish
> then pairs with the rcu_read_lock()/xa_load() in the datapath.
> 
> Note: macsec_del_rxsc_ctx() also kfree()s rx_sc->sc_xarray_element
> without an RCU grace period while the same datapath reads it under
> rcu_read_lock(); that is a separate pre-existing issue left to a
> follow-up patch.
> 
> Found by 0sec automated security-research tooling (https://0sec.ai).
> 
> Fixes: b7c9400cbc48 ("net/mlx5e: Implement MACsec Rx data path using MACsec skb_metadata_dst")
> Cc: stable@vger.kernel.org
> Signed-off-by: Doruk Tan Ozturk <doruk@0sec.ai>
> ---

Reviewed-by: Tariq Toukan <tariqt@nvidia.com>

Thanks.

^ permalink raw reply

* Re: [RFC net-next 08/15] ipxlat: add translation engine and dispatch core
From: Toke Høiland-Jørgensen @ 2026-06-29 11:08 UTC (permalink / raw)
  To: Ralf Lici
  Cc: netdev, Daniel Gröber, Antonio Quartulli, Andrew Lunn,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	linux-kernel, Pablo Neira Ayuso, Florian Westphal, Phil Sutter,
	Beniamino Galvani
In-Reply-To: <20260624161854.686569-1-ralf@mandelbit.com>

Ralf Lici <ralf@mandelbit.com> writes:

> On Tue, 23 Jun 2026 21:59:44 +0200, Toke Høiland-Jørgensen <toke@kernel.org> wrote:
>> Ralf Lici <ralf@mandelbit.com> writes:
>> > On the BPF point specifically: I agree a BPF program should be able to
>> > decide whether to translate. What I am less sure about is whether
>> > redirecting to a netdevice is the best way to expose that. A TC action
>> > (yet another model, I know :)) gives you the same thing in-pipeline and
>> > more directly:
>> >
>> >     tc filter add dev wwan0 egress \
>> >         bpf obj match.o action ipxlat4to6 domain clat0
>> >
>> > Let BPF make the policy decision, with the native action doing the
>> > translation work that the current BPF CLAT implementations have trouble
>> > with: fragmentation, checksum corner cases, and ICMP error inner
>> > headers (as explained by Beniamino).
>> >
>> > So TC clsact looks like the natural in-kernel replacement for today's
>> > TC-BPF CLAT programs: no extra netdev, you attach to the existing
>> > uplink, direction is explicit, and on egress you sit on the real route
>> > dst, so the synthetic-dst and double-routing problems above just don't
>> > arise. The cost is more moving parts than a single bpf_redirect since
>> > userspace has to manage clsact, filters, priorities and action
>> > lifecycle/cleanup.
>>
>> Hmm, so no one really uses the bpf filter mechanism, since you can just
>> do everything from an action anyway (and with TCX attachment, you can
>> even avoid the overhead of the TC filter/action infrastructure
>> entirely). However, point taken wrt how to integrate this with BPF. I
>> guess the most flexible thing would be to expose the functionality
>> directly (as a kfunc callable from a BPF program). Which also fits with
>> your point below:
>>
>
> Ah, I see, the cls_bpf example was dated, and I like the kfunc angle
> better than a new TC action.
>
> I would probably keep that as the minimal per-packet interface: BPF can
> decide whether a packet should be translated, and the kfunc can do the
> actual translation work for packets whose translated form still fits the
> output MTU. The full 4->6 fragmentation case still looks like
> output-path/harness territory to me, since it is a 1->N fan-out
> operation.

Yeah, that would probably be fine; I would expect that in most cases
you'd want to configure your MTU to avoid fragmentation anyway :)

>> > For a gateway translator, though, I still think a device-bound model is
>> > less natural. There the translation point is more like a forwarding
>> > decision across routes and nexthops, so a route/LWT attachment, or
>> > possibly a netfilter attachment seems easier to reason about. Also, as
>> > you already pointed out while discussing LWT, an admin setting up NAT64
>> > is more likely to reach for an nft rule than for a clsact filter on a
>> > specific device.
>> >
>> > Taking a step back, ipxlat is really a generic translation engine plus a
>> > thin harness around it. So rather than pick one attachment, it might be
>> > worth structuring the engine so different harnesses can drive it.
>> > There's interesting precedent for this shape:
>> >
>> > - ILA, again, is the closest sibling: stateless IPv6 address translation
>> >   with a shared core in ila_common.c, driven both by an LWT frontend in
>> >   ila_lwt.c and by an inline netfilter hook with a netlink-configured
>> >   mapping table in ila_xlat.c.
>> >
>> > - act_ct is the precedent for the TC side specifically: a TC action that
>> >   reuses the netfilter conntrack engine rather than reimplementing it.
>> >
>> > And act_nat is the cautionary counter-example: a standalone TC
>> > reimplementation of stateless NAT that shares no code with nf_nat, and
>> > carries a "would be nice to share code" comment :)
>> >
>> > So I am wondering whether the right direction is to factor the
>> > translation engine cleanly, land it with one harness first, and keep the
>> > other attachment points as follow-up work once the core semantics are
>> > settled.
>> >
>> > Does that direction seem reasonable to you?
>>
>> Yes, reusable functionality that can be called from multiple places
>> sounds like a good fit; let's try to structure it that way!
>>
>
> Great, that's the direction I'll take then.
>
>> As for which hook to start with, well, let's see if we hear back from
>> the netfilter devs, but either netfilter or the routing subsystem (LWT
>> style) would be OK for me I think.
>>
>
> Works for me. The engine factoring is common to all of them, so I'll
> start there. Once it's in shape I can sketch a harness against it to
> sanity-check the interface.

Awesome, sounds good!

-Toke

^ permalink raw reply

* Re: [PATCH] net: stmmac: fix missed le32_to_cpu()
From: Ben Dooks @ 2026-06-29 11:11 UTC (permalink / raw)
  To: Maxime Chevallier, Jakub Kicinski
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Paolo Abeni,
	Maxime Coquelin, Alexandre Torgue, Russell King (Oracle), netdev,
	linux-stm32, linux-arm-kernel, linux-kernel
In-Reply-To: <2a92fd9d-42b3-4564-b784-ec504d4d82b8@bootlin.com>

On 25/06/2026 08:07, Maxime Chevallier wrote:
> 
> 
> On 6/25/26 04:22, Jakub Kicinski wrote:
>> On Mon, 22 Jun 2026 19:51:39 +0200 Maxime Chevallier wrote:
>>> Hi Ben,
>>>
>>> On 6/22/26 16:37, Ben Dooks wrote:
>>>> The print in ndesc_display_ring() sends the des2 and des3
>>>> to the pr_info() without passing them through the relevant
>>>> conversion to cpu order.
>>>>
>>>> Fix the (prototype) sparse warnings by using le32_to_cpu():
>>>> drivers/net/ethernet/stmicro/stmmac/norm_desc.c:258:17: warning: incorrect type in argument 6 (different base types)
>>>> drivers/net/ethernet/stmicro/stmmac/norm_desc.c:258:17:    expected unsigned int
>>>> drivers/net/ethernet/stmicro/stmmac/norm_desc.c:258:17:    got restricted __le32 [usertype] des2
>>>> drivers/net/ethernet/stmicro/stmmac/norm_desc.c:258:17: warning: incorrect type in argument 7 (different base types)
>>>> drivers/net/ethernet/stmicro/stmmac/norm_desc.c:258:17:    expected unsigned int
>>>> drivers/net/ethernet/stmicro/stmmac/norm_desc.c:258:17:    got restricted __le32 [usertype] des3
>>>>
>>>> Signed-off-by: Ben Dooks <ben.dooks@codethink.co.uk>
>>>
>>> I agree on the principle, but this isn't a fix so this'll have to wait
>>> until net-next re-opens :)
>>
>> Humpf, why are we not seeing this on x86 allmodconfig ? 🤔️
>>
>> $ make C=1 W=1 drivers/net/ethernet/stmicro/stmmac/norm_desc.o
>>    DESCEND objtool
>>    CC [M]  drivers/net/ethernet/stmicro/stmmac/norm_desc.o
>>    CHECK   drivers/net/ethernet/stmicro/stmmac/norm_desc.c
>> $
> 
> Heh good point indeed !
>    
>>>> Fix the (prototype) sparse warnings by using le32_to_cpu():
> 
> Ben, what's this "prototype" sparse ? a custom tool of yours that
> you used to find that ?

I have an RFC to add variadic and thus also printf/scanf formatting
to sparse. This is waiting on review after the original got re-worked
to add scanf and a few other bug-fixes and shuffles.

Ref: https://marc.info/?l=linux-sparse&m=178185274600679&w=2


-- 
Ben Dooks				http://www.codethink.co.uk/
Senior Engineer				Codethink - Providing Genius

https://www.codethink.co.uk/privacy.html

^ permalink raw reply

* Re: [PATCH v3] xsk: fix memory corruptions in net/core/xdp.c
From: Clement Lecigne @ 2026-06-29 11:15 UTC (permalink / raw)
  To: Fijalkowski, Maciej
  Cc: Lobakin, Aleksander, edumazet@google.com, netdev@vger.kernel.org,
	bpf@vger.kernel.org, linux-kernel@vger.kernel.org,
	kuba@kernel.org, sdf@fomichev.me, horms@kernel.org,
	john.fastabend@gmail.com, ast@kernel.org, daniel@iogearbox.net
In-Reply-To: <DM4SPRMB00455DAA85BD5AEE9784C6EB82E82@DM4SPRMB0045.namprd11.prod.outlook.com>

[-- Attachment #1: Type: text/plain, Size: 5314 bytes --]

On Mon, Jun 29, 2026 at 12:34 PM Fijalkowski, Maciej
<maciej.fijalkowski@intel.com> wrote:
>
> >
> > From: Clément Lecigne <clecigne@google.com>
> >
> > Commit 560d958c6c68 ("xsk: add generic XSk &xdp_buff -> skb conversion")
> > introduced a vulnerability in the handling of XDP_PASS for AF_XDP zero-copy
> > frames.
> >
> > Note: Currently, this specific AF_XDP zero-copy conversion path is only
> > reachable from the drivers/net/ethernet/intel/ice and
> > drivers/net/ethernet/intel/idpf drivers.
> >
> > When building an skb, xdp_build_skb_from_zc() uses the chunk size
> > (xdp->frame_sz) for the allocation. However, napi_build_skb() automatically
> > reserves space at the end of the allocation for the skb_shared_info
> > structure.
> >
> > Most high performance UMEM applications use 4K chunks, where the
> > corruption cannot happen. However, if the UMEM is configured with 2KB
> > chunks (a very common configuration to maximize packet density in memory),
> > a standard 1500 MTU packet will trigger the corruption because the required
> > space exceeds the 2048 byte chunk size:
> >
> > Headroom (256) + Packet (1514) + skb_shared_info (320) = 2090 bytes
> >
> > Because 2090 bytes > 2048 bytes and __skb_put() does not perform bounds
> > checking, the memcpy() writes past the available linear data area and
> > corrupts the skb_shared_info structure. This can lead to arbitrary code
> > execution if pointers like destructor_arg are overwritten.
> >
> > Additionally, in xdp_copy_frags_from_zc(), the allocation size is set
> > strictly to the fragment size (len), but the subsequent memcpy() uses
> > LARGEST_ALIGN(len). This mismatch results in an out-of-bounds write of
> > up to 7 bytes, which triggers KASAN warnings and is unsafe despite typical
> > page pool allocator padding.
> >
> > Fix the skb allocation in xdp_build_skb_from_zc() by dynamically
> > calculating the exact truesize required using SKB_HEAD_ALIGN() to
> > properly account for the headroom, the LARGEST_ALIGN(len), and the
> > skb_shared_info overhead.
> >
> > Fix the out-of-bounds write in xdp_copy_frags_from_zc() by rounding up
> > the allocation request using LARGEST_ALIGN(len) to match the copy
> > operation.
> >
> > Fixes: 560d958c6c68 ("xsk: add generic XSk &xdp_buff -> skb conversion")
> > CC: Alexander Lobakin <aleksander.lobakin@intel.com>
> > CC: Eric Dumazet <edumazet@google.com>
> > Signed-off-by: Clément Lecigne <clecigne@google.com>
>
> Hi Clement,
>
> Do you have a reproducer for mentioned issue or is it only a fix from
> theoretical POV?
>
> To be clear, we were addressing headroom issues in this series:
> https://lore.kernel.org/bpf/20260402154958.562179-1-maciej.fijalkowski@intel.com/
>
> so I wanted to ask if you are able to have this malformed setup for
> 2k chunk size. That series should not allow for that.

I didn't manage to build a malformed setup and only used a LKM to reproduce
the issue artificially. I shared some more details with you privately.

Thanks,
-clem

>
> I think this is the second time someone is trying to fix this area of code,
> so it is not a nack or something, let us fix this, but I wanted to have
> us on the same page.
>
> Thanks,
> Maciej
>
> > ---
> > Changes since v2:
> >  - Used LARGEST_ALIGN to calculate the len to account for the aligned
> > memcpy.
> >  - Fixed the commit message to include the idpf driver.
> >
> > Changes since v1:
> >  - Used SKB_HEAD_ALIGN to properly calculate the required allocation size
> >    including the skb_shared_info overhead.
> >  - Re-ordered variable declarations.
> >
> > ---
> > diff --git a/net/core/xdp.c b/net/core/xdp.c
> > index 9890a30584ba..7e39f17ad407 100644
> > --- a/net/core/xdp.c
> > +++ b/net/core/xdp.c
> > @@ -698,8 +698,8 @@ static noinline bool xdp_copy_frags_from_zc(struct
> > sk_buff *skb,
> >
> >       for (u32 i = 0; i < nr_frags; i++) {
> >               const skb_frag_t *frag = &xinfo->frags[i];
> > -             u32 len = skb_frag_size(frag);
> > -             u32 offset, truesize = len;
> > +             u32 offset, len = skb_frag_size(frag);
> > +             u32 truesize = LARGEST_ALIGN(len);
> >               struct page *page;
> >
> >               page = page_pool_dev_alloc(pp, &offset, &truesize);
> > @@ -738,9 +738,10 @@ static noinline bool xdp_copy_frags_from_zc(struct
> > sk_buff *skb,
> >   */
> >  struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp)
> >  {
> > +     u32 headroom = xdp->data_meta - xdp->data_hard_start;
> >       const struct xdp_rxq_info *rxq = xdp->rxq;
> > -     u32 len = xdp->data_end - xdp->data_meta;
> > -     u32 truesize = xdp->frame_sz;
> > +     u32 len = LARGEST_ALIGN(xdp->data_end - xdp->data_meta);
> > +     u32 truesize = SKB_HEAD_ALIGN(headroom + len);
> >       struct sk_buff *skb = NULL;
> >       struct page_pool *pp;
> >       int metalen;
> > @@ -762,7 +763,7 @@ struct sk_buff *xdp_build_skb_from_zc(struct
> > xdp_buff *xdp)
> >       }
> >
> >       skb_mark_for_recycle(skb);
> > -     skb_reserve(skb, xdp->data_meta - xdp->data_hard_start);
> > +     skb_reserve(skb, headroom);
> >
> >       memcpy(__skb_put(skb, len), xdp->data_meta,
> > LARGEST_ALIGN(len));
> >
>

[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/pkcs7-signature, Size: 5281 bytes --]

^ permalink raw reply

* Re: [RFC PATCH net-next] netpoll: hold RCU while walking napi_list
From: Breno Leitao @ 2026-06-29 11:17 UTC (permalink / raw)
  To: Runyu Xiao
  Cc: Jakub Kicinski, davem, edumazet, pabeni, horms, sashal, bigeasy,
	netdev, linux-kernel, jianhao.xu
In-Reply-To: <AFEAtADoKs22QyipHhKwe4op.3.1782623057057.Hmail.220255722@seu.edu.cn>

Hello,

On Sun, Jun 28, 2026 at 01:04:17PM +0800, Runyu Xiao wrote:
> Hi,
> 
> On Sat, 27 Jun 2026 14:21:05 -0700 Jakub Kicinski wrote:
> > Please provide the stack trace from the report, rather than just saying
> > that you can trigger it.

I am really surprised to see this warning. I've been running this code with
CONFIG_PROVE_RCU_LIST for ages, and I haven't seen anything similar.

> Sure, sorry for not including it in the RFC.  The warning was from the
> reviewed reproducer used for the CONFIG_PROVE_RCU_LIST triage, not from
> a production crash.  The relevant part of the dmesg is:

Reading it, it does not come from the kernel's netpoll code at
all -- it comes from an out-of-tree module (!?)

>   WARNING: suspicious RCU usage
>   6.1.66 #3 Tainted: G           O
>   -----------------------------
>   /home/ubuntu22/msv_workspace/shared/vuln_msv.c:45 RCU-list traversed in non-reader section!!
> 
>   other info that might help us debug this:
> 
>   rcu_scheduler_active = 2, debug_locks = 1
>   no locks held by insmod/190.
> 
>   stack backtrace:
>   CPU: 1 PID: 190 Comm: insmod Tainted: G           O       6.1.66 #3

Have you tested it on a more modern kernel?

>   Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
>   Call Trace:
>    <task>
>    dump_stack_lvl+0x45/0x5d
>    lockdep_rcu_suspicious.cold+0x2d/0x64
>    poll_napi.constprop.0+0x43/0x71 [vuln_msv]
>    netpoll_poll_dev.constprop.0+0x27/0x36 [vuln_msv]
>    ? 0xffffffffc0005000
>    rcu_list_msv_init+0xe2/0x1000 [vuln_msv]

What is `vuln_msv` exactly?

Could you reproduce this from an in-kernel path instead -- a real
netpoll/netconsole/bonding caller, with the frames resolving to the kernel
rather than [vuln_msv]?

Meanwhile, NAK until the above is clarified

--
pw-bot: rejected

^ permalink raw reply

* [PATCH net-next v4 00/13] dpaa2-switch: add support for LAG offload
From: Ioana Ciornei @ 2026-06-29 11:22 UTC (permalink / raw)
  To: andrew+netdev, davem, edumazet, kuba, pabeni, netdev; +Cc: linux-kernel

This patch set adds support in dpaa2-switch for offloading upper bond
devices.

The first two patches remove the necessity to hold rtnl_lock during the
event processing workqueue by ensuring that all events were processed
before any changes in FDB layout happen.

Patch #3 updates the logic around choosing the FDB that should be used
on a switch port. This is necessary since with the addition of the LAG
offload, we need to take into account all ports which are under the same
bridge, even though not directly.

The next four patches clean up the FDB event by making them easier to
integrate with bond devices and also add the
dpaa2_switch_port_to_bridge_port() helper to be used in the LAG offload
support.

The 8th patch adds the necessary new APIs for the LAG configuration
while the next one uses them, both in the prechangeupper phase and the
changeupper one. Which ports can be part of the same LAG group is
configurable at boot time, thus we use the prechangeupper callback in
order to validate that a requested configuration can be offloaded or
not.

This set also extends the handling of FDBs and port objects so that they
are handled by the driver even on an offloaded bond device.

Changes in v4:
- Moved and split some patches so that any preparatory work is being
  done before the driver offloads upper bond devices.
- Add a defensive check in dpaa2_switch_port_bond_leave() for a NULL
port_priv->lag
- Extend the dpaa2_switch_prevent_bridging_with_8021q_upper() function
so that we prevent a bond device with VLAN uppers joining a bridge.
The restriction is related to VLAN management in terms of the FDB which
can change upon a topology change. VLAN uppers can only be added once
the bridge topology is setup.
- Remove all FDB management from the bond join/leave paths. Decided to
reconfigure the FDB only on bridge join/leave since the FDB determines
the forwarding domain and when a bond is not bridged, from a
configuration standpoint, the individual lowers can be viewed as
standalone.
- Moved here the update to the dpaa2_switch_port_to_bridge_port()
function so that the LAG state is taken into account.
- Add a new per LAG field - primary - which is used to keep track of the
primary port of a LAG group instead of determining each time we need to
use it.
- Set 'skb->offload_fwd_mark' only when the port is under a bridge.
- Migrate FDBs in case the primary interface of a LAG changes.
- Use lag->primary instead of determining each time the primary
interface of a LAG device
- Link to v3: https://lore.kernel.org/all/20260603143623.3712024-1-ioana.ciornei@nxp.com/

Changes in v3:
- Add a check in dpsw_lag_set() for cfg->num_ifs against
DPSW_MAX_LAG_IFS
- Add kerneldoc for the dpsw_lag_cfg structure.
- Fix logic in prechangeupper callback in order to not call
dpaa2_switch_prechangeupper_sanity_checks() on !info->linking
- Fixed up the logic in the dpaa2_switch_port_bond_join()'s error path
so that the FDBs are cleaned-up properly and we do not end-up with FDB's
leaked, meaning that they could have been marked as in-use but actually
no port was using it.
- Mark the port_priv->lag field as __rcu and use the proper accessors for
it. This will eventually become useful in a later patch when the lag
field will be accessed concurrently from the NAPI context and the
join/leave paths
- Access lag field through rtnl_dereference() so that we adapt to the
__rcu change.
- Check that the brport is non-NULL before calling
switchdev_bridge_port_unoffload() on it.
- Get hold on port_priv->ethsw_data only after we know the device is a
dpaa2-switch one
- Update dpaa2_switch_foreign_dev_check() so that we check if there is
any port in the same switch as dev which offloads foreign_dev in case
this is a bridge port.
- Add mutex_destroy on the per LAG fdb_lock
- Make sure that all FDB events were processed on the workqueue on the
.remove() path.
- Delete the refcounted entry in dpaa2_switch_lag_fdb_del() as soon as
possible, even if the HW deletion would fail
- Access the port_priv->lag field only through the proper rcu accessors.
- Change the mask so that we restrict the trap only to the link local
addresses (01:80:c2:00:00:00 to 01:80:c2:00:00:0F) instead of the entire
reserved bridge block of addresses
- Link to v2: https://lore.kernel.org/all/20260512131554.952971-1-ioana.ciornei@nxp.com/

Changes in v2:
- Extend dpaa2_switch_prechangeupper_sanity_checks() with
netdev_walk_all_lower_dev() so that checks are done on all lower devices
of a bridge, even for the lowers of a bridged bond.
- Manage better the default VLAN on bond join
- Clean-up the error path in dpaa2_switch_port_bond_join()
- Call dpaa2_switch_port_bridge_leave() in case a port is leaving a bond
which is also a bridged port
- Update dpaa2_switch_port_bond_leave() so that in case of any failure
the driver tries to cleanup the LAG offload configuration.
- Call switchdev_bridge_port_unoffload() in case a switch port is leaving a
bridge bond device.
- The rollback in dpaa2_switch_port_mdb_add() uses the newly introduced
dpaa2_switch_port_fdb_del() helper instead of the _mc counterpart.
- Update dpaa2_switch_foreign_dev_check() so that we check if between
the switch port and the foreign net_device is an offloaded path. Before
this change we also checked if the foreign_dev was offloaded or not by
the switch port.
- Update the switchdev_bridge_port_unoffload() by passing it the proper
context and the notifier blocks.
- Add dev_hold() and dev_put() calls for orig_dev
- In case dev_mc_add() fails, remove the MDB address from HW with the
proper function, dpaa2_switch_lag_fdb_del() or
dpaa2_switch_port_fdb_del(), depending on the LAG offload state.
- Fix 32bit build by using BIT_ULL
- Take a reference to port_priv->lag instead of reading it multiple
times.
- Link to v1: https://lore.kernel.org/all/20260506151540.1242997-1-ioana.ciornei@nxp.com/

Ioana Ciornei (13):
  dpaa2-switch: remove unnecessary dev_mc_add/dev_mc_del calls
  dpaa2-switch: avoid holding rtnl_lock in dpaa2_switch_event_work()
  dpaa2-switch: extend the FDB management to cover bond scenarios
  dpaa2-switch: create a separate dpaa2_switch_port_fdb_event() function
  dpaa2-switch: check early if an FDB entry should be added
  dpaa2-switch: add dpaa2_switch_port_to_bridge_port() helper
  dpaa2-switch: consolidate unicast and multicast management
  dpaa2-switch: add LAG configuration API
  dpaa2-switch: add support for LAG offload
  dpaa2-switch: offload FDBs added on an upper bond device
  dpaa2-switch: offload port objects on an upper bond device
  dpaa2-switch: trap all link local reserved addresses to the CPU
  dpaa2-switch: add support for imprecise source port

 .../ethernet/freescale/dpaa2/dpaa2-switch.c   | 931 +++++++++++++++---
 .../ethernet/freescale/dpaa2/dpaa2-switch.h   |  42 +-
 .../net/ethernet/freescale/dpaa2/dpsw-cmd.h   |  18 +-
 drivers/net/ethernet/freescale/dpaa2/dpsw.c   |  60 ++
 drivers/net/ethernet/freescale/dpaa2/dpsw.h   |  30 +
 5 files changed, 948 insertions(+), 133 deletions(-)

-- 
2.25.1


^ permalink raw reply

* [PATCH net-next v4 02/13] dpaa2-switch: avoid holding rtnl_lock in dpaa2_switch_event_work()
From: Ioana Ciornei @ 2026-06-29 11:22 UTC (permalink / raw)
  To: andrew+netdev, davem, edumazet, kuba, pabeni, netdev; +Cc: linux-kernel
In-Reply-To: <20260629112309.154328-1-ioana.ciornei@nxp.com>

The only reason why the rtnl_lock is held in the
dpaa2_switch_event_work() is so that there is no concurrency between the
changeupper notifier which manages the per port FDB assignment and the
workqueue which adds / deletes addresses into that forwarding database.

To avoid this kind of concurrency without a rtnl_lock, flush the event
workqueue as the last step from the pre_bridge_leave so that any
in-flight operations targeting the current FDB are finalized before the
bridge layout (and the per port FDB assignment) changes.

Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
---
Changes in v4:
- New patch.
---
 drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
index d70e6f06ac15..67c639fad0db 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
@@ -2069,7 +2069,15 @@ static int dpaa2_switch_port_restore_rxvlan(struct net_device *vdev, int vid, vo
 
 static void dpaa2_switch_port_pre_bridge_leave(struct net_device *netdev)
 {
+	struct ethsw_port_priv *port_priv = netdev_priv(netdev);
+	struct ethsw_core *ethsw = port_priv->ethsw_data;
+
 	switchdev_bridge_port_unoffload(netdev, NULL, NULL, NULL);
+
+	/* Make sure that any FDB add/del operations are completed before the
+	 * bridge layout changes
+	 */
+	flush_workqueue(ethsw->workqueue);
 }
 
 static int dpaa2_switch_port_bridge_leave(struct net_device *netdev)
@@ -2281,7 +2289,6 @@ static void dpaa2_switch_event_work(struct work_struct *work)
 	struct switchdev_notifier_fdb_info *fdb_info;
 	int err;
 
-	rtnl_lock();
 	fdb_info = &switchdev_work->fdb_info;
 
 	switch (switchdev_work->event) {
@@ -2310,7 +2317,6 @@ static void dpaa2_switch_event_work(struct work_struct *work)
 		break;
 	}
 
-	rtnl_unlock();
 	kfree(switchdev_work->fdb_info.addr);
 	kfree(switchdev_work);
 	dev_put(dev);
-- 
2.25.1


^ permalink raw reply related

* [PATCH net-next v4 01/13] dpaa2-switch: remove unnecessary dev_mc_add/dev_mc_del calls
From: Ioana Ciornei @ 2026-06-29 11:22 UTC (permalink / raw)
  To: andrew+netdev, davem, edumazet, kuba, pabeni, netdev; +Cc: linux-kernel
In-Reply-To: <20260629112309.154328-1-ioana.ciornei@nxp.com>

The DPSW object does not implement strict address filtering thus any
call to the dev_mc_add() / dev_mc_del() is pointless. Remove these calls
from the dpaa2_switch_port_mdb_add() and dpaa2_switch_port_mdb_del()
functions.

And since the multicast addresses no longer reach the netdev->mc list,
there is no point in keeping the dpaa2_switch_port_lookup_address()
function which searches through that list to verify if the same address
is added multiple times.

Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
---
Changes in v4:
- new patch
---
 .../ethernet/freescale/dpaa2/dpaa2-switch.c   | 50 +------------------
 1 file changed, 2 insertions(+), 48 deletions(-)

diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
index 858ba844ac51..d70e6f06ac15 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
@@ -1860,44 +1860,12 @@ int dpaa2_switch_port_vlans_add(struct net_device *netdev,
 					  vlan->changed);
 }
 
-static int dpaa2_switch_port_lookup_address(struct net_device *netdev, int is_uc,
-					    const unsigned char *addr)
-{
-	struct netdev_hw_addr_list *list = (is_uc) ? &netdev->uc : &netdev->mc;
-	struct netdev_hw_addr *ha;
-
-	netif_addr_lock_bh(netdev);
-	list_for_each_entry(ha, &list->list, list) {
-		if (ether_addr_equal(ha->addr, addr)) {
-			netif_addr_unlock_bh(netdev);
-			return 1;
-		}
-	}
-	netif_addr_unlock_bh(netdev);
-	return 0;
-}
-
 static int dpaa2_switch_port_mdb_add(struct net_device *netdev,
 				     const struct switchdev_obj_port_mdb *mdb)
 {
 	struct ethsw_port_priv *port_priv = netdev_priv(netdev);
-	int err;
-
-	/* Check if address is already set on this port */
-	if (dpaa2_switch_port_lookup_address(netdev, 0, mdb->addr))
-		return -EEXIST;
 
-	err = dpaa2_switch_port_fdb_add_mc(port_priv, mdb->addr);
-	if (err)
-		return err;
-
-	err = dev_mc_add(netdev, mdb->addr);
-	if (err) {
-		netdev_err(netdev, "dev_mc_add err %d\n", err);
-		dpaa2_switch_port_fdb_del_mc(port_priv, mdb->addr);
-	}
-
-	return err;
+	return dpaa2_switch_port_fdb_add_mc(port_priv, mdb->addr);
 }
 
 static int dpaa2_switch_port_obj_add(struct net_device *netdev,
@@ -2000,22 +1968,8 @@ static int dpaa2_switch_port_mdb_del(struct net_device *netdev,
 				     const struct switchdev_obj_port_mdb *mdb)
 {
 	struct ethsw_port_priv *port_priv = netdev_priv(netdev);
-	int err;
 
-	if (!dpaa2_switch_port_lookup_address(netdev, 0, mdb->addr))
-		return -ENOENT;
-
-	err = dpaa2_switch_port_fdb_del_mc(port_priv, mdb->addr);
-	if (err)
-		return err;
-
-	err = dev_mc_del(netdev, mdb->addr);
-	if (err) {
-		netdev_err(netdev, "dev_mc_del err %d\n", err);
-		return err;
-	}
-
-	return err;
+	return dpaa2_switch_port_fdb_del_mc(port_priv, mdb->addr);
 }
 
 static int dpaa2_switch_port_obj_del(struct net_device *netdev,
-- 
2.25.1


^ permalink raw reply related

* [PATCH net-next v4 03/13] dpaa2-switch: extend the FDB management to cover bond scenarios
From: Ioana Ciornei @ 2026-06-29 11:22 UTC (permalink / raw)
  To: andrew+netdev, davem, edumazet, kuba, pabeni, netdev; +Cc: linux-kernel
In-Reply-To: <20260629112309.154328-1-ioana.ciornei@nxp.com>

The dpaa2_switch_fdb_for_join() function is responsible for determining
which FDB should be used by a port as a consequence of it joining a
bridge. The rule is that all DPAA2 switch ports under the same bridge
will use the FDB of the first port which joined that bridge. Extend the
function so that it also covers the scenario in which there is a
bridged bond device.

For this to happen, in case a bond device is encountered through the
bridge ports the function needs to descend one level through its lowers
as well.

Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
---
Changes in v4:
- New patch. The same idea was also present in v3 but the implementation
changed quite a bit since there was some restructuring work done to the
main function in the meantime.
---
 .../ethernet/freescale/dpaa2/dpaa2-switch.c   | 35 +++++++++++++------
 1 file changed, 25 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
index 67c639fad0db..eacab00b586a 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
@@ -71,9 +71,9 @@ static struct dpaa2_switch_fdb *
 dpaa2_switch_fdb_for_join(struct ethsw_port_priv *port_priv,
 			  struct net_device *upper_dev)
 {
-	struct ethsw_port_priv *other_port_priv;
-	struct net_device *other_dev;
-	struct list_head *iter;
+	struct ethsw_port_priv *other_port_priv = NULL;
+	struct net_device *other_dev, *other_dev2;
+	struct list_head *iter, *iter2;
 
 	/* The below call to netdev_for_each_lower_dev() demands the RTNL lock
 	 * being held. Assert on it so that it's easier to catch new code
@@ -82,17 +82,32 @@ dpaa2_switch_fdb_for_join(struct ethsw_port_priv *port_priv,
 	ASSERT_RTNL();
 
 	/* If part of a bridge, use the FDB of the first dpaa2 switch interface
-	 * to be present in that bridge
+	 * to be present in that bridge. The search descends one level through
+	 * a bridged bond's lowers as well.
 	 */
 	netdev_for_each_lower_dev(upper_dev, other_dev, iter) {
-		if (!dpaa2_switch_port_dev_check(other_dev))
-			continue;
+		if (netif_is_lag_master(other_dev)) {
+			netdev_for_each_lower_dev(other_dev, other_dev2, iter2) {
+				if (!dpaa2_switch_port_dev_check(other_dev2))
+					continue;
 
-		if (other_dev == port_priv->netdev)
-			continue;
+				if (other_dev2 == port_priv->netdev)
+					continue;
 
-		other_port_priv = netdev_priv(other_dev);
-		return other_port_priv->fdb;
+				other_port_priv = netdev_priv(other_dev2);
+				break;
+			}
+		} else {
+			if (!dpaa2_switch_port_dev_check(other_dev))
+				continue;
+
+			if (other_dev == port_priv->netdev)
+				continue;
+
+			other_port_priv = netdev_priv(other_dev);
+		}
+		if (other_port_priv)
+			return other_port_priv->fdb;
 	}
 
 	return port_priv->fdb;
-- 
2.25.1


^ permalink raw reply related

* [PATCH net-next v4 04/13] dpaa2-switch: create a separate dpaa2_switch_port_fdb_event() function
From: Ioana Ciornei @ 2026-06-29 11:23 UTC (permalink / raw)
  To: andrew+netdev, davem, edumazet, kuba, pabeni, netdev; +Cc: linux-kernel
In-Reply-To: <20260629112309.154328-1-ioana.ciornei@nxp.com>

Create a separate dpaa2_switch_port_fdb_event() function that will only
handle the FDB related events. With this change, the
dpaa2_switch_port_event() notifier handler can be written in a way that
it's easier to follow.

Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
---
Changes in v4:
- none

Changes in v3:
- Get hold of port_priv->ethsw_data only after we know the device is a
dpaa2-switch one

Changes in v2:
- none
---
 .../ethernet/freescale/dpaa2/dpaa2-switch.c   | 28 ++++++++++++++-----
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
index eacab00b586a..c7c84bf2fde7 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
@@ -2337,21 +2337,18 @@ static void dpaa2_switch_event_work(struct work_struct *work)
 	dev_put(dev);
 }
 
-/* Called under rcu_read_lock() */
-static int dpaa2_switch_port_event(struct notifier_block *nb,
-				   unsigned long event, void *ptr)
+static int dpaa2_switch_port_fdb_event(struct notifier_block *nb,
+				       unsigned long event, void *ptr)
 {
 	struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
 	struct ethsw_port_priv *port_priv = netdev_priv(dev);
 	struct ethsw_switchdev_event_work *switchdev_work;
 	struct switchdev_notifier_fdb_info *fdb_info = ptr;
-	struct ethsw_core *ethsw = port_priv->ethsw_data;
-
-	if (event == SWITCHDEV_PORT_ATTR_SET)
-		return dpaa2_switch_port_attr_set_event(dev, ptr);
+	struct ethsw_core *ethsw;
 
 	if (!dpaa2_switch_port_dev_check(dev))
 		return NOTIFY_DONE;
+	ethsw = port_priv->ethsw_data;
 
 	switchdev_work = kzalloc_obj(*switchdev_work, GFP_ATOMIC);
 	if (!switchdev_work)
@@ -2390,6 +2387,23 @@ static int dpaa2_switch_port_event(struct notifier_block *nb,
 	return NOTIFY_BAD;
 }
 
+/* Called under rcu_read_lock() */
+static int dpaa2_switch_port_event(struct notifier_block *nb,
+				   unsigned long event, void *ptr)
+{
+	struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
+
+	switch (event) {
+	case SWITCHDEV_PORT_ATTR_SET:
+		return dpaa2_switch_port_attr_set_event(dev, ptr);
+	case SWITCHDEV_FDB_ADD_TO_DEVICE:
+	case SWITCHDEV_FDB_DEL_TO_DEVICE:
+		return dpaa2_switch_port_fdb_event(nb, event, ptr);
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
 static int dpaa2_switch_port_obj_event(unsigned long event,
 				       struct net_device *netdev,
 				       struct switchdev_notifier_port_obj_info *port_obj_info)
-- 
2.25.1


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox