* [PATCH v1 net] tcp: Fix dst leak in tcp_v6_connect().
From: Kuniyuki Iwashima @ 2026-05-06 7:04 UTC (permalink / raw)
To: Eric Dumazet, Neal Cardwell, David S. Miller, Jakub Kicinski,
Paolo Abeni
Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev,
Damiano Melotti
If a socket is bound to a wildcard address, tcp_v[46]_connect()
updates it with a non-wildcard address based on the route lookup.
After bhash2 was introduced in the cited commit, we must call
inet_bhash2_update_saddr() to update the bhash2 entry as well.
If inet_bhash2_update_saddr() fails, we must release the refcount
on dst taken by ip_route_connect() or ip6_dst_lookup_flow().
While tcp_v4_connect() calls ip_rt_put() in the error path,
tcp_v6_connect() does not call dst_release().
Let's call dst_release() when inet_bhash2_update_saddr() fails
in tcp_v6_connect().
Fixes: 28044fc1d495 ("net: Add a bhash2 table hashed by port and address")
Reported-by: Damiano Melotti <melotti@google.com>
Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
net/ipv6/tcp_ipv6.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 2c3f7a739709..3f7bab6faf9c 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -288,8 +288,10 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
saddr = &fl6->saddr;
err = inet_bhash2_update_saddr(sk, saddr, AF_INET6);
- if (err)
+ if (err) {
+ dst_release(dst);
goto failure;
+ }
}
/* set the source address */
--
2.54.0.545.g6539524ca2-goog
^ permalink raw reply related
* Re: [PATCH v2 2/2] RDMA/mlx5: get tph for p2p access when registering dma-buf mr
From: fengchengwen @ 2026-05-06 7:04 UTC (permalink / raw)
To: Zhiping Zhang, Alex Williamson, Jason Gunthorpe, Leon Romanovsky
Cc: Bjorn Helgaas, linux-rdma, linux-pci, netdev, dri-devel,
Keith Busch, Yochai Cohen, Yishai Hadas
In-Reply-To: <20260430200704.352228-3-zhipingz@meta.com>
On 5/1/2026 4:06 AM, Zhiping Zhang wrote:
> Query dma-buf TPH metadata when registering a dma-buf MR for peer to
> peer access and translate the raw steering tag into an mlx5 steering tag
> index. Factor mlx5_st_alloc_index() so callers that already have a raw
> steering tag can allocate the corresponding mlx5 index directly. Keep the
> DMAH path as the first priority and only fall back to dma-buf metadata when
> no DMAH is supplied.
>
> Pass the device's supported ST width (8 or 16 bit, derived from
> pdev->tph_req_type) to get_tph() so the exporter can reject tags that
> exceed the consumer's capability. Initialize ret in mlx5_st_create() so the
> cached steering-tag path returns success cleanly under clang builds.
>
> Signed-off-by: Zhiping Zhang <zhipingz@meta.com>
>
> diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
> --- a/drivers/infiniband/hw/mlx5/mr.c
> +++ b/drivers/infiniband/hw/mlx5/mr.c
> @@ -46,6 +46,8 @@
> #include "data_direct.h"
> #include "dmah.h"
>
> +MODULE_IMPORT_NS("DMA_BUF");
> +
> static int mkey_max_umr_order(struct mlx5_ib_dev *dev)
> {
> if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
> @@ -899,6 +901,40 @@ static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
> .invalidate_mappings = mlx5_ib_dmabuf_invalidate_cb,
> };
>
> +static void get_tph_mr_dmabuf(struct mlx5_ib_dev *dev, int fd, u16 *st_index,
> + u8 *ph)
> +{
> + struct pci_dev *pdev = dev->mdev->pdev;
> + struct dma_buf *dmabuf;
> + u16 steering_tag;
> + u8 st_width;
> + int ret;
> +
> + st_width = (pdev->tph_req_type == PCI_TPH_REQ_EXT_TPH) ? 16 : 8;
tph_req_type is defined under CONFIG_PCIE_TPH; how about adding a wrapper
function to query it?
> +
> + dmabuf = dma_buf_get(fd);
> + if (IS_ERR(dmabuf))
> + return;
> +
> + if (!dmabuf->ops->get_tph)
> + goto end_dbuf_put;
> +
> + ret = dmabuf->ops->get_tph(dmabuf, &steering_tag, ph, st_width);
> + if (ret) {
> + mlx5_ib_dbg(dev, "get_tph failed (%d)\n", ret);
> + goto end_dbuf_put;
> + }
> +
> + ret = mlx5_st_alloc_index_by_tag(dev->mdev, steering_tag, st_index);
> + if (ret) {
> + *ph = MLX5_IB_NO_PH;
> + mlx5_ib_dbg(dev, "st_alloc_index_by_tag failed (%d)\n", ret);
> + }
> +
> +end_dbuf_put:
> + dma_buf_put(dmabuf);
> +}
> +
> static struct ib_mr *
> reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device,
> u64 offset, u64 length, u64 virt_addr,
> @@ -941,6 +977,8 @@ reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device,
> ph = dmah->ph;
> if (dmah->valid_fields & BIT(IB_DMAH_CPU_ID_EXISTS))
> st_index = mdmah->st_index;
> + } else {
> + get_tph_mr_dmabuf(dev, fd, &st_index, &ph);
> }
>
> mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr,
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c
> --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c
> @@ -29,7 +29,7 @@ struct mlx5_st *mlx5_st_create(struct mlx5_core_dev *dev)
> u8 direct_mode = 0;
> u16 num_entries;
> u32 tbl_loc;
> - int ret;
> + int ret = 0;
>
> if (!MLX5_CAP_GEN(dev, mkey_pcie_tph))
> return NULL;
> @@ -92,23 +92,18 @@ void mlx5_st_destroy(struct mlx5_core_dev *dev)
> kfree(st);
> }
>
> -int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type,
> - unsigned int cpu_uid, u16 *st_index)
> +int mlx5_st_alloc_index_by_tag(struct mlx5_core_dev *dev, u16 tag,
> + u16 *st_index)
> {
> struct mlx5_st_idx_data *idx_data;
> struct mlx5_st *st = dev->st;
> unsigned long index;
> u32 xa_id;
> - u16 tag;
> - int ret;
> + int ret = 0;
>
> if (!st)
> return -EOPNOTSUPP;
>
> - ret = pcie_tph_get_cpu_st(dev->pdev, mem_type, cpu_uid, &tag);
> - if (ret)
> - return ret;
> -
> if (st->direct_mode) {
> *st_index = tag;
> return 0;
> @@ -152,6 +147,20 @@ int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type,
> mutex_unlock(&st->lock);
> return ret;
> }
> +EXPORT_SYMBOL_GPL(mlx5_st_alloc_index_by_tag);
> +
> +int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type,
> + unsigned int cpu_uid, u16 *st_index)
> +{
> + u16 tag;
> + int ret;
> +
> + ret = pcie_tph_get_cpu_st(dev->pdev, mem_type, cpu_uid, &tag);
> + if (ret)
> + return ret;
> +
> + return mlx5_st_alloc_index_by_tag(dev, tag, st_index);
> +}
> EXPORT_SYMBOL_GPL(mlx5_st_alloc_index);
>
> int mlx5_st_dealloc_index(struct mlx5_core_dev *dev, u16 st_index)
> diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
> --- a/include/linux/mlx5/driver.h
> +++ b/include/linux/mlx5/driver.h
> @@ -1166,10 +1166,17 @@ int mlx5_dm_sw_icm_dealloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type
> u64 length, u16 uid, phys_addr_t addr, u32 obj_id);
>
> #ifdef CONFIG_PCIE_TPH
> +int mlx5_st_alloc_index_by_tag(struct mlx5_core_dev *dev, u16 tag,
> + u16 *st_index);
> int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type,
> unsigned int cpu_uid, u16 *st_index);
> int mlx5_st_dealloc_index(struct mlx5_core_dev *dev, u16 st_index);
> #else
> +static inline int mlx5_st_alloc_index_by_tag(struct mlx5_core_dev *dev,
> + u16 tag, u16 *st_index)
> +{
> + return -EOPNOTSUPP;
> +}
> static inline int mlx5_st_alloc_index(struct mlx5_core_dev *dev,
> enum tph_mem_type mem_type,
> unsigned int cpu_uid, u16 *st_index)
>
>
^ permalink raw reply
* Re: [PATCH net-next 1/6] bridge: uapi: Add neigh_forward_grat netlink attributes
From: Ido Schimmel @ 2026-05-06 7:03 UTC (permalink / raw)
To: Jakub Kicinski
Cc: Danielle Ratson, netdev, donald.hunter, davem, edumazet, pabeni,
horms, razor, andrew+netdev, shuah, ast, liuhangbin, daniel,
aroulin, fmaurer, sdf.kernel, sd, kees, nickgarlis, amorenoz,
alasdair, johannes.wiesboeck, petrm, linux-kernel, bridge,
linux-kselftest
In-Reply-To: <20260505190044.0608dfb1@kernel.org>
On Tue, May 05, 2026 at 07:00:44PM -0700, Jakub Kicinski wrote:
> On Sun, 3 May 2026 10:35:27 +0300 Danielle Ratson wrote:
> > --- a/Documentation/netlink/specs/rt-link.yaml
> > +++ b/Documentation/netlink/specs/rt-link.yaml
> > @@ -1700,6 +1700,9 @@ attribute-sets:
> > -
> > name: backup-nhid
> > type: u32
> > + -
> > + name: neigh-forward-grat
> > + type: flag
>
> I think this should be u8 ? neigh-vlan-suppress looks buggy too
I pointed this out during internal review, but assumed I was missing
something since almost all the attributes use flag when they are in fact
u8. We can fix neigh-forward-grat to use u8 in v2 and change the rest in
net. To be clear, I believe the following should be converted from flag
to u8:
mode, guard, protect, fast-leave, learning, unicast-flood, proxyarp,
learning-sync, proxyarp-wifi, mcast-flood, mcast-to-ucast, vlan-tunnel,
bcast-flood, neigh-suppress, isolated, mrp-ring-open, mrp-in-open,
locked, mab, neigh-vlan-suppress
> flag is a type without a payload, the presence of the attr is
> the entire information
>
> None of the AIs seem to catch this, I think you may have over-split
> this submission a little bit. This patch may have been better off
> squashed into patch 4 ?
Related: The AI also did not catch that the spec was missing (easy to
forget for rtnetlink). Do you think it's worth adding to review-prompts?
^ permalink raw reply
* Re: [PATCH net v7 1/2] ipv6: flowlabel: take ip6_fl_lock across mem_check and fl_intern
From: Maoyi Xie @ 2026-05-06 7:03 UTC (permalink / raw)
To: Willem de Bruijn
Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Eric Dumazet,
David Ahern, Alexey Kuznetsov, Willem de Bruijn, netdev,
linux-kernel, stable
In-Reply-To: <willemdebruijn.kernel.2b6abb6aae1d8@gmail.com>
Hi Willem,
Thanks for the review on both, and for the Reviewed-by tags.
>> Signed-off-by: Maoyi Xie <maoyi.xie@ntu.edu.sg>
> Please update either your git config name or Signed-off-by to
> make sure that the two are the same.
Sorry for the noise. We sent the series through Gmail because
our NTU SMTP does not accept git send-email, and Gmail rewrote
the From header of the resulting mail. v8 will set the email
header From to the Gmail address and inject an in-body From: line
with the NTU address, so the trailer matches the author.
>> +/* Caller must hold ip6_fl_lock. */
> nit: lockdep_assert_held as used below is preferable over
> comments
Will replace the comment with lockdep_assert_held(&ip6_fl_lock)
at the top of fl_intern() in v8.
> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Will add the same Fixes trailer to 1/2 in v8.
We will respin as v8 after the 24h netdev window.
Maoyi
Nanyang Technological University
https://maoyixie.com/
^ permalink raw reply
* [PATCH v1 net] ipmr: Call ipmr_fib_lookup() under RCU.
From: Kuniyuki Iwashima @ 2026-05-06 6:59 UTC (permalink / raw)
To: David Ahern, Ido Schimmel, David S. Miller, Eric Dumazet,
Jakub Kicinski, Paolo Abeni
Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev,
syzkaller, Yi Lai
Yi Lai reported RCU splat in reg_vif_xmit() below. [0]
When CONFIG_IP_MROUTE_MULTIPLE_TABLES=n, ipmr_fib_lookup()
uses rcu_dereference() without explicit rcu_read_lock().
Although rcu_read_lock_bh() is already held by the caller
__dev_queue_xmit(), lockdep requires explicit rcu_read_lock()
for rcu_dereference().
Let's move up rcu_read_lock() in reg_vif_xmit() to
cover ipmr_fib_lookup().
[0]:
WARNING: suspicious RCU usage
7.1.0-rc2-next-20260504-9d0d467c3572 #1 Not tainted
-----------------------------
net/ipv4/ipmr.c:329 suspicious rcu_dereference_check() usage!
other info that might help us debug this:
rcu_scheduler_active = 2, debug_locks = 1
2 locks held by syz.2.17/1779:
#0: ffffffff87896440 (rcu_read_lock_bh){....}-{1:3}, at: local_bh_disable include/linux/bottom_half.h:20 [inline]
#0: ffffffff87896440 (rcu_read_lock_bh){....}-{1:3}, at: rcu_read_lock_bh include/linux/rcupdate.h:891 [inline]
#0: ffffffff87896440 (rcu_read_lock_bh){....}-{1:3}, at: __dev_queue_xmit+0x239/0x4140 net/core/dev.c:4792
#1: ffff88801a199d18 (_xmit_PIMREG#2){+...}-{3:3}, at: spin_lock include/linux/spinlock.h:342 [inline]
#1: ffff88801a199d18 (_xmit_PIMREG#2){+...}-{3:3}, at: __netif_tx_lock include/linux/netdevice.h:4795 [inline]
#1: ffff88801a199d18 (_xmit_PIMREG#2){+...}-{3:3}, at: __dev_queue_xmit+0x1d5d/0x4140 net/core/dev.c:4865
stack backtrace:
CPU: 1 UID: 0 PID: 1779 Comm: syz.2.17 Not tainted 7.1.0-rc2-next-20260504-9d0d467c3572 #1 PREEMPT(lazy)
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
Call Trace:
<TASK>
__dump_stack lib/dump_stack.c:94 [inline]
dump_stack_lvl+0x121/0x150 lib/dump_stack.c:120
dump_stack+0x19/0x20 lib/dump_stack.c:129
lockdep_rcu_suspicious+0x15b/0x1f0 kernel/locking/lockdep.c:6878
ipmr_fib_lookup net/ipv4/ipmr.c:329 [inline]
reg_vif_xmit+0x2ee/0x3c0 net/ipv4/ipmr.c:540
__netdev_start_xmit include/linux/netdevice.h:5382 [inline]
netdev_start_xmit include/linux/netdevice.h:5391 [inline]
xmit_one net/core/dev.c:3889 [inline]
dev_hard_start_xmit+0x170/0x700 net/core/dev.c:3905
__dev_queue_xmit+0x1df1/0x4140 net/core/dev.c:4871
dev_queue_xmit include/linux/netdevice.h:3423 [inline]
packet_xmit+0x252/0x370 net/packet/af_packet.c:276
packet_snd net/packet/af_packet.c:3082 [inline]
packet_sendmsg+0x39ad/0x5650 net/packet/af_packet.c:3114
sock_sendmsg_nosec net/socket.c:797 [inline]
__sock_sendmsg net/socket.c:812 [inline]
____sys_sendmsg+0xa21/0xba0 net/socket.c:2716
___sys_sendmsg+0x121/0x1c0 net/socket.c:2770
__sys_sendmsg+0x177/0x220 net/socket.c:2802
__do_sys_sendmsg net/socket.c:2807 [inline]
__se_sys_sendmsg net/socket.c:2805 [inline]
__x64_sys_sendmsg+0x80/0xc0 net/socket.c:2805
x64_sys_call+0x1d9c/0x21c0 arch/x86/include/generated/asm/syscalls_64.h:47
do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
do_syscall_64+0xc1/0x1020 arch/x86/entry/syscall_64.c:94
entry_SYSCALL_64_after_hwframe+0x76/0x7e
RIP: 0033:0x7f37e563ee5d
Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 93 af 1b 00 f7 d8 64 89 01 48
RSP: 002b:00007ffe5caa7fa8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
RAX: ffffffffffffffda RBX: 00000000005c5fa0 RCX: 00007f37e563ee5d
RDX: 0000000000000000 RSI: 00002000000012c0 RDI: 0000000000000004
RBP: 00000000005c5fa0 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 0000000000000000 R14: 00000000005c5fac R15: 00000000005c5fa0
</TASK>
Fixes: b3b6babf4751 ("ipmr: Free mr_table after RCU grace period.")
Reported-by: syzkaller <syzkaller@googlegroups.com>
Reported-by: Yi Lai <yi1.lai@intel.com>
Closes: https://lore.kernel.org/netdev/afrY34dLXNUboevf@ly-workstation/
Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
net/ipv4/ipmr.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 05fb6eefe0be..2628cd3a93a6 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -537,15 +537,16 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
};
int err;
+ rcu_read_lock();
err = ipmr_fib_lookup(net, &fl4, &mrt);
if (err < 0) {
+ rcu_read_unlock();
kfree_skb(skb);
return err;
}
DEV_STATS_ADD(dev, tx_bytes, skb->len);
DEV_STATS_INC(dev, tx_packets);
- rcu_read_lock();
/* Pairs with WRITE_ONCE() in vif_add() and vif_delete() */
ipmr_cache_report(mrt, skb, READ_ONCE(mrt->mroute_reg_vif_num),
--
2.54.0.545.g6539524ca2-goog
^ permalink raw reply related
* Re: [PATCH batadv 0/8] batman-adv: follow up fixes
From: Sven Eckelmann @ 2026-05-06 6:59 UTC (permalink / raw)
To: Jakub Kicinski
Cc: Marek Lindner, Simon Wunderlich, Antonio Quartulli,
David S. Miller, Eric Dumazet, Paolo Abeni, Simon Horman,
b.a.t.m.a.n, netdev, linux-kernel, Ao Zhou, Haoze Xie,
Jiexun Wang, Juefei Pu, Luxing Yin, Ruide Cao, Xin Liu, Yifan Wu,
Yuan Tan
In-Reply-To: <20260505172017.6caf7347@kernel.org>
[-- Attachment #1: Type: text/plain, Size: 2324 bytes --]
On Wednesday, 6 May 2026 02:20:17 CEST Jakub Kicinski wrote:
> Ah, I see. I was asking because I don't recall us getting much batadv
> patches CCed to netdev. Maybe it's simply because there wasn't that
> many of them to begin with.
>
> If the rate keeps up let's add an X: to MAINTAINERS to avoid
> netdev@ being CCed. IDK if that's what X is supposed to be used for
> but we use it for wireless and bluetooth already. The PRs still
> flow thru networking tree, but there's no need for netdev to be CCed
> on 99% of the patch submissions.
>
Thanks for the hint, queued up a patch:
$ ./scripts/get_maintainer.pl -f net/batman-adv/*
Marek Lindner <marek.lindner@mailbox.org> (maintainer:BATMAN ADVANCED)
Simon Wunderlich <sw@simonwunderlich.de> (maintainer:BATMAN ADVANCED)
Antonio Quartulli <antonio@mandelbit.com> (maintainer:BATMAN ADVANCED)
Sven Eckelmann <sven@narfation.org> (maintainer:BATMAN ADVANCED)
b.a.t.m.a.n@lists.open-mesh.org (moderated list:BATMAN ADVANCED)
linux-kernel@vger.kernel.org (open list)
$ ./scripts/get_maintainer.pl -f Documentation/networking/batman-adv.rst
Marek Lindner <marek.lindner@mailbox.org> (maintainer:BATMAN ADVANCED)
Simon Wunderlich <sw@simonwunderlich.de> (maintainer:BATMAN ADVANCED)
Antonio Quartulli <antonio@mandelbit.com> (maintainer:BATMAN ADVANCED)
Sven Eckelmann <sven@narfation.org> (maintainer:BATMAN ADVANCED)
Jonathan Corbet <corbet@lwn.net> (maintainer:DOCUMENTATION)
Shuah Khan <skhan@linuxfoundation.org> (reviewer:DOCUMENTATION)
b.a.t.m.a.n@lists.open-mesh.org (moderated list:BATMAN ADVANCED)
linux-doc@vger.kernel.org (open list:DOCUMENTATION)
linux-kernel@vger.kernel.org (open list)
$ ./scripts/get_maintainer.pl -f include/uapi/linux/batadv_packet.h include/uapi/linux/batman_adv.h
Marek Lindner <marek.lindner@mailbox.org> (maintainer:BATMAN ADVANCED)
Simon Wunderlich <sw@simonwunderlich.de> (maintainer:BATMAN ADVANCED)
Antonio Quartulli <antonio@mandelbit.com> (maintainer:BATMAN ADVANCED)
Sven Eckelmann <sven@narfation.org> (maintainer:BATMAN ADVANCED)
b.a.t.m.a.n@lists.open-mesh.org (moderated list:BATMAN ADVANCED)
linux-kernel@vger.kernel.org (open list)
@Yuan: Thanks for updating your internal rules. But the main problem (more
patches sent to netdev) was caused by my oversight. Btw, thanks for all your
contributions.
Regards,
Sven
[-- Attachment #2: This is a digitally signed message part. --]
[-- Type: application/pgp-signature, Size: 228 bytes --]
^ permalink raw reply
* Re: [PATCH v2 1/2] vfio: add dma-buf get_tph callback and DMA_BUF_TPH feature
From: fengchengwen @ 2026-05-06 6:58 UTC (permalink / raw)
To: Zhiping Zhang, Alex Williamson, Jason Gunthorpe, Leon Romanovsky
Cc: Bjorn Helgaas, linux-rdma, linux-pci, netdev, dri-devel,
Keith Busch, Yochai Cohen, Yishai Hadas
In-Reply-To: <20260430200704.352228-2-zhipingz@meta.com>
On 5/1/2026 4:06 AM, Zhiping Zhang wrote:
> Add a dma-buf callback that returns raw TPH metadata from the exporter
> so peer devices can reuse the steering tag and processing hint
> associated with a VFIO-exported buffer.
>
> Add a new VFIO_DEVICE_FEATURE_DMA_BUF_TPH ioctl that takes the fd from
> VFIO_DEVICE_FEATURE_DMA_BUF along with a steering tag and processing
> hint, validates the fd is a vfio-exported dma-buf belonging to this
> device, and stores the TPH values under memory_lock. This keeps the
> existing VFIO_DEVICE_FEATURE_DMA_BUF uAPI completely unchanged.
>
> The user sequences setting TPH on the dma-buf before the importer
> consumes it.
>
> Add an st_width parameter to get_tph() so the exporter can reject
> steering tags that exceed the consumer's supported width (8 vs 16 bit).
> When no TPH metadata was supplied, get_tph() returns -EOPNOTSUPP.
>
> Signed-off-by: Zhiping Zhang <zhipingz@meta.com>
>
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -1534,6 +1534,9 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
> return vfio_pci_core_feature_token(vdev, flags, arg, argsz);
> case VFIO_DEVICE_FEATURE_DMA_BUF:
> return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz);
> + case VFIO_DEVICE_FEATURE_DMA_BUF_TPH:
> + return vfio_pci_core_feature_dma_buf_tph(vdev, flags, arg,
> + argsz);
> default:
> return -ENOTTY;
> }
> diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
> --- a/drivers/vfio/pci/vfio_pci_dmabuf.c
> +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
> @@ -19,6 +19,9 @@ struct vfio_pci_dma_buf {
> u32 nr_ranges;
> struct kref kref;
> struct completion comp;
> + u16 steering_tag;
> + u8 ph;
> + u8 tph_present : 1;
> u8 revoked : 1;
> };
>
> @@ -69,6 +72,22 @@ vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
> return ret;
> }
>
> +static int vfio_pci_dma_buf_get_tph(struct dma_buf *dmabuf, u16 *steering_tag,
> + u8 *ph, u8 st_width)
> +{
> + struct vfio_pci_dma_buf *priv = dmabuf->priv;
> +
> + if (!priv->tph_present)
> + return -EOPNOTSUPP;
> +
> + if (st_width < 16 && priv->steering_tag > ((1U << st_width) - 1))
> + return -EINVAL;
The check does not work correctly in the following cases:
1. If the exporter passed an 8-bit ST and the importer supports 16-bit ST, it will
pass the check.
2. If the exporter enabled 16-bit ST and its ST is < 256 (note: the PCIe protocol doesn't
require a 16-bit ST to be >= 256), and the importer only supports 8-bit ST, it will also
pass the check.
Suggest that userspace pass both the ST (8-bit) and the extended ST (16-bit), and let the
importer choose the right one.
> +
> + *steering_tag = priv->steering_tag;
> + *ph = priv->ph;
> + return 0;
> +}
> +
> static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
> struct sg_table *sgt,
> enum dma_data_direction dir)
> @@ -101,6 +120,7 @@ static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
>
> static const struct dma_buf_ops vfio_pci_dmabuf_ops = {
> .attach = vfio_pci_dma_buf_attach,
> + .get_tph = vfio_pci_dma_buf_get_tph,
> .map_dma_buf = vfio_pci_dma_buf_map,
> .unmap_dma_buf = vfio_pci_dma_buf_unmap,
> .release = vfio_pci_dma_buf_release,
> @@ -331,6 +351,55 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
> return ret;
> }
>
> +int vfio_pci_core_feature_dma_buf_tph(struct vfio_pci_core_device *vdev,
> + u32 flags,
> + struct vfio_device_feature_dma_buf_tph __user *arg,
> + size_t argsz)
> +{
> + struct vfio_device_feature_dma_buf_tph set_tph;
> + struct vfio_pci_dma_buf *priv;
> + struct dma_buf *dmabuf;
> + int ret;
> +
> + ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET,
> + sizeof(set_tph));
> + if (ret != 1)
> + return ret;
> +
> + if (copy_from_user(&set_tph, arg, sizeof(set_tph)))
> + return -EFAULT;
> +
> + if (set_tph.reserved)
> + return -EINVAL;
> +
> + dmabuf = dma_buf_get(set_tph.dmabuf_fd);
> + if (IS_ERR(dmabuf))
> + return PTR_ERR(dmabuf);
> +
> + if (dmabuf->ops != &vfio_pci_dmabuf_ops) {
> + ret = -EINVAL;
> + goto out_put;
> + }
> +
> + priv = dmabuf->priv;
> + down_write(&vdev->memory_lock);
> + if (priv->vdev != vdev) {
> + ret = -EINVAL;
> + goto out_unlock;
> + }
> +
> + priv->steering_tag = set_tph.steering_tag;
> + priv->ph = set_tph.ph;
> + priv->tph_present = 1;
> + ret = 0;
> +
> +out_unlock:
> + up_write(&vdev->memory_lock);
> +out_put:
> + dma_buf_put(dmabuf);
> + return ret;
> +}
> +
> void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
> {
> struct vfio_pci_dma_buf *priv;
> diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
> --- a/drivers/vfio/pci/vfio_pci_priv.h
> +++ b/drivers/vfio/pci/vfio_pci_priv.h
> @@ -118,6 +118,10 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
> int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
> struct vfio_device_feature_dma_buf __user *arg,
> size_t argsz);
> +int vfio_pci_core_feature_dma_buf_tph(struct vfio_pci_core_device *vdev,
> + u32 flags,
> + struct vfio_device_feature_dma_buf_tph __user *arg,
> + size_t argsz);
> void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev);
> void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked);
> #else
> @@ -128,6 +132,13 @@ vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
> {
> return -ENOTTY;
> }
> +static inline int
> +vfio_pci_core_feature_dma_buf_tph(struct vfio_pci_core_device *vdev, u32 flags,
> + struct vfio_device_feature_dma_buf_tph __user *arg,
> + size_t argsz)
> +{
> + return -ENOTTY;
> +}
> static inline void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
> {
> }
> diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
> --- a/include/linux/dma-buf.h
> +++ b/include/linux/dma-buf.h
> @@ -113,6 +113,23 @@ struct dma_buf_ops {
> */
> void (*unpin)(struct dma_buf_attachment *attach);
>
> + /**
> + * @get_tph:
> + * @dmabuf: DMA buffer for which to retrieve TPH metadata
> + * @steering_tag: Returns the raw TPH steering tag
> + * @ph: Returns the TPH processing hint
> + * @st_width: Consumer's supported steering tag width in bits (8 or 16)
> + *
> + * Return the TPH (TLP Processing Hints) metadata associated with this
> + * DMA buffer. Exporters that do not provide TPH metadata should return
> + * -EOPNOTSUPP. If the steering tag exceeds @st_width bits, return
> + * -EINVAL.
> + *
> + * This callback is optional.
> + */
> + int (*get_tph)(struct dma_buf *dmabuf, u16 *steering_tag, u8 *ph,
> + u8 st_width);
> +
> /**
> * @map_dma_buf:
> *
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -1534,6 +1534,28 @@ struct vfio_device_feature_dma_buf {
> */
> #define VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2 12
>
> +/**
> + * Upon VFIO_DEVICE_FEATURE_SET associate TPH (TLP Processing Hints) metadata
> + * with a vfio-exported dma-buf. The dma-buf must have been created by
> + * VFIO_DEVICE_FEATURE_DMA_BUF on this device.
> + *
> + * dmabuf_fd is the file descriptor returned by VFIO_DEVICE_FEATURE_DMA_BUF.
> + * steering_tag and ph are the raw TPH values that importing drivers should use
> + * when accessing the buffer.
> + *
> + * The user must set TPH on the dma-buf before the importer consumes it.
> + *
> + * Return: 0 on success, -errno on failure.
> + */
> +#define VFIO_DEVICE_FEATURE_DMA_BUF_TPH 13
> +
> +struct vfio_device_feature_dma_buf_tph {
> + __s32 dmabuf_fd;
> + __u16 steering_tag;
> + __u8 ph;
> + __u8 reserved;
> +};
> +
> /* -------- API for Type1 VFIO IOMMU -------- */
>
> /**
>
>
^ permalink raw reply
* Re: [PATCH] dept: update documentation function names to match implementation
From: Byungchul Park @ 2026-05-06 6:27 UTC (permalink / raw)
To: Yunseong Kim
Cc: bagasdotme, 2407018371, Dai.Ngo, Liam.Howlett, a.hindborg,
ada.coupriediaz, adilger.kernel, akpm, alex.gaynor,
alexander.shishkin, aliceryhl, amir73il, andi.shyti, andrii, anna,
arnd, ast, baolin.wang, bigeasy, bjorn3_gh, boqun.feng, bp,
brauner, broonie, bsegall, catalin.marinas, chenhuacai,
chris.p.wilson, christian.koenig, chuck.lever, cl, clrkwllms,
corbet, da.gomez, dakr, damien.lemoal, dan.j.williams,
daniel.vetter, dave.hansen, david, dennis, dietmar.eggemann,
djwong, dri-devel, duyuyang, dwmw, francesco, frederic, gary,
geert+renesas, geert, gregkh, guoweikang.kernel, gustavo,
gwan-gyeong.mun, hamohammed.sa, hannes, harry.yoo, hch,
her0gyugyu, hpa, jack, jglisse, jiangshanlai, jlayton,
joel.granados, joel, joelagnelf, johannes.berg, josef, josh,
jpoimboe, juri.lelli, kees, kernel-team, kernel_team,
kevin.brodsky, kristina.martsenko, lillian, linaro-mm-sig, link,
linux-arch, linux-arm-kernel, linux-block, linux-doc, linux-ext4,
linux-fsdevel, linux-i2c, linux-ide, linux-kernel, linux-media,
linux-mm, linux-modules, linux-nfs, linux-rt-devel, linux,
longman, lorenzo.stoakes, lossin, luto, mark.rutland, masahiroy,
mathieu.desnoyers, matthew.brost, max.byungchul.park, mcgrof,
melissa.srw, mgorman, mhocko, miguel.ojeda.sandonis, minchan,
mingo, mjguzik, neeraj.upadhyay, neil, neilb, netdev, ngupta,
ojeda, okorniev, oleg, paulmck, penberg, peterz, petr.pavlu,
qiang.zhang, rcu, richard.weiyang, rientjes, rodrigosiqueiramelo,
rostedt, rppt, rust-for-linux, samitolvanen, sashal, shakeel.butt,
sj, sumit.semwal, surenb, tglx, thomas.weissschuh, tim.c.chen, tj,
tmgross, tom, torvalds, trondmy, tytso, urezki, usamaarif642,
vbabka, vdavydov.dev, vincent.guittot, vschneid, wangfushuai,
wangkefeng.wang, will, willy, wsa+renesas, x86, yeoreum.yun, ysk,
yunseong.kim, yuzhao, ziy
In-Reply-To: <20260428162614.786365-2-yunseong.kim@est.tech>
On Tue, Apr 28, 2026 at 06:26:15PM +0200, Yunseong Kim wrote:
> Synchronize function names in the documentation with the actual
> implementation to fix naming inconsistencies.
Good catch! Thanks Yunseong. I will apply it on the top.
Byungchul
> Signed-off-by: Yunseong Kim <yunseong.kim@est.tech>
> ---
> Documentation/dev-tools/dept.rst | 2 +-
> Documentation/dev-tools/dept_api.rst | 2 +-
> 2 files changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/Documentation/dev-tools/dept.rst b/Documentation/dev-tools/dept.rst
> index 333166464543..31b2fe629fab 100644
> --- a/Documentation/dev-tools/dept.rst
> +++ b/Documentation/dev-tools/dept.rst
> @@ -97,7 +97,7 @@ No. What about the following?
>
> mutex_lock A
> mutex_lock A <- DEADLOCK
> - wait_for_complete B <- DEADLOCK
> + wait_for_completion B <- DEADLOCK
> complete B
> mutex_unlock A
> mutex_unlock A
> diff --git a/Documentation/dev-tools/dept_api.rst b/Documentation/dev-tools/dept_api.rst
> index 409116a62849..74e7b1424ad5 100644
> --- a/Documentation/dev-tools/dept_api.rst
> +++ b/Documentation/dev-tools/dept_api.rst
> @@ -113,7 +113,7 @@ Do not use these APIs directly. The raw APIs of dept are:
> dept_stage_wait(map, key, ip, wait_func, time);
> dept_request_event_wait_commit();
> dept_clean_stage();
> - dept_stage_event(task, ip);
> + dept_ttwu_stage_wait(task, ip);
> dept_ecxt_enter(map, evt_flags, ip, ecxt_func, evt_func, sub_local);
> dept_ecxt_holding(map, evt_flags);
> dept_request_event(map, ext_wgen);
> --
> 2.53.0
^ permalink raw reply
* RE: [PATCH net] tipc: avoid sending zero-length stream messages
From: Tung Quang Nguyen @ 2026-05-06 6:41 UTC (permalink / raw)
To: Cássio Gabriel
Cc: netdev@vger.kernel.org, tipc-discussion@lists.sourceforge.net,
linux-kernel@vger.kernel.org, stable@vger.kernel.org,
syzbot+aa7d098bd6fa788fae8e@syzkaller.appspotmail.com, Jon Maloy,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Simon Horman
In-Reply-To: <20260506-tipc-zero-length-stream-stall-v1-1-5d75f202227b@gmail.com>
>Subject: [PATCH net] tipc: avoid sending zero-length stream messages
>
>TIPC stream send currently enters the transmit loop even when the user
>payload length is zero. This can build and transmit a header-only connection
>message.
>
>For local TIPC sockets, such messages are delivered synchronously through the
>loopback receive path. When this happens while socket backlog processing is
>being flushed, reply transmission can re-enter TIPC receive processing
>repeatedly and trigger an RCU stall.
>
Can you demonstrate this scenario using code? It would be better to point out which part of the current code is faulty.
>Make zero-length sends on connected SOCK_STREAM TIPC sockets a no-op
>after the existing connection/congestion wait has succeeded. Leave implicit
>connection setup and SOCK_SEQPACKET behavior unchanged.
>
>Fixes: 365ad353c256 ("tipc: reduce risk of user starvation during link
>congestion")
>Cc: stable@vger.kernel.org
>Reported-by: syzbot+aa7d098bd6fa788fae8e@syzkaller.appspotmail.com
>Closes:
>https://lore.kernel.org/all/000000000000cedbc405ae81531f@google.com/
>Closes: https://syzkaller.appspot.com/bug?extid=aa7d098bd6fa788fae8e
>Signed-off-by: Cássio Gabriel <cassiogabrielcontato@gmail.com>
>---
> net/tipc/socket.c | 2 ++
> 1 file changed, 2 insertions(+)
>
>diff --git a/net/tipc/socket.c b/net/tipc/socket.c index
>9329919fb07f..3c7838713d74 100644
>--- a/net/tipc/socket.c
>+++ b/net/tipc/socket.c
>@@ -1585,6 +1585,8 @@ static int __tipc_sendstream(struct socket *sock,
>struct msghdr *m, size_t dlen)
> tipc_sk_connected(sk)));
> if (unlikely(rc))
> break;
>+ if (unlikely(!dlen && sk->sk_type == SOCK_STREAM))
>+ break;
This change is wrong. It immediately breaks normal connection setup because the zero-length ACK has no chance to be sent back from the server to the client.
Please try to test your patch before submission.
> send = min_t(size_t, dlen - sent, TIPC_MAX_USER_MSG_SIZE);
> blocks = tsk->snd_backlog;
> if (tsk->oneway++ >= tsk->nagle_start && maxnagle &&
>
>---
>base-commit: 95084f1883a760e0d4290698346759d58e2b944a
>change-id: 20260505-tipc-zero-length-stream-stall-2c3741de2c93
>
>Best regards,
>--
>Cássio Gabriel <cassiogabrielcontato@gmail.com>
>
^ permalink raw reply
* RE: [PATCH v5 net-next 04/15] net: enetc: add basic operations to the FDB table
From: Wei Fang @ 2026-05-06 6:37 UTC (permalink / raw)
To: Paolo Abeni
Cc: netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
devicetree@vger.kernel.org, linuxppc-dev@lists.ozlabs.org,
linux-arm-kernel@lists.infradead.org, imx@lists.linux.dev,
Claudiu Manoil, Vladimir Oltean, Clark Wang,
andrew+netdev@lunn.ch, davem@davemloft.net, edumazet@google.com,
kuba@kernel.org, robh@kernel.org, krzk+dt@kernel.org,
conor+dt@kernel.org, f.fainelli@gmail.com, Frank Li,
chleroy@kernel.org, horms@kernel.org, linux@armlinux.org.uk
In-Reply-To: <6324783e-b5c8-462d-8cf7-f6cc9c01ea89@redhat.com>
> On 4/30/26 4:49 AM, Wei Fang wrote:
> > The FDB table is used for MAC learning lookups and MAC forwarding lookups.
> > Each table entry includes information such as a FID and MAC address that
> > may be unicast or multicast and a forwarding destination field containing
> > a port bitmap identifying the associated port(s) with the MAC address.
> > FDB table entries can be static or dynamic. Static entries are added from
> > software whereby dynamic entries are added either by software or by the
> > hardware as MAC addresses are learned in the datapath.
> >
> > The FDB table can only be managed by the command BD ring using table
> > management protocol version 2.0. Table management command operations
> Add,
> > Delete, Update and Query are supported. And the FDB table supports three
> > access methods: Entry ID, Exact Match Key Element and Search. This patch
> > adds the following basic supports to the FDB table.
> >
> > ntmp_fdbt_update_entry() - update the configuration element data of a
> > specified FDB entry
> >
> > ntmp_fdbt_delete_entry() - delete a specified FDB entry
> >
> > ntmp_fdbt_add_entry() - add an entry into the FDB table
> >
> > ntmp_fdbt_search_port_entry() - Search the FDB entry on the specified
> > port based on RESUME_ENTRY_ID.
> >
> > Signed-off-by: Wei Fang <wei.fang@nxp.com>
> > ---
> > drivers/net/ethernet/freescale/enetc/ntmp.c | 203
> +++++++++++++++++-
> > .../ethernet/freescale/enetc/ntmp_private.h | 61 +++++-
> > include/linux/fsl/ntmp.h | 44 +++-
> > 3 files changed, 305 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/freescale/enetc/ntmp.c
> b/drivers/net/ethernet/freescale/enetc/ntmp.c
> > index c94a928622fd..4ed8d783a9a2 100644
> > --- a/drivers/net/ethernet/freescale/enetc/ntmp.c
> > +++ b/drivers/net/ethernet/freescale/enetc/ntmp.c
> > @@ -1,7 +1,7 @@
> > // SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause)
> > /*
> > * NETC NTMP (NETC Table Management Protocol) 2.0 Library
> > - * Copyright 2025 NXP
> > + * Copyright 2025-2026 NXP
> > */
> >
> > #include <linux/dma-mapping.h>
> > @@ -21,11 +21,15 @@
> > /* Define NTMP Table ID */
> > #define NTMP_MAFT_ID 1
> > #define NTMP_RSST_ID 3
> > +#define NTMP_FDBT_ID 15
> >
> > /* Generic Update Actions for most tables */
> > #define NTMP_GEN_UA_CFGEU BIT(0)
> > #define NTMP_GEN_UA_STSEU BIT(1)
> >
> > +/* Query Action: 0: Full query, 1: Only query entry ID */
> > +#define NTMP_QA_ENTRY_ID 1
>
> Sashiko noted that the above comment looks inconsistent with the update
> code, where NTMP_QA_ENTRY_ID apparently uses a full query, and 0 just
> the entry ID.
>
The definition is correct, 0 indicates a full query, 1 indicates just query the
entry ID. It seems you misunderstood Sashiko's comment. Below is the
comment from Sashiko.
Since this command uses the NTMP_QA_ENTRY_ID ('Only query entry ID') query
action, the hardware returns only a 4-byte entry ID at offset 0. However,
in struct fdbt_resp_query, the entry_id field is located at offset 4,
following the status field.
I would say this is a false positive. Below is the response data structure of a
full query. NTMP_QA_ENTRY_ID does not mean the hardware will return
only a 4-byte entry ID at offset 0, it indicates the fields after entry_id will
not be present in the response data, such as keye, cfge, acte and resv.
struct fdbt_resp_query {
__le32 status;
__le32 entry_id;
struct fdbt_keye_data keye;
struct fdbt_cfge_data cfge;
u8 acte;
u8 resv[3];
};
^ permalink raw reply
* Re: [v2 PATCH] xfrm: ipcomp: Free destination pages on acomp errors
From: Orion Zhu @ 2026-05-06 6:33 UTC (permalink / raw)
To: Herbert Xu
Cc: Ren Wei, netdev, steffen.klassert, davem, edumazet, kuba, pabeni,
horms, yuantan098, yifanwucs, tomapufckgml, bird, ronbogo
In-Reply-To: <afm18k8_CKGOl0oT@gondor.apana.org.au>
On Tue, 5 May 2026 at 02:19, Herbert Xu <herbert@gondor.apana.org.au> wrote:
>
> On Tue, May 05, 2026 at 04:52:59PM +0800, Ren Wei wrote:
> > From: Yilin Zhu <zylzyl2333@gmail.com>
> >
> > ipcomp_setup_req() allocates destination pages for the acomp output
> > scatterlist. On successful completion, ipcomp_post_acomp() attaches the
> > used pages to the skb and frees any unused pages.
> >
> > On an acomp error, ipcomp_post_acomp() skips directly to freeing the
> > request. acomp_request_free() only releases the request itself, so the
> > caller-allocated destination pages are left allocated.
> >
> > Track the number of destination pages allocated for the request and free
> > them on the error path before releasing the request.
> >
> > Fixes: eb2953d26971 ("xfrm: ipcomp: Use crypto_acomp interface")
> > Cc: stable@kernel.org
> > Reported-by: Yuan Tan <yuantan098@gmail.com>
> > Reported-by: Yifan Wu <yifanwucs@gmail.com>
> > Reported-by: Juefei Pu <tomapufckgml@gmail.com>
> > Reported-by: Xin Liu <bird@lzu.edu.cn>
> > Co-developed-by: Peihan Liu <ronbogo@outlook.com>
> > Signed-off-by: Peihan Liu <ronbogo@outlook.com>
> > Signed-off-by: Yilin Zhu <zylzyl2333@gmail.com>
> > Signed-off-by: Ren Wei <n05ec@lzu.edu.cn>
> > ---
> > net/xfrm/xfrm_ipcomp.c | 16 +++++++++++++++-
> > 1 file changed, 15 insertions(+), 1 deletion(-)
>
> Thanks for the patch! How about just moving the out_free_req label:
>
> ---8<---
> Move the out_free_req label up by a couple of lines so that the
> allocated dst SG list gets freed on error as well as success.
>
> Fixes: eb2953d26971 ("xfrm: ipcomp: Use crypto_acomp interface")
> Cc: stable@kernel.org
> Reported-by: Yuan Tan <yuantan098@gmail.com>
> Reported-by: Yifan Wu <yifanwucs@gmail.com>
> Reported-by: Juefei Pu <tomapufckgml@gmail.com>
> Reported-by: Xin Liu <bird@lzu.edu.cn>
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
>
> diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c
> index 5f38dff16177..2947321b043d 100644
> --- a/net/xfrm/xfrm_ipcomp.c
> +++ b/net/xfrm/xfrm_ipcomp.c
> @@ -51,11 +51,12 @@ static int ipcomp_post_acomp(struct sk_buff *skb, int err, int hlen)
> struct scatterlist *dsg;
> int len, dlen;
>
> + extra = acomp_request_extra(req);
> + dsg = extra->sg;
> +
> if (unlikely(err))
> goto out_free_req;
>
> - extra = acomp_request_extra(req);
> - dsg = extra->sg;
> dlen = req->dlen;
>
> pskb_trim_unique(skb, 0);
> @@ -84,10 +85,10 @@ static int ipcomp_post_acomp(struct sk_buff *skb, int err, int hlen)
> skb_shinfo(skb)->nr_frags++;
> } while ((dlen -= len));
>
> +out_free_req:
> for (; dsg; dsg = sg_next(dsg))
> __free_page(sg_page(dsg));
>
> -out_free_req:
> acomp_request_free(req);
> return err;
> }
> --
> Email: Herbert Xu <herbert@gondor.apana.org.au>
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
Thanks, but I think moving the label is not safe for all paths reaching
ipcomp_post_acomp().
ipcomp_post_acomp() is also called when ipcomp_setup_req() returns an
ERR_PTR. In particular, acomp_request_alloc_extra() can fail after
ipcomp_cb(skb)->req is set to NULL, so moving acomp_request_extra(req)
before the error check would dereference NULL.
There are also setup failures after the request is allocated but before
the destination SG list contains any allocated pages. skb_to_sgvec() can
fail before sg_init_table(dsg, dnfrags), and alloc_page() can fail before
the first destination page is installed. In those cases walking extra->sg
with sg_next() and freeing sg_page(dsg) would operate on an uninitialized
or empty destination SG list, potentially freeing NULL or non-owned pages.
That is why the patch tracks the number of destination pages actually
allocated and frees exactly that count on the acomp error path.
Best regards,
Yilin Zhu
^ permalink raw reply
* Re: [PATCH 1/6] lib: include crc32.h conditionally on CONFIG_CRC32
From: Eric Biggers @ 2026-05-06 6:30 UTC (permalink / raw)
To: Arnd Bergmann
Cc: Yury Norov, Paul Walmsley, Palmer Dabbelt, Albert Ou,
Alexandre Ghiti, Yury Norov, Rasmus Villemoes, Andrew Lunn,
David S . Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Andrew Morton, Alexei Starovoitov, Daniel Borkmann,
Jesper Dangaard Brouer, John Fastabend, Stanislav Fomichev,
Ruan Jinjie, linux-kernel, linux-riscv, Linux-Arch, Netdev, bpf,
Nathan Chancellor
In-Reply-To: <abf4a586-b675-4e29-9570-fd3ed4158f58@app.fastmail.com>
On Mon, May 04, 2026 at 09:05:30PM +0200, Arnd Bergmann wrote:
> On Mon, May 4, 2026, at 20:32, Yury Norov wrote:
> > On Mon, May 04, 2026 at 07:18:49PM +0200, Arnd Bergmann wrote:
> >> On Mon, May 4, 2026, at 18:46, Yury Norov wrote:
> >> > Never heard about such a thing like "optional interface". And git grep
> >> > tends to second that...
> >>
> >> I meant any library interface that can be turned on or off
> >
> > So? If I disable CRC32, can I use the ether_crc()? In case of that
> > networking header, the answer is yes. In some other piece of code
> > the answer is no. Is that correct?
>
> Since it's a macro defined in terms of both bitrev32 and
> crc32_le, you can only call it from dead code, such as an
> inline function that is not itself used, or from inside of
> a block that is protected with IS_ENABLED(CONFIG_CRC32) etc.
>
> >> >>
> >> >> Don't add #ifdef blocks around headers. If the header cannot
> >> >> be included without side-effects, change the linux/crc32.h
> >> >> file instead of its users.
> >> >
> >> > linux/acpi.h does that like many others. What exactly is wrong with
> >> > protecting headers inclusion?
> >>
> >> There is no "protecting" here, you just add complexity to the
> >> build when headers are sometimes included indirectly but
> >> other times are not, depending on kernel configuration.
> >
> > Sorry, don't understand... I use the 'protecting' term with the meaning:
> > the functionality that is explicitly disabled should be never used.
> > Otherwise, what for we disable it?
>
> Arguably, both configuration symbols are at the point of not actually
> saving enough object code size to actually be worth the Kconfig
> dependencies.
>
> As long as we have CONFIG_CRC32 and CONFIG_BITREVERSE, the
> point of having the Kconfig symbols is to let drivers request
> the inclusion of the library helpers.
>
> >> It's unlikely to cause problems for the crc32.h header, but
> >> the acpi example definitely risks running into circular
> >> inclusions when you end up with some other header that depending
> >> on configuration ends up including linux/acpi.h while also
> >> being included indirectly from that one.
> >>
> >> >> It looks like the problem is the check for CONFIG_GENERIC_BITREVERSE
> >> >> in include/asm-generic/bitops/__bitrev.h, which ends up
> >> >> hiding the generic___bitrev32() helper without need.
> >> >>
> >> >> Simply removing the #ifdef there should avoid the build failure.
> >> >
> >> > OK, it seems like this is what I don't understand.
> >> >
> >> > We've got an optional feature, like CRC32, which is enabled by
> >> > CONFIG_CRC32. The most conservative way is to declare everything
> >> > CRC32-related in the corresponding header, and then protect the header
> >> > with IS_ENABLED(CONFIG_CRC32).
> >> >
> >> > I understand that from practical perspective, we can declare some simple
> >> > macros, like header size, unprotected. But what we've got now is a sort
> >> > of mess: all CRC32-related functions are declared unprotected, and
> >> > generic headers are good to use them. Compiler is happy while those
> >> > functions are actually unused. Next, CRC32 depends on BITREVERSE, which
> >> > is again unprotected, and it may optionally have an arch implementation.
> >> >
> >> > So if arch bitrev() is implemented, you can use part of bitreverse and
> >> > crc32 APIs despite that they are explicitly disabled - just because they
> >> > are implemented as macros in unprotected headers. And you cannot use some
> >> > others - because they are implemented differently, as a real functions.
> >>
> >> I think you trying to solve a non-problem here.
> >
> > This was reported by Nathan for tinyconfig. At least x86 and s390 are
> > affected.
> >
> > https://lore.kernel.org/all/20260429202922.GA3575295@ax162/
> >
> > Is tinyconfig important?
>
> Nathan reported a build regression caused by a small mistake
> in 596a9ea9015b ("bitops: Define generic __bitrev8/16/32 for reuse"),
> which of course needs to be fixed.
>
> What I meant is that there is no reason to not use the obvious
> fix and do
>
> --- a/include/asm-generic/bitops/__bitrev.h
> +++ b/include/asm-generic/bitops/__bitrev.h
> @@ -2,7 +2,6 @@
> #ifndef _ASM_GENERIC_BITOPS___BITREV_H_
> #define _ASM_GENERIC_BITOPS___BITREV_H_
>
> -#ifdef CONFIG_GENERIC_BITREVERSE
> #include <asm/types.h>
>
> extern u8 const byte_rev_table[256];
> @@ -20,6 +19,5 @@ static __always_inline __attribute_const__ u32 generic___bitrev32(u32 x)
> {
> return (generic___bitrev16(x & 0xffff) << 16) | generic___bitrev16(x >> 16);
> }
> -#endif /* CONFIG_GENERIC_BITREVERSE */
>
> #endif /* _ASM_GENERIC_BITOPS___BITREV_H_ */
>
> > Right now half CRC32 is available if CONFIG_CRC32 is on, and half is
> > not available. The bitreverse is the same. If HAVE_ARCH_BITREVERSE is
> > enabled, one can use the API, bypassing the BITREVERSE. This doesn't
> > sound right to me long-term.
> >
> > Whatever this ends up, let's figure out a consistent solution please?
>
> I really don't think we need any sort of solution here, aside from
> the trivial regression fix that returns it to the previous working
> state.
I agree with Arnd. The standard practice in the kernel is for headers
to be includeable even when the associated kconfig (if any) isn't
enabled. In such a case usually the inline functions, macros, and the
declarations of out-of-line functions and global variables are still
made available in the header.
The point of the kconfig symbol is just to build the file containing the
definitions of out-of-line functions and global variables.
Yes, when those definitions are needed and calling code forgets to
select that kconfig symbol, a build error results. But it's "only" a
build error; those get found and fixed relatively easily. The
alternative of hiding everything would mean a lot more #ifdefs in other
code, which ends up being very messy.
- Eric
^ permalink raw reply
* Re: [PATCH iwl-next v4 3/3] igc: add support for forcing link speed without autonegotiation
From: Abdul Rahim, Faizal @ 2026-05-06 6:25 UTC (permalink / raw)
To: Simon Horman, khai.wen.tan
Cc: anthony.l.nguyen, andrew+netdev, davem, edumazet, kuba, pabeni,
intel-wired-lan, netdev, linux-kernel, faizal.abdul.rahim,
hong.aun.looi, khai.wen.tan
In-Reply-To: <20260430135057.1653330-1-horms@kernel.org>
On 30/4/2026 9:50 pm, Simon Horman wrote:
> From: 'Simon Horman' <horms@kernel.org>
>
> This is an AI-generated review of your patch. The human sending this
> email has considered the AI review valid, or at least plausible.
> Full review at: https://sashiko.dev
> ---
> igc: add support for forcing link speed without autonegotiation
>
> This commit allows users to force 10/100 Mb/s link speed and duplex via
> ethtool when autonegotiation is disabled. Previously, the driver rejected
> these requests.
>
>> diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c
>> index cfcbf2fdad6e..9997ebbdf778 100644
>> --- a/drivers/net/ethernet/intel/igc/igc_ethtool.c
>> +++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c
> [ ... ]
>> @@ -2000,6 +2013,41 @@ static int igc_ethtool_get_link_ksettings(struct net_device *netdev,
>> return 0;
>> }
>>
>> +/**
>> + * igc_handle_autoneg_disabled - Configure forced speed/duplex settings
>> + * @adapter: private driver structure
>> + * @speed: requested speed (must be SPEED_10 or SPEED_100)
>> + * @duplex: requested duplex
>> + *
>> + * Records forced speed/duplex when autoneg is disabled.
>> + * Caller must validate speed before calling this function.
>> + */
>> +static void igc_handle_autoneg_disabled(struct igc_adapter *adapter, u32 speed,
>> + u8 duplex)
>> +{
>> + struct igc_mac_info *mac = &adapter->hw.mac;
>> +
>> + switch (speed) {
>> + case SPEED_10:
>> + mac->forced_speed_duplex = (duplex == DUPLEX_FULL) ?
>> + IGC_FORCED_10F : IGC_FORCED_10H;
>> + break;
>> + case SPEED_100:
>> + mac->forced_speed_duplex = (duplex == DUPLEX_FULL) ?
>> + IGC_FORCED_100F : IGC_FORCED_100H;
>> + break;
>> + default:
>> + WARN_ONCE(1, "Unsupported speed %u\n", speed);
>> + return;
>> + }
>> +
>> + mac->autoneg_enabled = false;
>> +
>> + /* Half-duplex cannot support flow control per IEEE 802.3 */
>> + if (duplex != DUPLEX_FULL)
>> + adapter->hw.fc.requested_mode = igc_fc_none;
>
> Does this permanently overwrite the user's flow control preference?
>
> If half-duplex is requested, requested_mode is set to igc_fc_none. Because
> requested_mode represents the persistent state for the user's flow control
> preference, the user might permanently lose their configuration. If they later
> switch back to full-duplex, requested_mode remains igc_fc_none, and flow
> control stays disabled.
>
> Would it be better to dynamically override hw->fc.current_mode to
> igc_fc_none during link configuration instead of mutating requested_mode?
>
Yes you're right, thanks, will update.
>> +}
> [ ... ]
>> @@ -2071,14 +2126,20 @@ igc_ethtool_set_link_ksettings(struct net_device *netdev,
>> }
>> }
>>
>> + if (cmd->base.autoneg == AUTONEG_DISABLE &&
>> + cmd->base.speed != SPEED_10 && cmd->base.speed != SPEED_100) {
>> + netdev_info(dev, "Unsupported speed for forced link\n");
>> + return -EINVAL;
>> + }
>> +
>
> Should we also validate cmd->base.duplex here to prevent a fallback to
> half-duplex?
>
> If a user runs a command like "ethtool -s eth0 speed 100 autoneg off" while
> the link is down, ethtool queries and preserves the current duplex value,
> which is DUPLEX_UNKNOWN (0xff).
>
> In igc_handle_autoneg_disabled(), the code checks if duplex == DUPLEX_FULL,
> which evaluates to false, and forces the interface to 100 Mbps half-duplex.
>
> Could we explicitly check that cmd->base.duplex is either DUPLEX_HALF or
> DUPLEX_FULL and return -EINVAL otherwise?
Will update.
>
>> while (test_and_set_bit(__IGC_RESETTING, &adapter->state))
>> usleep_range(1000, 2000);
>
> This isn't a bug introduced by this patch, but does this ad-hoc
> synchronization loop reimplement locking without fairness or lockdep support?
>
> According to the Networking Subsystem Guidelines, using bit operations as a
> substitute for real locks violates core kernel locking rules and introduces
> potential races.
>
> Could this be replaced with a proper concurrency mechanism such as a mutex?
>
Hi Simon, you've raised this issue in v2, and after discussion, you've
agreed that this change doesn't belong in this patch series.
Not sure if I missed anything?
>>
>> - if (cmd->base.autoneg == AUTONEG_ENABLE) {
>> + if (cmd->base.autoneg == AUTONEG_ENABLE)
>> igc_handle_autoneg_enabled(adapter, cmd);
>> - } else {
>> - netdev_info(dev, "Force mode currently not supported\n");
>> - }
>> + else
>> + igc_handle_autoneg_disabled(adapter, cmd->base.speed,
>> + cmd->base.duplex);
>>
>> /* MDI-X => 2; MDI => 1; Auto => 3 */
>> if (cmd->base.eth_tp_mdix_ctrl) {
>
^ permalink raw reply
* Re: [PATCH iwl-next v4 0/3] igc: add support for forcing link speed without autonegotiation
From: Abdul Rahim, Faizal @ 2026-05-06 6:21 UTC (permalink / raw)
To: David Laight, KhaiWenTan
Cc: anthony.l.nguyen, andrew+netdev, davem, edumazet, kuba, pabeni,
intel-wired-lan, netdev, linux-kernel, faizal.abdul.rahim,
hong.aun.looi, khai.wen.tan
In-Reply-To: <20260430154105.505739ac@pumpkin>
On 30/4/2026 10:41 pm, David Laight wrote:
> On Tue, 28 Apr 2026 14:00:06 +0800
> KhaiWenTan <khai.wen.tan@linux.intel.com> wrote:
>
>> From: Faizal Rahim <faizal.abdul.rahim@linux.intel.com>
>>
>> This series adds support for forcing 10/100 Mb/s link speed via ethtool
>> when autonegotiation is disabled on the igc driver.
>
> I'll ask 'why' ?
>
> In particular forcing half/full duplex has always been a very good way
> of 'breaking' a network connection.
>
> It really is much better to restrict the advertised link modes and let
> the autodetect/autonegotiation logic in the phy/mac do its job.
>
> About the only thing I can think of is to force 10M HDX when connected
> to a remote system that supports 10M/100M HDX.
> In that case you need to send out single link test pulses, not the
> burst used to identify 100M HDX, or the pattern encoded on the burst
> used by autonegotiation.
> But you need to go back to the mid 1990s to find such systems.
> Anything that supports FDX will do autonegotiation.
>
> David
>
There's a use case requested:
Profinet Certification tool reports that forcing a link speed without
auto-negotiation is not working.
Forcing the link speed is a critical feature for the industrial automation
"fast-start" use case. When there is a connection lost, the system must
come back up as fast as possible. In PROFINET, that means to force the
speed and rejoin the controller loops. Without supporting forcing the speed
to 100M in Foxville, the certification tool would not be able to certify
the availability of this feature.
I'm hoping this context is enough to justify the need?
^ permalink raw reply
* Re: [PATCH v1 net] ipmr: Free mr_table after RCU grace period.
From: Kuniyuki Iwashima @ 2026-05-06 6:20 UTC (permalink / raw)
To: Lai, Yi
Cc: David S. Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Simon Horman, Kuniyuki Iwashima, netdev,
linux-kernel
In-Reply-To: <afrY34dLXNUboevf@ly-workstation>
On Tue, May 5, 2026 at 11:00 PM Lai, Yi <yi1.lai@intel.com> wrote:
>
> On Thu, Apr 23, 2026 at 05:34:54AM +0000, Kuniyuki Iwashima wrote:
> > With CONFIG_IP_MROUTE_MULTIPLE_TABLES=n, ipmr_fib_lookup()
> > does not check if net->ipv4.mrt is NULL.
> >
> > Since default_device_exit_batch() is called after ->exit_rtnl(),
> > a device could receive IGMP packets and access net->ipv4.mrt
> > during/after ipmr_rules_exit_rtnl().
> >
> > If ipmr_rules_exit_rtnl() had already cleared it and freed the
> > memory, the access would trigger null-ptr-deref or use-after-free.
> >
> > Let's fix it by using RCU helper and free mrt after RCU grace
> > period.
> >
> > In addition, check_net(net) is added to mroute_clean_tables()
> > and ipmr_cache_unresolved() to synchronise via mfc_unres_lock.
> > This prevents ipmr_cache_unresolved() from putting skb into
> > c->_c.mfc_un.unres.unresolved after mroute_clean_tables()
> > purges it.
> >
> > For the same reason, timer_shutdown_sync() is moved after
> > mroute_clean_tables().
> >
> > Since rhltable_destroy() holds mutex internally, rcu_work is
> > used, and it is placed as the first member because rcu_head
> > must be placed within <4K offset. mr_table is already 3864
> > bytes without rcu_work.
> >
> > Note that IP6MR is not yet converted to ->exit_rtnl(), so this
> > change is not needed for now but will be.
> >
> > Fixes: b22b01867406 ("ipmr: Convert ipmr_net_exit_batch() to ->exit_rtnl().")
> > Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
>
> Hi Kuniyuki Iwashima,
>
> Greetings!
>
> I used Syzkaller and found that there is WARNING: suspicious RCU usage in reg_vif_xmit in linux-next next-20260505.
>
> After bisection, the first bad commit is:
> "
> b3b6babf4751 ipmr: Free mr_table after RCU grace period
> "
>
> All detailed info can be found at:
> https://github.com/laifryiee/syzkaller_logs/tree/main/260506_091248_reg_vif_xmit
> Syzkaller repro code:
> https://github.com/laifryiee/syzkaller_logs/tree/main/260506_091248_reg_vif_xmit/repro.c
> Syzkaller repro syscall steps:
> https://github.com/laifryiee/syzkaller_logs/tree/main/260506_091248_reg_vif_xmit/repro.prog
> Syzkaller report:
> https://github.com/laifryiee/syzkaller_logs/tree/main/260506_091248_reg_vif_xmit/repro.report
> Kconfig(make olddefconfig):
> https://github.com/laifryiee/syzkaller_logs/tree/main/260506_091248_reg_vif_xmit/kconfig_origin
> Bisect info:
> https://github.com/laifryiee/syzkaller_logs/tree/main/260506_091248_reg_vif_xmit/bisect_info.log
> bzImage:
> https://github.com/laifryiee/syzkaller_logs/raw/refs/heads/main/260506_091248_reg_vif_xmit/bzImage_next-20260505
> Issue dmesg:
> https://github.com/laifryiee/syzkaller_logs/blob/main/260506_091248_reg_vif_xmit/next-20260505_dmesg.log
>
> "
> [ 18.611146] =============================
> [ 18.611406] WARNING: suspicious RCU usage
> [ 18.611657] 7.1.0-rc2-next-20260505-next-2026050 #1 Not tainted
> [ 18.612022] -----------------------------
> [ 18.612289] net/ipv4/ipmr.c:329 suspicious rcu_dereference_check() usage!
> [ 18.612755]
> [ 18.612755] other info that might help us debug this:
> [ 18.612755]
> [ 18.613314]
> [ 18.613314] rcu_scheduler_active = 2, debug_locks = 1
> [ 18.613758] 2 locks held by repro/725:
> [ 18.614195] #0: ffffffff87896440 (rcu_read_lock_bh){....}-{1:3}, at: __dev_queue_xmit+0x239/0x4140
> [ 18.614860] #1: ff1100000df5b918 (_xmit_PIMREG#2){+...}-{3:3}, at: __dev_queue_xmit+0x1d5d/0x4140
> [ 18.615505]
> [ 18.615505] stack backtrace:
> [ 18.615814] CPU: 0 UID: 0 PID: 725 Comm: repro Not tainted 7.1.0-rc2-next-20260505-next-2026050 #1 PREEMPT(lazy)
> [ 18.615826] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu4
> [ 18.615831] Call Trace:
> [ 18.615834] <TASK>
> [ 18.615838] dump_stack_lvl+0x121/0x150
> [ 18.615853] dump_stack+0x19/0x20
> [ 18.615864] lockdep_rcu_suspicious+0x15b/0x1f0
> [ 18.615882] reg_vif_xmit+0x2ee/0x3c0
Thanks for the report.
I'll just move up rcu_read_lock() in reg_vif_xmit().
ipmr_fib_lookup() for CONFIG_IP_MROUTE_MULTIPLE_TABLES=y
calls rcu_read_lock() at the same timing anyway.
^ permalink raw reply
* Re: [Intel-wired-lan] [PATCH iwl-next v4 1/3] igc: remove unused autoneg_failed field
From: Abdul Rahim, Faizal @ 2026-05-06 6:07 UTC (permalink / raw)
To: Paul Menzel
Cc: khai.wen.tan, anthony.l.nguyen, andrew+netdev, davem, edumazet,
kuba, pabeni, intel-wired-lan, netdev, linux-kernel,
faizal.abdul.rahim, hong.aun.looi, khai.wen.tan,
Aleksandr Loktionov
In-Reply-To: <d3d4915c-1bc5-4e04-bfc4-9d9787849c6f@molgen.mpg.de>
On 28/4/2026 11:06 pm, Paul Menzel wrote:
> Dear Faizal,
>
>
> Am 28.04.26 um 12:39 schrieb Abdul Rahim, Faizal:
>
>> On 28/4/2026 2:56 pm, Paul Menzel wrote:
>
>>> Am 28.04.26 um 08:00 schrieb KhaiWenTan:
>>>
>>> (Should spaces be added in your name?)
>>>
>>>> From: Faizal Rahim <faizal.abdul.rahim@linux.intel.com>
>>>>
>>>> autoneg_failed in struct igc_mac_info is never set in the igc driver.
>>>> Remove the field and the dead code checking it in
>>>> igc_config_fc_after_link_up().
>>>
>>> Could you please elaborate. Why is removal the correct fix, and it’s not
>>> an incomplete feature? Does auto-negotiation always succeed?
>>
>> Auto-negotiation does not always succeed, but igc does not use
>> autoneg_failed to handle that case, the field was never set anywhere
>> in the igc driver.
>>
>> Before this patch, the only igc references to autoneg_failed were
>> the struct member declaration and the read in
>> igc_config_fc_after_link_up(). No igc code ever assigned it to true,
>> and git history shows no commit that added a setter since the code
>> creation in 2018.
>>
>> The field originates from the e1000/e1000e fiber/serdes forced-link
>> path: when MAC-level auto-negotiation on fiber times out, the driver
>> forces link up and sets autoneg_failed so the flow-control code knows
>> pause was not negotiated and must be forced. igc has no fiber or
>> serdes media, it only supports copper (igc_media_type_copper), so
>> the code that sets autoneg_failed was never ported.
>>
>> On copper, PHY auto-negotiation failure is handled differently:
>> - No link at all: igc_check_for_copper_link() returns before reaching
>> flow-control configuration, there's nothing to configure FC on.
>> - Link present but autoneg not yet complete:
>> igc_config_fc_after_link_up() checks MII_SR_AUTONEG_COMPLETE and
>> returns early without resolving pause. The next link-status event
>> re-triggers the check.
>> - Autoneg completes (including via parallel detection fallback when
>> the link partner doesn't autoneg): the PHY still sets
>> AUTONEG_COMPLETE but LP_ABILITY won't have PAUSE bits since the
>> partner never sent autoneg pages. The existing flow-control logic
>> in igc_config_fc_after_link_up() handles that correctly, it falls
>> through to igc_fc_none or igc_fc_rx_pause based on requested_mode.
>>
>> None of these paths need autoneg_failed. Keeping the field would be
>> misleading to readers.
>
> Thank you. For me the information about just supporting copper would be
> great to have in the commit message.
Will update.
>
>>>> Reviewed-by: Looi, Hong Aun <hong.aun.looi@intel.com>
>>>
>>> Please order it to not use the comma: Hong Aun Looi
>>
>> Will do, thanks.
>>
>>>> Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
>>>> Signed-off-by: Faizal Rahim <faizal.abdul.rahim@linux.intel.com>
>>>> Signed-off-by: KhaiWenTan <khai.wen.tan@linux.intel.com>
>>>> ---
>>>> drivers/net/ethernet/intel/igc/igc_hw.h | 1 -
>>>> drivers/net/ethernet/intel/igc/igc_mac.c | 16 +---------------
>>>> 2 files changed, 1 insertion(+), 16 deletions(-)
>>>>
>>>> diff --git a/drivers/net/ethernet/intel/igc/igc_hw.h b/drivers/net/
>>>> ethernet/intel/igc/igc_hw.h
>>>> index be8a49a86d09..86ab8f566f44 100644
>>>> --- a/drivers/net/ethernet/intel/igc/igc_hw.h
>>>> +++ b/drivers/net/ethernet/intel/igc/igc_hw.h
>>>> @@ -92,7 +92,6 @@ struct igc_mac_info {
>>>> bool asf_firmware_present;
>>>> bool arc_subsystem_valid;
>>>>
>>>> - bool autoneg_failed;
>>>> bool get_link_status;
>>>> };
>>>>
>>>> diff --git a/drivers/net/ethernet/intel/igc/igc_mac.c b/drivers/net/
>>>> ethernet/intel/igc/igc_mac.c
>>>> index 7ac6637f8db7..142beb9ae557 100644
>>>> --- a/drivers/net/ethernet/intel/igc/igc_mac.c
>>>> +++ b/drivers/net/ethernet/intel/igc/igc_mac.c
>>>> @@ -438,28 +438,14 @@ void igc_config_collision_dist(struct igc_hw *hw)
>>>> * Checks the status of auto-negotiation after link up to ensure that
>>>> the
>
> Just for your information, your mailer wraps the lines of the quotes.
Ohh okay, let me check, thanks!
> […]
>
>>>> * speed and duplex were not forced. If the link needed to be
>>>> forced, then
>>>> * flow control needs to be forced also. If auto-negotiation is enabled
>>>> - * and did not fail, then we configure flow control based on our link
>>>> - * partner.
>>>> + * then we configure flow control based on our link partner.
>>>> */
>>>> s32 igc_config_fc_after_link_up(struct igc_hw *hw)
>>>> {
>>>> u16 mii_status_reg, mii_nway_adv_reg, mii_nway_lp_ability_reg;
>>>> - struct igc_mac_info *mac = &hw->mac;
>>>> u16 speed, duplex;
>>>> s32 ret_val = 0;
>>>>
>>>> - /* Check for the case where we have fiber media and auto-neg failed
>>>> - * so we had to force link. In this case, we need to force the
>>>> - * configuration of the MAC to match the "fc" parameter.
>>>> - */
>>>> - if (mac->autoneg_failed)
>>>> - ret_val = igc_force_mac_fc(hw);
>>>> -
>>>> - if (ret_val) {
>>>> - hw_dbg("Error forcing flow control settings\n");
>>>> - goto out;
>>>> - }
>>>> -
>>>> /* In auto-neg, we need to check and see if Auto-Neg has completed,
>>>> * and if so, how the PHY and link partner has flow control
>>>> * configured.
>
> Kind regards,
>
> Paul
>
^ permalink raw reply
* Re: [PATCH v1 net] ipmr: Free mr_table after RCU grace period.
From: Lai, Yi @ 2026-05-06 5:59 UTC (permalink / raw)
To: Kuniyuki Iwashima
Cc: David S. Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Simon Horman, Kuniyuki Iwashima, netdev,
linux-kernel
In-Reply-To: <20260423053456.4097409-1-kuniyu@google.com>
On Thu, Apr 23, 2026 at 05:34:54AM +0000, Kuniyuki Iwashima wrote:
> With CONFIG_IP_MROUTE_MULTIPLE_TABLES=n, ipmr_fib_lookup()
> does not check if net->ipv4.mrt is NULL.
>
> Since default_device_exit_batch() is called after ->exit_rtnl(),
> a device could receive IGMP packets and access net->ipv4.mrt
> during/after ipmr_rules_exit_rtnl().
>
> If ipmr_rules_exit_rtnl() had already cleared it and freed the
> memory, the access would trigger null-ptr-deref or use-after-free.
>
> Let's fix it by using RCU helper and free mrt after RCU grace
> period.
>
> In addition, check_net(net) is added to mroute_clean_tables()
> and ipmr_cache_unresolved() to synchronise via mfc_unres_lock.
> This prevents ipmr_cache_unresolved() from putting skb into
> c->_c.mfc_un.unres.unresolved after mroute_clean_tables()
> purges it.
>
> For the same reason, timer_shutdown_sync() is moved after
> mroute_clean_tables().
>
> Since rhltable_destroy() holds mutex internally, rcu_work is
> used, and it is placed as the first member because rcu_head
> must be placed within <4K offset. mr_table is already 3864
> bytes without rcu_work.
>
> Note that IP6MR is not yet converted to ->exit_rtnl(), so this
> change is not needed for now but will be.
>
> Fixes: b22b01867406 ("ipmr: Convert ipmr_net_exit_batch() to ->exit_rtnl().")
> Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Hi Kuniyuki Iwashima,
Greetings!
I used Syzkaller and found that there is WARNING: suspicious RCU usage in reg_vif_xmit in linux-next next-20260505.
After bisection, the first bad commit is:
"
b3b6babf4751 ipmr: Free mr_table after RCU grace period
"
All detailed info can be found at:
https://github.com/laifryiee/syzkaller_logs/tree/main/260506_091248_reg_vif_xmit
Syzkaller repro code:
https://github.com/laifryiee/syzkaller_logs/tree/main/260506_091248_reg_vif_xmit/repro.c
Syzkaller repro syscall steps:
https://github.com/laifryiee/syzkaller_logs/tree/main/260506_091248_reg_vif_xmit/repro.prog
Syzkaller report:
https://github.com/laifryiee/syzkaller_logs/tree/main/260506_091248_reg_vif_xmit/repro.report
Kconfig(make olddefconfig):
https://github.com/laifryiee/syzkaller_logs/tree/main/260506_091248_reg_vif_xmit/kconfig_origin
Bisect info:
https://github.com/laifryiee/syzkaller_logs/tree/main/260506_091248_reg_vif_xmit/bisect_info.log
bzImage:
https://github.com/laifryiee/syzkaller_logs/raw/refs/heads/main/260506_091248_reg_vif_xmit/bzImage_next-20260505
Issue dmesg:
https://github.com/laifryiee/syzkaller_logs/blob/main/260506_091248_reg_vif_xmit/next-20260505_dmesg.log
"
[ 18.611146] =============================
[ 18.611406] WARNING: suspicious RCU usage
[ 18.611657] 7.1.0-rc2-next-20260505-next-2026050 #1 Not tainted
[ 18.612022] -----------------------------
[ 18.612289] net/ipv4/ipmr.c:329 suspicious rcu_dereference_check() usage!
[ 18.612755]
[ 18.612755] other info that might help us debug this:
[ 18.612755]
[ 18.613314]
[ 18.613314] rcu_scheduler_active = 2, debug_locks = 1
[ 18.613758] 2 locks held by repro/725:
[ 18.614195] #0: ffffffff87896440 (rcu_read_lock_bh){....}-{1:3}, at: __dev_queue_xmit+0x239/0x4140
[ 18.614860] #1: ff1100000df5b918 (_xmit_PIMREG#2){+...}-{3:3}, at: __dev_queue_xmit+0x1d5d/0x4140
[ 18.615505]
[ 18.615505] stack backtrace:
[ 18.615814] CPU: 0 UID: 0 PID: 725 Comm: repro Not tainted 7.1.0-rc2-next-20260505-next-2026050 #1 PREEMPT(lazy)
[ 18.615826] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu4
[ 18.615831] Call Trace:
[ 18.615834] <TASK>
[ 18.615838] dump_stack_lvl+0x121/0x150
[ 18.615853] dump_stack+0x19/0x20
[ 18.615864] lockdep_rcu_suspicious+0x15b/0x1f0
[ 18.615882] reg_vif_xmit+0x2ee/0x3c0
[ 18.615898] dev_hard_start_xmit+0x170/0x700
[ 18.615915] __dev_queue_xmit+0x1df1/0x4140
[ 18.615931] ? __might_fault+0x14a/0x1b0
[ 18.615943] ? __this_cpu_preempt_check+0x21/0x30
[ 18.615961] ? __pfx___dev_queue_xmit+0x10/0x10
[ 18.615977] ? _copy_from_iter+0x288/0x15e0
[ 18.615989] ? __virt_addr_valid+0x22c/0x420
[ 18.616004] ? __virt_addr_valid+0x22c/0x420
[ 18.616018] ? __this_cpu_preempt_check+0x21/0x30
[ 18.616030] ? __pfx__copy_from_iter+0x10/0x10
[ 18.616048] ? __sanitizer_cov_trace_const_cmp1+0x1e/0x30
[ 18.616064] ? packet_parse_headers+0x439/0x7b0
[ 18.616076] ? packet_parse_headers+0x202/0x7b0
[ 18.616088] ? __pfx_packet_parse_headers+0x10/0x10
[ 18.616103] packet_xmit+0x252/0x370
[ 18.616119] packet_sendmsg+0x39ad/0x5650
[ 18.616136] ? __lock_acquire+0x412/0x2390
[ 18.616174] ? __pfx_packet_sendmsg+0x10/0x10
[ 18.616189] ? audit_watch_handle_event+0x130/0x900
[ 18.616201] ? __import_iovec+0x1df/0x660
[ 18.616213] ? _copy_from_user+0x75/0xa0
[ 18.616229] ? __pfx_packet_sendmsg+0x10/0x10
[ 18.616242] ____sys_sendmsg+0xa21/0xba0
[ 18.616257] ? __pfx_____sys_sendmsg+0x10/0x10
[ 18.616274] ? __this_cpu_preempt_check+0x21/0x30
[ 18.616285] ? lock_release+0x14f/0x2c0
[ 18.616305] ___sys_sendmsg+0x121/0x1c0
[ 18.616322] ? __pfx____sys_sendmsg+0x10/0x10
[ 18.616347] ? __handle_mm_fault+0x656/0x2cb0
[ 18.616388] __sys_sendmsg+0x177/0x220
[ 18.616403] ? __pfx___sys_sendmsg+0x10/0x10
[ 18.616428] ? seqcount_lockdep_reader_access.constprop.0+0xc0/0xd0
[ 18.616440] ? __sanitizer_cov_trace_cmp4+0x1a/0x20
[ 18.616453] ? ktime_get_coarse_real_ts64+0xad/0xf0
[ 18.616471] __x64_sys_sendmsg+0x80/0xc0
[ 18.616487] x64_sys_call+0x1d9c/0x21c0
[ 18.616499] do_syscall_64+0xc1/0x1020
[ 18.616517] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[ 18.616527] RIP: 0033:0x7f93b863ee5d
[ 18.616536] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c8
[ 18.616546] RSP: 002b:00007fff211cf048 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
[ 18.616555] RAX: ffffffffffffffda RBX: 0000200000000380 RCX: 00007f93b863ee5d
[ 18.616561] RDX: 0000000000000000 RSI: 00002000000012c0 RDI: 0000000000000004
[ 18.616567] RBP: 00007fff211cf070 R08: 0000000200000000 R09: 0000000200000000
[ 18.616573] R10: 0000000200000000 R11: 0000000000000246 R12: 00007fff211cf188
[ 18.616579] R13: 0000000000401164 R14: 0000000000403e08 R15: 00007f93b886e000
[ 18.616601] </TASK>
"
Hope this could be insightful to you.
Regards,
Yi Lai
---
If you don't need the following environment to reproduce the problem or if you
already have one reproduced environment, please ignore the following information.
How to reproduce:
git clone https://gitlab.com/xupengfe/repro_vm_env.git
cd repro_vm_env
tar -xvf repro_vm_env.tar.gz
cd repro_vm_env; ./start3.sh // it needs qemu-system-x86_64 and I used v7.1.0
// start3.sh will load bzImage_2241ab53cbb5cdb08a6b2d4688feb13971058f65 v6.2-rc5 kernel
// You could change the bzImage_xxx as you want
// Maybe you need to remove line "-drive if=pflash,format=raw,readonly=on,file=./OVMF_CODE.fd \" for different qemu version
You could use below command to log in, there is no password for root.
ssh -p 10023 root@localhost
After logging in to the vm (virtual machine) successfully, you can transfer the
reproducer binary to the vm as shown below, and reproduce the problem in the vm:
gcc -pthread -o repro repro.c
scp -P 10023 repro root@localhost:/root/
Get the bzImage for target kernel:
Please use target kconfig and copy it to kernel_src/.config
make olddefconfig
make -jx bzImage //x should be equal to or less than the number of CPUs your PC has
Fill the bzImage file into above start3.sh to load the target kernel in vm.
Tips:
If you already have qemu-system-x86_64, please ignore below info.
If you want to install qemu v7.1.0 version:
git clone https://github.com/qemu/qemu.git
cd qemu
git checkout -f v7.1.0
mkdir build
cd build
yum install -y ninja-build.x86_64
yum -y install libslirp-devel.x86_64
../configure --target-list=x86_64-softmmu --enable-kvm --enable-vnc --enable-gtk --enable-sdl --enable-usb-redir --enable-slirp
make
make install
> ---
> include/linux/mroute_base.h | 3 +
> net/ipv4/ipmr.c | 108 +++++++++++++++++++-----------------
> net/ipv4/ipmr_base.c | 16 ++++++
> 3 files changed, 77 insertions(+), 50 deletions(-)
>
> diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
> index cf3374580f74..5d75cc5b057e 100644
> --- a/include/linux/mroute_base.h
> +++ b/include/linux/mroute_base.h
> @@ -226,6 +226,7 @@ struct mr_table_ops {
>
> /**
> * struct mr_table - a multicast routing table
> + * @work: used for table destruction
> * @list: entry within a list of multicast routing tables
> * @net: net where this table belongs
> * @ops: protocol specific operations
> @@ -243,6 +244,7 @@ struct mr_table_ops {
> * @mroute_reg_vif_num: PIM-device vif index
> */
> struct mr_table {
> + struct rcu_work work;
> struct list_head list;
> possible_net_t net;
> struct mr_table_ops ops;
> @@ -274,6 +276,7 @@ void vif_device_init(struct vif_device *v,
> unsigned short flags,
> unsigned short get_iflink_mask);
>
> +void mr_table_free(struct mr_table *mrt);
> struct mr_table *
> mr_table_alloc(struct net *net, u32 id,
> struct mr_table_ops *ops,
> diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
> index 8a08d09b4c30..2058ca860294 100644
> --- a/net/ipv4/ipmr.c
> +++ b/net/ipv4/ipmr.c
> @@ -151,16 +151,6 @@ static struct mr_table *__ipmr_get_table(struct net *net, u32 id)
> return NULL;
> }
>
> -static struct mr_table *ipmr_get_table(struct net *net, u32 id)
> -{
> - struct mr_table *mrt;
> -
> - rcu_read_lock();
> - mrt = __ipmr_get_table(net, id);
> - rcu_read_unlock();
> - return mrt;
> -}
> -
> static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
> struct mr_table **mrt)
> {
> @@ -293,7 +283,7 @@ static void __net_exit ipmr_rules_exit_rtnl(struct net *net,
> struct mr_table *mrt, *next;
>
> list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
> - list_del(&mrt->list);
> + list_del_rcu(&mrt->list);
> ipmr_free_table(mrt, dev_kill_list);
> }
> }
> @@ -315,28 +305,30 @@ bool ipmr_rule_default(const struct fib_rule *rule)
> }
> EXPORT_SYMBOL(ipmr_rule_default);
> #else
> -#define ipmr_for_each_table(mrt, net) \
> - for (mrt = net->ipv4.mrt; mrt; mrt = NULL)
> -
> static struct mr_table *ipmr_mr_table_iter(struct net *net,
> struct mr_table *mrt)
> {
> if (!mrt)
> - return net->ipv4.mrt;
> + return rcu_dereference(net->ipv4.mrt);
> return NULL;
> }
>
> -static struct mr_table *ipmr_get_table(struct net *net, u32 id)
> +static struct mr_table *__ipmr_get_table(struct net *net, u32 id)
> {
> - return net->ipv4.mrt;
> + return rcu_dereference_check(net->ipv4.mrt,
> + lockdep_rtnl_is_held() ||
> + !rcu_access_pointer(net->ipv4.mrt));
> }
>
> -#define __ipmr_get_table ipmr_get_table
> +#define ipmr_for_each_table(mrt, net) \
> + for (mrt = __ipmr_get_table(net, 0); mrt; mrt = NULL)
>
> static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
> struct mr_table **mrt)
> {
> - *mrt = net->ipv4.mrt;
> + *mrt = rcu_dereference(net->ipv4.mrt);
> + if (!*mrt)
> + return -EAGAIN;
> return 0;
> }
>
> @@ -347,7 +339,8 @@ static int __net_init ipmr_rules_init(struct net *net)
> mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
> if (IS_ERR(mrt))
> return PTR_ERR(mrt);
> - net->ipv4.mrt = mrt;
> +
> + rcu_assign_pointer(net->ipv4.mrt, mrt);
> return 0;
> }
>
> @@ -358,9 +351,10 @@ static void __net_exit ipmr_rules_exit(struct net *net)
> static void __net_exit ipmr_rules_exit_rtnl(struct net *net,
> struct list_head *dev_kill_list)
> {
> - ipmr_free_table(net->ipv4.mrt, dev_kill_list);
> + struct mr_table *mrt = rcu_dereference_protected(net->ipv4.mrt, 1);
>
> - net->ipv4.mrt = NULL;
> + RCU_INIT_POINTER(net->ipv4.mrt, NULL);
> + ipmr_free_table(mrt, dev_kill_list);
> }
>
> static int ipmr_rules_dump(struct net *net, struct notifier_block *nb,
> @@ -381,6 +375,17 @@ bool ipmr_rule_default(const struct fib_rule *rule)
> EXPORT_SYMBOL(ipmr_rule_default);
> #endif
>
> +static struct mr_table *ipmr_get_table(struct net *net, u32 id)
> +{
> + struct mr_table *mrt;
> +
> + rcu_read_lock();
> + mrt = __ipmr_get_table(net, id);
> + rcu_read_unlock();
> +
> + return mrt;
> +}
> +
> static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg,
> const void *ptr)
> {
> @@ -441,12 +446,11 @@ static void ipmr_free_table(struct mr_table *mrt, struct list_head *dev_kill_lis
>
> WARN_ON_ONCE(!mr_can_free_table(net));
>
> - timer_shutdown_sync(&mrt->ipmr_expire_timer);
> mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC |
> MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC,
> &ipmr_dev_kill_list);
> - rhltable_destroy(&mrt->mfc_hash);
> - kfree(mrt);
> + timer_shutdown_sync(&mrt->ipmr_expire_timer);
> + mr_table_free(mrt);
>
> WARN_ON_ONCE(!net_initialized(net) && !list_empty(&ipmr_dev_kill_list));
> list_splice(&ipmr_dev_kill_list, dev_kill_list);
> @@ -1135,12 +1139,19 @@ static int ipmr_cache_report(const struct mr_table *mrt,
> static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
> struct sk_buff *skb, struct net_device *dev)
> {
> + struct net *net = read_pnet(&mrt->net);
> const struct iphdr *iph = ip_hdr(skb);
> - struct mfc_cache *c;
> + struct mfc_cache *c = NULL;
> bool found = false;
> int err;
>
> spin_lock_bh(&mfc_unres_lock);
> +
> + if (!check_net(net)) {
> + err = -EINVAL;
> + goto err;
> + }
> +
> list_for_each_entry(c, &mrt->mfc_unres_queue, _c.list) {
> if (c->mfc_mcastgrp == iph->daddr &&
> c->mfc_origin == iph->saddr) {
> @@ -1153,10 +1164,8 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
> /* Create a new entry if allowable */
> c = ipmr_cache_alloc_unres();
> if (!c) {
> - spin_unlock_bh(&mfc_unres_lock);
> -
> - kfree_skb(skb);
> - return -ENOBUFS;
> + err = -ENOBUFS;
> + goto err;
> }
>
> /* Fill in the new cache entry */
> @@ -1166,17 +1175,8 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
>
> /* Reflect first query at mrouted. */
> err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);
> -
> - if (err < 0) {
> - /* If the report failed throw the cache entry
> - out - Brad Parker
> - */
> - spin_unlock_bh(&mfc_unres_lock);
> -
> - ipmr_cache_free(c);
> - kfree_skb(skb);
> - return err;
> - }
> + if (err < 0)
> + goto err;
>
> atomic_inc(&mrt->cache_resolve_queue_len);
> list_add(&c->_c.list, &mrt->mfc_unres_queue);
> @@ -1189,18 +1189,26 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
>
> /* See if we can append the packet */
> if (c->_c.mfc_un.unres.unresolved.qlen > 3) {
> - kfree_skb(skb);
> + c = NULL;
> err = -ENOBUFS;
> - } else {
> - if (dev) {
> - skb->dev = dev;
> - skb->skb_iif = dev->ifindex;
> - }
> - skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb);
> - err = 0;
> + goto err;
> + }
> +
> + if (dev) {
> + skb->dev = dev;
> + skb->skb_iif = dev->ifindex;
> }
>
> + skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb);
> +
> spin_unlock_bh(&mfc_unres_lock);
> + return 0;
> +
> +err:
> + spin_unlock_bh(&mfc_unres_lock);
> + if (c)
> + ipmr_cache_free(c);
> + kfree_skb(skb);
> return err;
> }
>
> @@ -1346,7 +1354,7 @@ static void mroute_clean_tables(struct mr_table *mrt, int flags,
> }
>
> if (flags & MRT_FLUSH_MFC) {
> - if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
> + if (atomic_read(&mrt->cache_resolve_queue_len) != 0 || !check_net(net)) {
> spin_lock_bh(&mfc_unres_lock);
> list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) {
> list_del(&c->list);
> diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
> index 37a3c144276c..3930d612c3de 100644
> --- a/net/ipv4/ipmr_base.c
> +++ b/net/ipv4/ipmr_base.c
> @@ -28,6 +28,20 @@ void vif_device_init(struct vif_device *v,
> v->link = dev->ifindex;
> }
>
> +static void __mr_free_table(struct work_struct *work)
> +{
> + struct mr_table *mrt = container_of(to_rcu_work(work),
> + struct mr_table, work);
> +
> + rhltable_destroy(&mrt->mfc_hash);
> + kfree(mrt);
> +}
> +
> +void mr_table_free(struct mr_table *mrt)
> +{
> + queue_rcu_work(system_unbound_wq, &mrt->work);
> +}
> +
> struct mr_table *
> mr_table_alloc(struct net *net, u32 id,
> struct mr_table_ops *ops,
> @@ -50,6 +64,8 @@ mr_table_alloc(struct net *net, u32 id,
> kfree(mrt);
> return ERR_PTR(err);
> }
> +
> + INIT_RCU_WORK(&mrt->work, __mr_free_table);
> INIT_LIST_HEAD(&mrt->mfc_cache_list);
> INIT_LIST_HEAD(&mrt->mfc_unres_queue);
>
> --
> 2.54.0.rc2.533.g4f5dca5207-goog
>
^ permalink raw reply
* RE: [Intel-wired-lan] [PATCH net] i40e: fix memcmp of pointer in i40e_hw_set_dcb_config()
From: Arland, ArpanaX @ 2026-05-06 5:52 UTC (permalink / raw)
To: Aaron Esau, intel-wired-lan@lists.osuosl.org
Cc: netdev@vger.kernel.org, Nguyen, Anthony L, Kitszel, Przemyslaw,
Kubalewski, Arkadiusz, stable@vger.kernel.org
In-Reply-To: <20260329162151.2043655-1-aaron1esau@gmail.com>
> -----Original Message-----
> From: Intel-wired-lan <intel-wired-lan-bounces@osuosl.org> On Behalf Of Aaron Esau
> Sent: Sunday, March 29, 2026 9:52 PM
> To: intel-wired-lan@lists.osuosl.org
> Cc: netdev@vger.kernel.org; Nguyen, Anthony L <anthony.l.nguyen@intel.com>; Kitszel, Przemyslaw <przemyslaw.kitszel@intel.com>; Kubalewski, Arkadiusz <arkadiusz.kubalewski@intel.com>; stable@vger.kernel.org; Aaron Esau <aaron1esau@gmail.com>
> Subject: [Intel-wired-lan] [PATCH net] i40e: fix memcmp of pointer in i40e_hw_set_dcb_config()
>
> In i40e_hw_set_dcb_config(), both new_cfg and old_cfg are pointers to struct i40e_dcbx_config, so sizeof(new_cfg) evaluates to the size of a pointer (8 bytes on 64-bit) rather than the size of the struct. Likewise, &new_cfg and &old_cfg are the addresses of the pointer variables on the stack, not the addresses of the actual config structs.
>
> As a result, the memcmp never compares the actual configuration data, meaning the "no change needed" early return never fires. Every call to this function performs a full DCB reconfiguration (quiescing all VSIs, reprogramming via "Set LLDP MIB" AQC, and reconfiguring VEB/VSIs) even when the configuration has not changed.
>
> Fix this by comparing the structs themselves rather than the pointers.
>
> Fixes: 4b208eaa8078 ("i40e: Add init and default config of software based DCB")
> Cc: stable@vger.kernel.org
> Signed-off-by: Aaron Esau <aaron1esau@gmail.com>
> ---
>
> Found using Coccinelle/spatch with a semantic patch that matches
> sizeof(ptr) and &ptr used together where ptr is a pointer type.
>
> drivers/net/ethernet/intel/i40e/i40e_main.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
Tested-by: Arpana Arland <arpanax.arland@intel.com> (A Contingent worker at Intel)
^ permalink raw reply
* [PATCH net-next v8] net: mana: Expose hardware diagnostic info via debugfs
From: Erni Sri Satya Vennela @ 2026-05-06 5:51 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
edumazet, kuba, pabeni, kotaranov, horms, shradhagupta, ernis,
dipayanroy, shirazsaleem, yury.norov, kees, linux-hyperv, netdev,
linux-kernel, linux-rdma
Add debugfs entries to expose hardware configuration and diagnostic
information that aids in debugging driver initialization and runtime
operations without adding noise to dmesg.
The debugfs directory for each PCI device is named using pci_name()
(the unique BDF address), and its creation and removal is integrated
into mana_gd_setup() and mana_gd_cleanup_device() respectively, so
that all callers (probe, remove, suspend, resume, shutdown) share a
single code path.
Device-level entries (under /sys/kernel/debug/mana/<BDF>/):
- num_msix_usable, max_num_queues: Max resources from hardware
- gdma_protocol_ver, pf_cap_flags1: VF version negotiation results
- num_vports, bm_hostmode: Device configuration
Per-vPort entries (under /sys/kernel/debug/mana/<BDF>/vportN/):
- port_handle: Hardware vPort handle
- max_sq, max_rq: Max queues from vPort config
- indir_table_sz: Indirection table size
- steer_rx, steer_rss, steer_update_tab, steer_cqe_coalescing:
Last applied steering configuration parameters
Signed-off-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
---
Changes in v8:
* Move debugfs_create_u16("num_vports", ...) and
debugfs_create_u8("bm_hostmode", ...) to after ac->num_ports has been
assigned and clamped to MAX_PORTS_IN_MANA_DEV, so the value exposed
via debugfs always reflects the final, hardware-reported count
rather than a transient zero or unclamped value.
* Update the stale comment above mana_gd_resume() to reflect the new
rollback-on-failure behavior.
Changes in v7:
* Rebase to latest main.
Changes in v6:
* Move out of patchset and create a separate patch.
Changes in v5:
* Update commit message.
* Fix conflicts to align with the new patches.
* Make it part of patchset.
Changes in v4:
* Rebase and fix conflicts.
Changes in v3:
* Rename mana_gd_cleanup to mana_gd_cleanup_device.
* Add creation of debugfs entries in mana_gd_setup.
* Add removal of debugfs entries in mana_gd_cleanup_device.
* Remove bm_hostmode and num_vports from debugfs in mana_remove itself,
because "ac" gets freed before debugfs_remove_recursive, to avoid
Use-After-Free error.
* Add "goto out:" in mana_cfg_vport_steering to avoid populating apc
values when resp.hdr.status is non-zero.
Changes in v2:
* Add debugfs_remove_recursive for gc->mana_pci_debugfs in
mana_gd_suspend to handle multiple duplicates creation in
mana_gd_setup and mana_gd_resume path.
* Move debugfs creation for num_vports and bm_hostmode out of
if(!resuming) condition since we have to create it again even for
resume.
* Recreate mana_pci_debugfs in mana_gd_resume.
---
.../net/ethernet/microsoft/mana/gdma_main.c | 73 +++++++++++--------
drivers/net/ethernet/microsoft/mana/mana_en.c | 33 +++++++++
include/net/mana/gdma.h | 1 +
include/net/mana/mana.h | 8 ++
4 files changed, 83 insertions(+), 32 deletions(-)
diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 098fbda0d128..9e9a97eef7f0 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -194,6 +194,11 @@ static int mana_gd_query_max_resources(struct pci_dev *pdev)
if (gc->max_num_queues > gc->num_msix_usable - 1)
gc->max_num_queues = gc->num_msix_usable - 1;
+ debugfs_create_u32("num_msix_usable", 0400, gc->mana_pci_debugfs,
+ &gc->num_msix_usable);
+ debugfs_create_u32("max_num_queues", 0400, gc->mana_pci_debugfs,
+ &gc->max_num_queues);
+
return 0;
}
@@ -1264,6 +1269,13 @@ int mana_gd_verify_vf_version(struct pci_dev *pdev)
return err ? err : -EPROTO;
}
gc->pf_cap_flags1 = resp.pf_cap_flags1;
+ gc->gdma_protocol_ver = resp.gdma_protocol_ver;
+
+ debugfs_create_x64("gdma_protocol_ver", 0400, gc->mana_pci_debugfs,
+ &gc->gdma_protocol_ver);
+ debugfs_create_x64("pf_cap_flags1", 0400, gc->mana_pci_debugfs,
+ &gc->pf_cap_flags1);
+
if (resp.pf_cap_flags1 & GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG) {
err = mana_gd_query_hwc_timeout(pdev, &hwc->hwc_timeout);
if (err) {
@@ -1943,15 +1955,20 @@ static int mana_gd_setup(struct pci_dev *pdev)
struct gdma_context *gc = pci_get_drvdata(pdev);
int err;
+ gc->mana_pci_debugfs = debugfs_create_dir(pci_name(pdev),
+ mana_debugfs_root);
+
err = mana_gd_init_registers(pdev);
if (err)
- return err;
+ goto remove_debugfs;
mana_smc_init(&gc->shm_channel, gc->dev, gc->shm_base);
gc->service_wq = alloc_ordered_workqueue("gdma_service_wq", 0);
- if (!gc->service_wq)
- return -ENOMEM;
+ if (!gc->service_wq) {
+ err = -ENOMEM;
+ goto remove_debugfs;
+ }
err = mana_gd_setup_hwc_irqs(pdev);
if (err) {
@@ -1992,11 +2009,14 @@ static int mana_gd_setup(struct pci_dev *pdev)
free_workqueue:
destroy_workqueue(gc->service_wq);
gc->service_wq = NULL;
+remove_debugfs:
+ debugfs_remove_recursive(gc->mana_pci_debugfs);
+ gc->mana_pci_debugfs = NULL;
dev_err(&pdev->dev, "%s failed (error %d)\n", __func__, err);
return err;
}
-static void mana_gd_cleanup(struct pci_dev *pdev)
+static void mana_gd_cleanup_device(struct pci_dev *pdev)
{
struct gdma_context *gc = pci_get_drvdata(pdev);
@@ -2008,6 +2028,10 @@ static void mana_gd_cleanup(struct pci_dev *pdev)
destroy_workqueue(gc->service_wq);
gc->service_wq = NULL;
}
+
+ debugfs_remove_recursive(gc->mana_pci_debugfs);
+ gc->mana_pci_debugfs = NULL;
+
dev_dbg(&pdev->dev, "mana gdma cleanup successful\n");
}
@@ -2065,9 +2089,6 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
gc->dev = &pdev->dev;
xa_init(&gc->irq_contexts);
- gc->mana_pci_debugfs = debugfs_create_dir(pci_name(pdev),
- mana_debugfs_root);
-
err = mana_gd_setup(pdev);
if (err)
goto unmap_bar;
@@ -2096,16 +2117,8 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
cleanup_mana:
mana_remove(&gc->mana, false);
cleanup_gd:
- mana_gd_cleanup(pdev);
+ mana_gd_cleanup_device(pdev);
unmap_bar:
- /*
- * at this point we know that the other debugfs child dir/files
- * are either not yet created or are already cleaned up.
- * The pci debugfs folder clean-up now, will only be cleaning up
- * adapter-MTU file and apc->mana_pci_debugfs folder.
- */
- debugfs_remove_recursive(gc->mana_pci_debugfs);
- gc->mana_pci_debugfs = NULL;
xa_destroy(&gc->irq_contexts);
pci_iounmap(pdev, bar0_va);
free_gc:
@@ -2155,11 +2168,7 @@ static void mana_gd_remove(struct pci_dev *pdev)
mana_rdma_remove(&gc->mana_ib);
mana_remove(&gc->mana, false);
- mana_gd_cleanup(pdev);
-
- debugfs_remove_recursive(gc->mana_pci_debugfs);
-
- gc->mana_pci_debugfs = NULL;
+ mana_gd_cleanup_device(pdev);
xa_destroy(&gc->irq_contexts);
@@ -2181,14 +2190,13 @@ int mana_gd_suspend(struct pci_dev *pdev, pm_message_t state)
mana_rdma_remove(&gc->mana_ib);
mana_remove(&gc->mana, true);
- mana_gd_cleanup(pdev);
+ mana_gd_cleanup_device(pdev);
return 0;
}
-/* In case the NIC hardware stops working, the suspend and resume callbacks will
- * fail -- if this happens, it's safer to just report an error than try to undo
- * what has been done.
+/* If resume fails partway through, roll back any setup that completed so
+ * the device is left in a clean state and resources are not leaked.
*/
int mana_gd_resume(struct pci_dev *pdev)
{
@@ -2201,13 +2209,18 @@ int mana_gd_resume(struct pci_dev *pdev)
err = mana_probe(&gc->mana, true);
if (err)
- return err;
+ goto cleanup_gd;
err = mana_rdma_probe(&gc->mana_ib);
if (err)
- return err;
+ goto cleanup_mana;
return 0;
+cleanup_mana:
+ mana_remove(&gc->mana, true);
+cleanup_gd:
+ mana_gd_cleanup_device(pdev);
+ return err;
}
/* Quiesce the device for kexec. This is also called upon reboot/shutdown. */
@@ -2220,11 +2233,7 @@ static void mana_gd_shutdown(struct pci_dev *pdev)
mana_rdma_remove(&gc->mana_ib);
mana_remove(&gc->mana, true);
- mana_gd_cleanup(pdev);
-
- debugfs_remove_recursive(gc->mana_pci_debugfs);
-
- gc->mana_pci_debugfs = NULL;
+ mana_gd_cleanup_device(pdev);
pci_disable_device(pdev);
}
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index a654b3699c4c..26bd3d270b5e 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -1276,6 +1276,9 @@ static int mana_query_vport_cfg(struct mana_port_context *apc, u32 vport_index,
apc->port_handle = resp.vport;
ether_addr_copy(apc->mac_addr, resp.mac_addr);
+ apc->vport_max_sq = *max_sq;
+ apc->vport_max_rq = *max_rq;
+
return 0;
}
@@ -1430,6 +1433,11 @@ static int mana_cfg_vport_steering(struct mana_port_context *apc,
netdev_info(ndev, "Configured steering vPort %llu entries %u\n",
apc->port_handle, apc->indir_table_sz);
+
+ apc->steer_rx = rx;
+ apc->steer_rss = apc->rss_state;
+ apc->steer_update_tab = update_tab;
+ apc->steer_cqe_coalescing = req->cqe_coalescing_enable;
out:
kfree(req);
return err;
@@ -3161,6 +3169,23 @@ static int mana_init_port(struct net_device *ndev)
eth_hw_addr_set(ndev, apc->mac_addr);
sprintf(vport, "vport%d", port_idx);
apc->mana_port_debugfs = debugfs_create_dir(vport, gc->mana_pci_debugfs);
+
+ debugfs_create_u64("port_handle", 0400, apc->mana_port_debugfs,
+ &apc->port_handle);
+ debugfs_create_u32("max_sq", 0400, apc->mana_port_debugfs,
+ &apc->vport_max_sq);
+ debugfs_create_u32("max_rq", 0400, apc->mana_port_debugfs,
+ &apc->vport_max_rq);
+ debugfs_create_u32("indir_table_sz", 0400, apc->mana_port_debugfs,
+ &apc->indir_table_sz);
+ debugfs_create_u32("steer_rx", 0400, apc->mana_port_debugfs,
+ &apc->steer_rx);
+ debugfs_create_u32("steer_rss", 0400, apc->mana_port_debugfs,
+ &apc->steer_rss);
+ debugfs_create_u32("steer_update_tab", 0400, apc->mana_port_debugfs,
+ &apc->steer_update_tab);
+ debugfs_create_u32("steer_cqe_coalescing", 0400, apc->mana_port_debugfs,
+ &apc->steer_cqe_coalescing);
debugfs_create_u32("current_speed", 0400, apc->mana_port_debugfs,
&apc->speed);
return 0;
@@ -3678,6 +3703,11 @@ int mana_probe(struct gdma_dev *gd, bool resuming)
if (ac->num_ports > MAX_PORTS_IN_MANA_DEV)
ac->num_ports = MAX_PORTS_IN_MANA_DEV;
+ debugfs_create_u16("num_vports", 0400, gc->mana_pci_debugfs,
+ &ac->num_ports);
+ debugfs_create_u8("bm_hostmode", 0400, gc->mana_pci_debugfs,
+ &ac->bm_hostmode);
+
ac->per_port_queue_reset_wq =
create_singlethread_workqueue("mana_per_port_queue_reset_wq");
if (!ac->per_port_queue_reset_wq) {
@@ -3800,6 +3830,9 @@ void mana_remove(struct gdma_dev *gd, bool suspending)
mana_gd_deregister_device(gd);
+ debugfs_lookup_and_remove("bm_hostmode", gc->mana_pci_debugfs);
+ debugfs_lookup_and_remove("num_vports", gc->mana_pci_debugfs);
+
if (suspending)
return;
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index 6d836060976a..70d62bc32837 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -442,6 +442,7 @@ struct gdma_context {
struct gdma_dev mana_ib;
u64 pf_cap_flags1;
+ u64 gdma_protocol_ver;
struct workqueue_struct *service_wq;
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index 8f721cd4e4a7..18215388d2c7 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -568,6 +568,14 @@ struct mana_port_context {
/* Debugfs */
struct dentry *mana_port_debugfs;
+
+ /* Cached vport/steering config for debugfs */
+ u32 vport_max_sq;
+ u32 vport_max_rq;
+ u32 steer_rx;
+ u32 steer_rss;
+ u32 steer_update_tab;
+ u32 steer_cqe_coalescing;
};
netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev);
--
2.34.1
^ permalink raw reply related
* Re: [PATCH net-next v7] net: mana: Expose hardware diagnostic info via debugfs
From: Erni Sri Satya Vennela @ 2026-05-06 5:51 UTC (permalink / raw)
To: Jakub Kicinski
Cc: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
edumazet, pabeni, kotaranov, horms, shradhagupta, dipayanroy,
yury.norov, shirazsaleem, kees, linux-hyperv, netdev,
linux-kernel, linux-rdma
In-Reply-To: <20260502005409.3645108-1-kuba@kernel.org>
On Fri, May 01, 2026 at 05:54:09PM -0700, Jakub Kicinski wrote:
> This is an AI-generated review of your patch. The human sending this
> email has considered the AI review valid, or at least plausible.
> ---
> net: mana: Expose hardware diagnostic info via debugfs
>
> This patch adds debugfs entries under /sys/kernel/debug/mana/<BDF>/ to
> expose device-level and per-vPort hardware configuration and diagnostic
> information. Debugfs creation and removal are moved into mana_gd_setup()
> and a renamed mana_gd_cleanup_device() so probe, remove, suspend, resume
> and shutdown share a single path.
>
> > diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
> > index a654b3699c4c..077d3a1ff6bf 100644
> > --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> > +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
>
> [ ... ]
>
> > @@ -3659,6 +3684,11 @@ int mana_probe(struct gdma_dev *gd, bool resuming)
> >
> > ac->bm_hostmode = bm_hostmode;
> >
> > + debugfs_create_u16("num_vports", 0400, gc->mana_pci_debugfs,
> > + &ac->num_ports);
> > + debugfs_create_u8("bm_hostmode", 0400, gc->mana_pci_debugfs,
> > + &ac->bm_hostmode);
> > +
> > if (!resuming) {
> > ac->num_ports = num_ports;
> > } else {
>
> Is the num_vports debugfs file pointed at ac->num_ports before that
> field has been assigned from the queried num_ports value?
>
> On a fresh probe (!resuming), the sequence in mana_probe() is:
>
> err = mana_query_device_cfg(ac, ..., &num_ports, &bm_hostmode);
> ...
> ac->bm_hostmode = bm_hostmode;
>
> debugfs_create_u16("num_vports", 0400, gc->mana_pci_debugfs,
> &ac->num_ports);
> ...
>
> if (!resuming) {
> ac->num_ports = num_ports;
> } else {
> ...
> }
> ...
> if (ac->num_ports > MAX_PORTS_IN_MANA_DEV)
> ac->num_ports = MAX_PORTS_IN_MANA_DEV;
>
> During the window between debugfs_create_u16() and the assignment (and
> subsequent clamp to MAX_PORTS_IN_MANA_DEV), would a reader of the
> num_vports debugfs file observe 0 rather than the hardware-reported
> value?
>
> Would it be clearer to create this file after ac->num_ports has been
> assigned and clamped, so the exposed value matches what the commit
> message describes as "Device configuration"?
> --
> pw-bot: cr
Thank you for pointing this out, Jakub.
I will update the comment in the next version.
- Vennela
^ permalink raw reply
* [PATCH net-next] net: ethernet: atheros: atl2: remove kernel backward-compatibility code
From: Ethan Nelson-Moore @ 2026-05-06 5:40 UTC (permalink / raw)
To: netdev
Cc: Ethan Nelson-Moore, Chris Snook, Andrew Lunn, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Ingo Molnar,
Thomas Gleixner
The atl2 driver contains code for compatibility with old kernels that
do not support module_param_array. Backward compatibility is
irrelevant because this driver is in-tree. Remove this unreachable
code to simplify the driver's handling of module parameters.
Signed-off-by: Ethan Nelson-Moore <enelsonmoore@gmail.com>
---
drivers/net/ethernet/atheros/atlx/atl2.c | 37 ++----------------------
1 file changed, 3 insertions(+), 34 deletions(-)
diff --git a/drivers/net/ethernet/atheros/atlx/atl2.c b/drivers/net/ethernet/atheros/atlx/atl2.c
index 280e2f5f4aa5..5560d5cc3169 100644
--- a/drivers/net/ethernet/atheros/atlx/atl2.c
+++ b/drivers/net/ethernet/atheros/atlx/atl2.c
@@ -2756,10 +2756,8 @@ static void atl2_force_ps(struct atl2_hw *hw)
/* All parameters are treated the same, as an integer array of values.
* This macro just reduces the need to repeat the same declaration code
* over and over (plus this helps to avoid typo bugs).
- */
-#define ATL2_PARAM_INIT {[0 ... ATL2_MAX_NIC] = OPTION_UNSET}
-#ifndef module_param_array
-/* Module Parameters are always initialized to -1, so that the driver
+ *
+ * Module parameters are always initialized to -1, so that the driver
* can tell the difference between no user specified value or the
* user asking for the default value.
* The true default values are loaded in when atl2_check_options is called.
@@ -2770,16 +2768,10 @@ static void atl2_force_ps(struct atl2_hw *hw)
*/
#define ATL2_PARAM(X, desc) \
- static const int X[ATL2_MAX_NIC + 1] = ATL2_PARAM_INIT; \
- MODULE_PARM(X, "1-" __MODULE_STRING(ATL2_MAX_NIC) "i"); \
- MODULE_PARM_DESC(X, desc);
-#else
-#define ATL2_PARAM(X, desc) \
- static int X[ATL2_MAX_NIC+1] = ATL2_PARAM_INIT; \
+ static int X[ATL2_MAX_NIC+1] = {[0 ... ATL2_MAX_NIC] = OPTION_UNSET}; \
static unsigned int num_##X; \
module_param_array_named(X, X, int, &num_##X, 0); \
MODULE_PARM_DESC(X, desc);
-#endif
/*
* Transmit Memory Size
@@ -2924,9 +2916,6 @@ static void atl2_check_options(struct atl2_adapter *adapter)
printk(KERN_NOTICE "Warning: no configuration for board #%i\n",
bd);
printk(KERN_NOTICE "Using defaults for all values\n");
-#ifndef module_param_array
- bd = ATL2_MAX_NIC;
-#endif
}
/* Bytes of Transmit Memory */
@@ -2936,16 +2925,12 @@ static void atl2_check_options(struct atl2_adapter *adapter)
opt.def = ATL2_DEFAULT_TX_MEMSIZE;
opt.arg.r.min = ATL2_MIN_TX_MEMSIZE;
opt.arg.r.max = ATL2_MAX_TX_MEMSIZE;
-#ifdef module_param_array
if (num_TxMemSize > bd) {
-#endif
val = TxMemSize[bd];
atl2_validate_option(&val, &opt);
adapter->txd_ring_size = ((u32) val) * 1024;
-#ifdef module_param_array
} else
adapter->txd_ring_size = ((u32)opt.def) * 1024;
-#endif
/* txs ring size: */
adapter->txs_ring_size = adapter->txd_ring_size / 128;
if (adapter->txs_ring_size > 160)
@@ -2958,18 +2943,14 @@ static void atl2_check_options(struct atl2_adapter *adapter)
opt.def = ATL2_DEFAULT_RXD_COUNT;
opt.arg.r.min = ATL2_MIN_RXD_COUNT;
opt.arg.r.max = ATL2_MAX_RXD_COUNT;
-#ifdef module_param_array
if (num_RxMemBlock > bd) {
-#endif
val = RxMemBlock[bd];
atl2_validate_option(&val, &opt);
adapter->rxd_ring_size = (u32)val;
/* FIXME */
/* ((u16)val)&~1; */ /* even number */
-#ifdef module_param_array
} else
adapter->rxd_ring_size = (u32)opt.def;
-#endif
/* init RXD Flow control value */
adapter->hw.fc_rxd_hi = (adapter->rxd_ring_size / 8) * 7;
adapter->hw.fc_rxd_lo = (ATL2_MIN_RXD_COUNT / 8) >
@@ -2983,16 +2964,12 @@ static void atl2_check_options(struct atl2_adapter *adapter)
opt.def = INT_MOD_DEFAULT_CNT;
opt.arg.r.min = INT_MOD_MIN_CNT;
opt.arg.r.max = INT_MOD_MAX_CNT;
-#ifdef module_param_array
if (num_IntModTimer > bd) {
-#endif
val = IntModTimer[bd];
atl2_validate_option(&val, &opt);
adapter->imt = (u16) val;
-#ifdef module_param_array
} else
adapter->imt = (u16)(opt.def);
-#endif
/* Flash Vendor */
opt.type = range_option;
opt.name = "SPI Flash Vendor";
@@ -3000,16 +2977,12 @@ static void atl2_check_options(struct atl2_adapter *adapter)
opt.def = FLASH_VENDOR_DEFAULT;
opt.arg.r.min = FLASH_VENDOR_MIN;
opt.arg.r.max = FLASH_VENDOR_MAX;
-#ifdef module_param_array
if (num_FlashVendor > bd) {
-#endif
val = FlashVendor[bd];
atl2_validate_option(&val, &opt);
adapter->hw.flash_vendor = (u8) val;
-#ifdef module_param_array
} else
adapter->hw.flash_vendor = (u8)(opt.def);
-#endif
/* MediaType */
opt.type = range_option;
opt.name = "Speed/Duplex Selection";
@@ -3017,14 +2990,10 @@ static void atl2_check_options(struct atl2_adapter *adapter)
opt.def = MEDIA_TYPE_AUTO_SENSOR;
opt.arg.r.min = MEDIA_TYPE_AUTO_SENSOR;
opt.arg.r.max = MEDIA_TYPE_10M_HALF;
-#ifdef module_param_array
if (num_MediaType > bd) {
-#endif
val = MediaType[bd];
atl2_validate_option(&val, &opt);
adapter->hw.MediaType = (u16) val;
-#ifdef module_param_array
} else
adapter->hw.MediaType = (u16)(opt.def);
-#endif
}
--
2.43.0
^ permalink raw reply related
* Re: [PATCH net-next] ipv4: Flush the FIB once per dev nexthop removal
From: Kuniyuki Iwashima @ 2026-05-06 5:26 UTC (permalink / raw)
To: cratiu
Cc: davem, dsahern, edumazet, horms, kuba, netdev, pabeni,
Kuniyuki Iwashima
In-Reply-To: <20260504133626.4155933-1-cratiu@nvidia.com>
From: Cosmin Ratiu <cratiu@nvidia.com>
Date: Mon, 4 May 2026 16:36:26 +0300
> When a device is going down, all nexthops on it are removed, and for
> each nexthop being removed the FIB table is flushed, which does a full
> trie traversal looking for entries marked RTNH_F_DEAD and removing them.
> The performance of this is O(N x R), with N being number of dev nexthops
> and R being number of IPv4 routes.
>
> The RTNL is held the entire time.
>
> When there are many nexthops to be removed and many routing entries,
> this can result in the RTNL being held for multiple minutes, which
> causes unhappiness in other processes trying to acquire the RTNL (e.g.
> systemd-networkd for DHCP renewals).
>
> In a complicated deployment with multiple vxlan devices, each having
> 16K nexthops and a total of 128K ipv4 routes, this is exactly what
> happens:
>
> nexthop_flush_dev() # loops over 16K nexthops
> -> remove_nexthop()
> -> __remove_nexthop()
> -> __remove_nexthop_fib() # marks fi->fib_flags |= RTNH_F_DEAD
> -> fib_flush() # for EACH nexthop!
> -> fib_table_flush() # walks the ENTIRE FIB, 128K entries
>
> Change that so that a nexthop_flush_dev() does a single fib_flush()
> after all nexthops are removed. This is done with:
> - __remove_nexthop_fib() no longer flushes the FIB, instead returns
> whether a flush is needed and is marked with __must_check.
> - __remove_nexthop() and remove_nexthop() propagate this return value up
> with __must_check.
> - A new wrapper is defined, remove_one_nexthop() which calls
> remove_nexthop() and flushes if necessary.
> - The two direct callers of __remove_nexthop() get a WARN_ON, since the
> nh about to be removed should not have any FIB entries referencing it
> when replacing or inserting a new one.
> - Callers which need to remove a single nexthop were migrated to
> remove_one_nexthop().
> - Callers which need to remove multiple nexthops keep track in a local
> bool whether a flush is needed and call flush once at the end.
> - This is plumbed through group removal as well, so when removing a leaf
> nh causes a parent group to lose its last member, the group's flush is
> also deferred, accumulated via remove_nexthop_from_groups() ->
> remove_nh_grp_entry() -> remove_nexthop(). remove_nh_grp_entry() gets
> a __must_check as well.
>
> This dramatically improves performance from O(N x R) to O(N + R).
>
> Releasing a nexthop reference in remove_nexthop() now no longer frees
> it. Instead, it is deleted when the last fib_info pointing to it gets
> freed via free_fib_info_rcu(). All routing code is already careful not
> to take into consideration routes marked with RTNH_F_DEAD.
>
> Tested with:
> DEV=eth2
> ip link set up dev $DEV
> ip link add testnh0 link $DEV type macvlan mode bridge
> ip addr add 198.51.100.1/24 dev testnh0
> ip link set testnh0 up
>
> seq 1 65536 | \
> sed 's/.*/nexthop add id & via 198.51.100.2 dev testnh0/' | \
> ip -batch -
>
> i=1
> for a in $(seq 0 255); do
> for b in $(seq 0 255); do
> echo "route add 10.${a}.${b}.0/32 nhid $i"
> i=$((i + 1))
> done
> done | ip -batch -
>
> time ip link set testnh0 down
> ip link del testnh0
>
> Without this patch:
> real 0m32.601s
> user 0m0.000s
> sys 0m32.511s
>
> With this patch:
> real 0m0.209s
> user 0m0.000s
> sys 0m0.153s
>
> Signed-off-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Looks good, 3 nits below.
[...]
> -static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
> +static bool remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
> struct nl_info *nlinfo)
Since you added __must_check to other functions,
remove_nexthop_from_groups() should have it too.
> {
> struct nh_grp_entry *nhge, *tmp;
> + bool need_flush = false;
> LIST_HEAD(deferred_free);
Please keep reverse xmas tree order here,
[...]
> @@ -2701,6 +2717,7 @@ static void nexthop_flush_dev(struct net_device *dev, unsigned long event)
> struct hlist_head *head = &net->nexthop.devhash[hash];
> struct hlist_node *n;
> struct nh_info *nhi;
> + bool need_flush = false;
and here.
^ permalink raw reply
* [PATCH net] tipc: avoid sending zero-length stream messages
From: Cássio Gabriel @ 2026-05-06 5:13 UTC (permalink / raw)
To: Jon Maloy, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Simon Horman, Ying Xue, Parthasarathy Bhuvaragan
Cc: Jon Paul Maloy, netdev, tipc-discussion, linux-kernel, stable,
syzbot+aa7d098bd6fa788fae8e, Cássio Gabriel
TIPC stream send currently enters the transmit loop even when the
user payload length is zero. This can build and transmit a
header-only connection message.
For local TIPC sockets, such messages are delivered synchronously
through the loopback receive path. When this happens while socket
backlog processing is being flushed, reply transmission can re-enter
TIPC receive processing repeatedly and trigger an RCU stall.
Make zero-length sends on connected SOCK_STREAM TIPC sockets a no-op
after the existing connection/congestion wait has succeeded. Leave
implicit connection setup and SOCK_SEQPACKET behavior unchanged.
Fixes: 365ad353c256 ("tipc: reduce risk of user starvation during link congestion")
Cc: stable@vger.kernel.org
Reported-by: syzbot+aa7d098bd6fa788fae8e@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/all/000000000000cedbc405ae81531f@google.com/
Closes: https://syzkaller.appspot.com/bug?extid=aa7d098bd6fa788fae8e
Signed-off-by: Cássio Gabriel <cassiogabrielcontato@gmail.com>
---
net/tipc/socket.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 9329919fb07f..3c7838713d74 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -1585,6 +1585,8 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen)
tipc_sk_connected(sk)));
if (unlikely(rc))
break;
+ if (unlikely(!dlen && sk->sk_type == SOCK_STREAM))
+ break;
send = min_t(size_t, dlen - sent, TIPC_MAX_USER_MSG_SIZE);
blocks = tsk->snd_backlog;
if (tsk->oneway++ >= tsk->nagle_start && maxnagle &&
---
base-commit: 95084f1883a760e0d4290698346759d58e2b944a
change-id: 20260505-tipc-zero-length-stream-stall-2c3741de2c93
Best regards,
--
Cássio Gabriel <cassiogabrielcontato@gmail.com>
^ permalink raw reply related
* [PATCH net-next v2 5/5] ionic: Add .get_fec_stats ethtool handler
From: Eric Joyner @ 2026-05-06 4:35 UTC (permalink / raw)
To: netdev
Cc: Brett Creeley, Andrew Lunn, David S. Miller, Eric Dumazet,
Jakub Kicinski, Paolo Abeni, Eric Joyner
In-Reply-To: <20260506043526.64301-1-eric.joyner@amd.com>
Several FEC error statistics being collected can be reported in a
dedicated ethtool callback for FEC errors, so implement the handler that
does so. This includes 802.3ck FEC histogram data that some newer
hardware collects.
Assisted-by: Claude:claude-4.6-sonnet
Signed-off-by: Eric Joyner <eric.joyner@amd.com>
---
.../ethernet/pensando/ionic/ionic_ethtool.c | 52 +++++++++++++++++++
1 file changed, 52 insertions(+)
diff --git a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c
index af0c4cc8ad8e..b0d7b5a9d189 100644
--- a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c
+++ b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c
@@ -419,6 +419,57 @@ static int ionic_get_fecparam(struct net_device *netdev,
return 0;
}
+static const struct ethtool_fec_hist_range ionic_fec_ranges[] = {
+ { 0, 0},
+ { 1, 1},
+ { 2, 2},
+ { 3, 3},
+ { 4, 4},
+ { 5, 5},
+ { 6, 6},
+ { 7, 7},
+ { 8, 8},
+ { 9, 9},
+ { 10, 10},
+ { 11, 11},
+ { 12, 12},
+ { 13, 13},
+ { 14, 14},
+ { 15, 15},
+ { 0, 0},
+};
+
+static void
+ionic_fill_fec_hist(const struct ionic_port_extra_stats *extra_stats,
+ struct ethtool_fec_hist *hist)
+{
+ int i;
+
+ hist->ranges = ionic_fec_ranges;
+ for (i = 0; i < ETHTOOL_FEC_HIST_MAX - 1; i++)
+ hist->values[i].sum =
+ le64_to_cpu(extra_stats->fec_codeword_error_bin[i]);
+}
+
+static void ionic_get_fec_stats(struct net_device *netdev,
+ struct ethtool_fec_stats *fec_stats,
+ struct ethtool_fec_hist *hist)
+{
+ struct ionic_port_extra_stats *extra_stats;
+ struct ionic_lif *lif = netdev_priv(netdev);
+
+ extra_stats = &lif->ionic->idev.port_info->extra_stats;
+
+ fec_stats->corrected_blocks.total =
+ le64_to_cpu(extra_stats->rsfec_correctable_blocks);
+ fec_stats->uncorrectable_blocks.total =
+ le64_to_cpu(extra_stats->rsfec_uncorrectable_blocks);
+ fec_stats->corrected_bits.total =
+ le64_to_cpu(extra_stats->fec_corrected_bits_total);
+
+ ionic_fill_fec_hist(extra_stats, hist);
+}
+
static int ionic_set_fecparam(struct net_device *netdev,
struct ethtool_fecparam *fec)
{
@@ -1155,6 +1206,7 @@ static const struct ethtool_ops ionic_ethtool_ops = {
.get_module_eeprom_by_page = ionic_get_module_eeprom_by_page,
.get_pauseparam = ionic_get_pauseparam,
.set_pauseparam = ionic_set_pauseparam,
+ .get_fec_stats = ionic_get_fec_stats,
.get_fecparam = ionic_get_fecparam,
.set_fecparam = ionic_set_fecparam,
.get_ts_info = ionic_get_ts_info,
--
2.17.1
^ permalink raw reply related
* [PATCH net-next v2 3/5] ionic: Update ionic_if.h with new extra port stats structure
From: Eric Joyner @ 2026-05-06 4:35 UTC (permalink / raw)
To: netdev
Cc: Brett Creeley, Andrew Lunn, David S. Miller, Eric Dumazet,
Jakub Kicinski, Paolo Abeni, Eric Joyner
In-Reply-To: <20260506043526.64301-1-eric.joyner@amd.com>
A new structure to report additional statistics from the firmware has
been added to struct ionic_port_info. It currently only contains
FEC-related statistics, but new statistics collected by the firmware for the
port would go in it.
This structure is located in the same area as the unused
ionic_port_pb_stats structure, so this patch also removes that since it
was never used in this driver.
Signed-off-by: Eric Joyner <eric.joyner@amd.com>
---
.../net/ethernet/pensando/ionic/ionic_if.h | 36 ++++++-------------
1 file changed, 10 insertions(+), 26 deletions(-)
diff --git a/drivers/net/ethernet/pensando/ionic/ionic_if.h b/drivers/net/ethernet/pensando/ionic/ionic_if.h
index 23d6e2b4791e..01668dd10c0a 100644
--- a/drivers/net/ethernet/pensando/ionic/ionic_if.h
+++ b/drivers/net/ethernet/pensando/ionic/ionic_if.h
@@ -2855,6 +2855,14 @@ struct ionic_mgmt_port_stats {
__le64 frames_tx_pause;
};
+struct ionic_port_extra_stats {
+ __le64 rsfec_correctable_blocks;
+ __le64 rsfec_uncorrectable_blocks;
+ __le64 fec_corrected_bits_total;
+ __le64 rx_bits_phy;
+ __le64 fec_codeword_error_bin[16];
+};
+
enum ionic_pb_buffer_drop_stats {
IONIC_BUFFER_INTRINSIC_DROP = 0,
IONIC_BUFFER_DISCARDED,
@@ -2883,28 +2891,6 @@ enum ionic_oflow_drop_stats {
IONIC_OFLOW_DROP_MAX,
};
-/* struct ionic_port_pb_stats - packet buffers system stats
- * uses ionic_pb_buffer_drop_stats for drop_counts[]
- */
-struct ionic_port_pb_stats {
- __le64 sop_count_in;
- __le64 eop_count_in;
- __le64 sop_count_out;
- __le64 eop_count_out;
- __le64 drop_counts[IONIC_BUFFER_DROP_MAX];
- __le64 input_queue_buffer_occupancy[IONIC_QOS_TC_MAX];
- __le64 input_queue_port_monitor[IONIC_QOS_TC_MAX];
- __le64 output_queue_port_monitor[IONIC_QOS_TC_MAX];
- __le64 oflow_drop_counts[IONIC_OFLOW_DROP_MAX];
- __le64 input_queue_good_pkts_in[IONIC_QOS_TC_MAX];
- __le64 input_queue_good_pkts_out[IONIC_QOS_TC_MAX];
- __le64 input_queue_err_pkts_in[IONIC_QOS_TC_MAX];
- __le64 input_queue_fifo_depth[IONIC_QOS_TC_MAX];
- __le64 input_queue_max_fifo_depth[IONIC_QOS_TC_MAX];
- __le64 input_queue_peak_occupancy[IONIC_QOS_TC_MAX];
- __le64 output_queue_buffer_occupancy[IONIC_QOS_TC_MAX];
-};
-
/**
* struct ionic_port_identity - port identity structure
* @version: identity structure version
@@ -2950,7 +2936,7 @@ union ionic_port_identity {
* @sprom_page2: Extended Transceiver sprom, page 2
* @sprom_page17: Extended Transceiver sprom, page 17
* @rsvd: reserved byte(s)
- * @pb_stats: uplink pb drop stats
+ * @extra_stats: Extra port statistics data
*/
struct ionic_port_info {
union ionic_port_config config;
@@ -2968,9 +2954,7 @@ struct ionic_port_info {
};
};
u8 rsvd[376];
-
- /* pb_stats must start at 2k offset */
- struct ionic_port_pb_stats pb_stats;
+ struct ionic_port_extra_stats extra_stats;
};
/*
--
2.17.1
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox