* Re: [net-next PATCH v5 1/4] octeontx2-af: npa: cn20k: Add NPA Halo support
From: Subbaraya Sundeep @ 2026-04-10 10:11 UTC (permalink / raw)
To: Alexander Lobakin
Cc: andrew+netdev, davem, edumazet, kuba, pabeni, sgoutham, gakula,
bbhushan2, netdev, linux-kernel, Linu Cherian
In-Reply-To: <1dc56269-d6d5-48da-a4c2-0686ce4fd1f6@intel.com>
On 2026-04-10 at 15:06:56, Alexander Lobakin (aleksander.lobakin@intel.com) wrote:
> From: Subbaraya Sundeep <sbhatta@marvell.com>
> Date: Fri, 10 Apr 2026 15:05:36 +0530
>
> > On 2026-04-09 at 20:39:02, Alexander Lobakin (aleksander.lobakin@intel.com) wrote:
> >> From: Subbaraya Sundeep <sbhatta@marvell.com>
> >> Date: Thu, 9 Apr 2026 15:23:21 +0530
> >>
> >>> From: Linu Cherian <lcherian@marvell.com>
> >>>
> >>> CN20K silicon implements unified aura and pool context
> >>> type called Halo for better resource usage. Add support to
> >>> handle Halo context type operations.
> >>>
> >>> Signed-off-by: Linu Cherian <lcherian@marvell.com>
> >>> Signed-off-by: Subbaraya Sundeep <sbhatta@marvell.com>
> >>
> >> [...]
> >>
> >>> diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/struct.h b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/struct.h
> >>> index 763f6cabd7c2..2364bafd329d 100644
> >>> --- a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/struct.h
> >>> +++ b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/struct.h
> >>> @@ -377,4 +377,85 @@ struct npa_cn20k_pool_s {
> >>>
> >>> static_assert(sizeof(struct npa_cn20k_pool_s) == NIX_MAX_CTX_SIZE);
> >>>
> >>> +struct npa_cn20k_halo_s {
> >>> + u64 stack_base : 64;
> >>
> >> It's redundant to add : 64 to a 64-bit field.
> > Agreed. But this is for readability, it helps when checking HRM. For
> > instance HRM says [703:640] and we define as u64 reserved_640_703 : 64;
> > so that we do not have to count bits in mind.
> >> Moreover, on 32-bit systems, the compilers sometimes complain on
> >> bitfields > 32 bits.
> > This driver depends on 64BIT.
> >>
> >>> + u64 ena : 1;
> >>> + u64 nat_align : 1;
> >>> + u64 reserved_66_67 : 2;
> >>> + u64 stack_caching : 1;
> >>> + u64 reserved_69_71 : 3;
> >>> + u64 aura_drop_ena : 1;
> >>> + u64 reserved_73_79 : 7;
> >>> + u64 aura_drop : 8;
> >>> + u64 buf_offset : 12;
> >>> + u64 reserved_100_103 : 4;
> >>> + u64 buf_size : 12;
> >>> + u64 reserved_116_119 : 4;
> >>> + u64 ref_cnt_prof : 3;
> >>> + u64 reserved_123_127 : 5;
> >>> + u64 stack_max_pages : 32;
> >>> + u64 stack_pages : 32;
> >>> + u64 bp_0 : 7;
> >>> + u64 bp_1 : 7;
> >>> + u64 bp_2 : 7;
> >>> + u64 bp_3 : 7;
> >>> + u64 bp_4 : 7;
> >>> + u64 bp_5 : 7;
> >>> + u64 bp_6 : 7;
> >>> + u64 bp_7 : 7;
> >>> + u64 bp_ena_0 : 1;
> >>> + u64 bp_ena_1 : 1;
> >>> + u64 bp_ena_2 : 1;
> >>> + u64 bp_ena_3 : 1;
> >>> + u64 bp_ena_4 : 1;
> >>> + u64 bp_ena_5 : 1;
> >>> + u64 bp_ena_6 : 1;
> >>> + u64 bp_ena_7 : 1;
> >>> + u64 stack_offset : 4;
> >>> + u64 reserved_260_263 : 4;
> >>> + u64 shift : 6;
> >>> + u64 reserved_270_271 : 2;
> >>> + u64 avg_level : 8;
> >>> + u64 avg_con : 9;
> >>> + u64 fc_ena : 1;
> >>> + u64 fc_stype : 2;
> >>> + u64 fc_hyst_bits : 4;
> >>> + u64 fc_up_crossing : 1;
> >>> + u64 reserved_297_299 : 3;
> >>> + u64 update_time : 16;
> >>> + u64 reserved_316_319 : 4;
> >>> + u64 fc_addr : 64;
> >>> + u64 ptr_start : 64;
> >>> + u64 ptr_end : 64;
> >>> + u64 bpid_0 : 12;
> >>> + u64 reserved_524_535 : 12;
> >>> + u64 err_int : 8;
> >>> + u64 err_int_ena : 8;
> >>> + u64 thresh_int : 1;
> >>> + u64 thresh_int_ena : 1;
> >>> + u64 thresh_up : 1;
> >>> + u64 reserved_555 : 1;
> >>> + u64 thresh_qint_idx : 7;
> >>> + u64 reserved_563 : 1;
> >>> + u64 err_qint_idx : 7;
> >>> + u64 reserved_571_575 : 5;
> >>> + u64 thresh : 36;
> >>> + u64 reserved_612_615 : 4;
> >>> + u64 fc_msh_dst : 11;
> >>> + u64 reserved_627_630 : 4;
> >>> + u64 op_dpc_ena : 1;
> >>> + u64 op_dpc_set : 5;
> >>> + u64 reserved_637_637 : 1;
> >>> + u64 stream_ctx : 1;
> >>> + u64 unified_ctx : 1;
> >>> + u64 reserved_640_703 : 64;
> >>> + u64 reserved_704_767 : 64;
> >>> + u64 reserved_768_831 : 64;
> >>> + u64 reserved_832_895 : 64;
> >>> + u64 reserved_896_959 : 64;
> >>> + u64 reserved_960_1023 : 64;
> >>> +};
> >>> +
> >>> +static_assert(sizeof(struct npa_cn20k_halo_s) == NIX_MAX_CTX_SIZE);
> >>
> >> Now the main question:
> >>
> >> Is mailbox's Endianness fixed (LE/BE)? Or is it always the same as the
> >> host's ones (I doubt so)?
> >> If not, these need to be __le{8,16,32,64} (or __be if it's Big Endian)
> >> and you need to handle the conversions manually.
> >>
> > Yes endianness is LE and fixed. This is NOT a host side driver for an
> > endpoint card. This is driver for on chip PCI device of CN20K soc.
> > Hope I answered your question wrt host.
>
> But the mailbox is shared between the SoC and the host or HW or not? Is
In hardware it is just a shared DDR region between two on-chip devices, and both
devices access the shared region using their BARs.
> it possible that one client of the mailbox will have LE and the second
> will have BE?
No, not possible.
Thanks,
Sundeep
>
> >
> > Thanks,
> > Sundeep
>
> Thanks,
> Olek
^ permalink raw reply
* [bug report] octeontx2-af: npc: cn20k: add debugfs support
From: Dan Carpenter @ 2026-04-10 10:12 UTC (permalink / raw)
To: Ratheesh Kannoth; +Cc: netdev
Hello Ratheesh Kannoth,
Commit 528530dff56b ("octeontx2-af: npc: cn20k: add debugfs support")
from Feb 24, 2026 (linux-next), leads to the following Smatch static
checker warning:
drivers/net/ethernet/marvell/octeontx2/af/cn20k/debugfs.c:257 npc_cn20k_debugfs_init() warn: 'npc_dentry' is an error pointer or valid
drivers/net/ethernet/marvell/octeontx2/af/cn20k/debugfs.c:263 npc_cn20k_debugfs_init() warn: 'npc_dentry' is an error pointer or valid
drivers/net/ethernet/marvell/octeontx2/af/cn20k/debugfs.c:268 npc_cn20k_debugfs_init() warn: 'npc_dentry' is an error pointer or valid
drivers/net/ethernet/marvell/octeontx2/af/cn20k/debugfs.c:273 npc_cn20k_debugfs_init() warn: 'npc_dentry' is an error pointer or valid
drivers/net/ethernet/marvell/octeontx2/af/cn20k/debugfs.c:278 npc_cn20k_debugfs_init() warn: 'npc_dentry' is an error pointer or valid
drivers/net/ethernet/marvell/octeontx2/af/cn20k/debugfs.c
249 int npc_cn20k_debugfs_init(struct rvu *rvu)
250 {
251 struct npc_priv_t *npc_priv = npc_priv_get();
252 struct dentry *npc_dentry;
253
254 npc_dentry = debugfs_create_file("mcam_layout", 0444, rvu->rvu_dbg.npc,
255 npc_priv, &npc_mcam_layout_fops);
256
257 if (!npc_dentry)
258 return -EFAULT;
This error checking is wrong, but instead of fixing it, just delete it.
See my blog for details:
https://staticthinking.wordpress.com/2023/07/24/debugfs-functions-are-not-supposed-to-be-checked/
259
260 npc_dentry = debugfs_create_file("mcam_default", 0444, rvu->rvu_dbg.npc,
261 rvu, &npc_mcam_default_fops);
262
263 if (!npc_dentry)
264 return -EFAULT;
265
266 npc_dentry = debugfs_create_file("vidx2idx", 0444, rvu->rvu_dbg.npc,
267 npc_priv, &npc_vidx2idx_map_fops);
268 if (!npc_dentry)
269 return -EFAULT;
270
271 npc_dentry = debugfs_create_file("idx2vidx", 0444, rvu->rvu_dbg.npc,
272 npc_priv, &npc_idx2vidx_map_fops);
273 if (!npc_dentry)
274 return -EFAULT;
275
276 npc_dentry = debugfs_create_file("defrag", 0444, rvu->rvu_dbg.npc,
277 npc_priv, &npc_defrag_fops);
--> 278 if (!npc_dentry)
279 return -EFAULT;
280
281 return 0;
282 }
This email is a free service from the Smatch-CI project [smatch.sf.net].
regards,
dan carpenter
^ permalink raw reply
* [bug report] octeontx2-af: npc: cn20k: virtual index support
From: Dan Carpenter @ 2026-04-10 10:12 UTC (permalink / raw)
To: Ratheesh Kannoth; +Cc: netdev
Hello Ratheesh Kannoth,
Commit 645c6e3c1999 ("octeontx2-af: npc: cn20k: virtual index
support") from Feb 24, 2026 (linux-next), leads to the following
Smatch static checker warning:
drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c:3534 npc_defrag_alloc_free_slots()
warn: missing error code 'rc'
drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c
3479 static int npc_defrag_alloc_free_slots(struct rvu *rvu,
3480 struct npc_defrag_node *f,
3481 int cnt, u16 *save)
3482 {
3483 int alloc_cnt1, alloc_cnt2;
3484 struct npc_subbank *sb;
3485 int rc, sb_off, i;
3486 bool deleted;
3487
3488 sb = &npc_priv.sb[f->idx];
3489
3490 alloc_cnt1 = 0;
3491 alloc_cnt2 = 0;
3492
3493 rc = __npc_subbank_alloc(rvu, sb,
3494 NPC_MCAM_KEY_X2, sb->b0b,
3495 sb->b0t,
3496 NPC_MCAM_LOWER_PRIO,
3497 false, cnt, save, cnt, true,
3498 &alloc_cnt1);
3499 if (alloc_cnt1 < cnt) {
3500 rc = __npc_subbank_alloc(rvu, sb,
3501 NPC_MCAM_KEY_X2, sb->b1b,
3502 sb->b1t,
3503 NPC_MCAM_LOWER_PRIO,
3504 false, cnt - alloc_cnt1,
3505 save + alloc_cnt1,
3506 cnt - alloc_cnt1,
3507 true, &alloc_cnt2);
3508 }
3509
3510 if (alloc_cnt1 + alloc_cnt2 != cnt) {
3511 dev_err(rvu->dev,
3512 "%s: Failed to alloc cnt=%u alloc_cnt1=%u alloc_cnt2=%u\n",
3513 __func__, cnt, alloc_cnt1, alloc_cnt2);
3514 goto fail_free_alloc;
3515 }
3516 return 0;
3517
3518 fail_free_alloc:
3519 for (i = 0; i < alloc_cnt1 + alloc_cnt2; i++) {
3520 rc = npc_mcam_idx_2_subbank_idx(rvu, save[i],
3521 &sb, &sb_off);
3522 if (rc) {
3523 dev_err(rvu->dev,
3524 "%s: Error to find subbank for mcam idx=%u\n",
3525 __func__, save[i]);
3526 break;
3527 }
3528
3529 deleted = __npc_subbank_free(rvu, sb, sb_off);
3530 if (!deleted) {
3531 dev_err(rvu->dev,
3532 "%s: Error to free mcam idx=%u\n",
3533 __func__, save[i]);
--> 3534 break;
Set an error code here?
3535 }
3536 }
3537
3538 return rc;
3539 }
This email is a free service from the Smatch-CI project [smatch.sf.net].
regards,
dan carpenter
^ permalink raw reply
* [bug report] [NET]: Add Tehuti network driver.
From: Dan Carpenter @ 2026-04-10 10:13 UTC (permalink / raw)
To: Andy Gospodarek; +Cc: netdev
Hello Andy Gospodarek,
Commit 1a348ccc1047 ("[NET]: Add Tehuti network driver.") from Sep
17, 2007 (linux-next), leads to the following Smatch static checker
warning:
drivers/net/ethernet/tehuti/tehuti.c:2210 bdx_set_coalesce()
warn: no lower bound on 'rx_max_coal' rl='s32min-15'
drivers/net/ethernet/tehuti/tehuti.c
2179 static int bdx_set_coalesce(struct net_device *netdev,
2180 struct ethtool_coalesce *ecoal,
2181 struct kernel_ethtool_coalesce *kernel_coal,
2182 struct netlink_ext_ack *extack)
2183 {
2184 u32 rdintcm;
2185 u32 tdintcm;
2186 struct bdx_priv *priv = netdev_priv(netdev);
2187 int rx_coal;
2188 int tx_coal;
2189 int rx_max_coal;
2190 int tx_max_coal;
2191
2192 /* Check for valid input */
2193 rx_coal = ecoal->rx_coalesce_usecs / INT_COAL_MULT;
2194 tx_coal = ecoal->tx_coalesce_usecs / INT_COAL_MULT;
2195 rx_max_coal = ecoal->rx_max_coalesced_frames;
2196 tx_max_coal = ecoal->tx_max_coalesced_frames;
2197
2198 /* Translate from packets to multiples of FIFO bytes */
2199 rx_max_coal =
2200 (((rx_max_coal * sizeof(struct rxf_desc)) + PCK_TH_MULT - 1)
2201 / PCK_TH_MULT);
2202 tx_max_coal =
2203 (((tx_max_coal * BDX_TXF_DESC_SZ) + PCK_TH_MULT - 1)
2204 / PCK_TH_MULT);
2205
2206 if ((rx_coal > 0x7FFF) || (tx_coal > 0x7FFF) ||
2207 (rx_max_coal > 0xF) || (tx_max_coal > 0xF))
Check rx_max_coal and tx_max_coal for negative values?
2208 return -EINVAL;
2209
--> 2210 rdintcm = INT_REG_VAL(rx_coal, GET_INT_COAL_RC(priv->rdintcm),
2211 GET_RXF_TH(priv->rdintcm), rx_max_coal);
^^^^^^^^^^^
2212 tdintcm = INT_REG_VAL(tx_coal, GET_INT_COAL_RC(priv->tdintcm), 0,
2213 tx_max_coal);
2214
2215 priv->rdintcm = rdintcm;
2216 priv->tdintcm = tdintcm;
2217
2218 WRITE_REG(priv, regRDINTCM0, rdintcm);
2219 WRITE_REG(priv, regTDINTCM0, tdintcm);
2220
2221 return 0;
2222 }
This email is a free service from the Smatch-CI project [smatch.sf.net].
regards,
dan carpenter
^ permalink raw reply
* [PATCH nf] netfilter: nf_tables: use RCU-safe list primitives for basechain hook list
From: Weiming Shi @ 2026-04-10 10:13 UTC (permalink / raw)
To: Pablo Neira Ayuso, Florian Westphal, David S . Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni
Cc: Phil Sutter, Simon Horman, netfilter-devel, coreteam, netdev,
linux-kernel, Xiang Mei, Weiming Shi
NFT_MSG_GETCHAIN runs as an NFNL_CB_RCU callback, so chain dumps
traverse basechain->hook_list under rcu_read_lock() without holding
commit_mutex. Meanwhile, nft_delchain_hook() mutates that same live
hook_list with plain list_move() and list_splice(), and the commit/abort
paths splice hooks back with plain list_splice(). None of these are
RCU-safe list operations.
A concurrent GETCHAIN dump can observe partially updated list pointers,
follow them into stack-local or transaction-private list heads, and
crash when container_of() produces a bogus struct nft_hook pointer.
The PoC triggers this by racing GETCHAIN dumps against aborting DELCHAIN
hook updates, reachable from an unprivileged user namespace since all
capability checks use ns_capable() with CONFIG_NF_TABLES=y (default):
Oops: general protection fault, probably for non-canonical address 0xdffffc0000000006: 0000 [#1] SMP KASAN NOPTI
KASAN: null-ptr-deref in range [0x0000000000000030-0x0000000000000037]
RIP: 0010:strlen (lib/string.c:420 (discriminator 1))
Call Trace:
<TASK>
nf_tables_fill_chain_info (net/netfilter/nf_tables_api.c:1987 (discriminator 1) net/netfilter/nf_tables_api.c:1992 (discriminator 1) net/netfilter/nf_tables_api.c:2028 (discriminator 1) net/netfilter/nf_tables_api.c:2077 (discriminator 1))
nf_tables_dump_chains (net/netfilter/nf_tables_api.c:2173 (discriminator 1))
netlink_dump (net/netlink/af_netlink.c:2325 (discriminator 1))
__netlink_dump_start (net/netlink/af_netlink.c:2442)
nf_tables_getchain (net/netfilter/nf_tables_api.c:1314 net/netfilter/nf_tables_api.c:2212)
nfnetlink_rcv_msg (net/netfilter/nfnetlink.c:290)
netlink_rcv_skb (net/netlink/af_netlink.c:2550)
nfnetlink_rcv (net/netfilter/nfnetlink.c:653)
netlink_unicast (net/netlink/af_netlink.c:1319 net/netlink/af_netlink.c:1344)
netlink_sendmsg (net/netlink/af_netlink.c:1894)
__sys_sendto (net/socket.c:727 net/socket.c:742 net/socket.c:2206)
__x64_sys_sendto (net/socket.c:2209)
</TASK>
Replace list_move() in nft_delchain_hook() with list_del_rcu() plus an
intermediate pointer array, followed by synchronize_rcu() before the
deleted hooks' list pointers are reused to link them into the
transaction's private list. In the error paths, put hooks back with
list_add_tail_rcu() which is safe for concurrent RCU readers (they
either continue to the original successor or see the list head and
terminate the walk).
Add nft_hook_list_splice_rcu() helper that splices entries from a
private list into a live RCU-protected list using individual
list_add_tail_rcu() calls instead of plain list_splice(). Use it in
the commit and abort paths for NEWCHAIN updates and DELCHAIN rollback.
Fixes: 7d937b107108 ("netfilter: nf_tables: support for deleting devices in an existing netdev chain")
Reported-by: Xiang Mei <xmei5@asu.edu>
Signed-off-by: Weiming Shi <bestswngs@gmail.com>
---
net/netfilter/nf_tables_api.c | 64 ++++++++++++++++++++++++++++++-----
1 file changed, 56 insertions(+), 8 deletions(-)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 8c42247a176c7..62fcfefba7b0f 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -391,6 +391,22 @@ static void nft_netdev_unregister_hooks(struct net *net,
}
}
+/* Splice hooks from a private list into a live (RCU-protected) hook list.
+ * Each entry is published individually via list_add_tail_rcu() so that
+ * concurrent RCU readers walking the destination list never observe torn
+ * list pointers.
+ */
+static void nft_hook_list_splice_rcu(struct list_head *from,
+ struct list_head *to)
+{
+ struct nft_hook *hook, *next;
+
+ list_for_each_entry_safe(hook, next, from, list) {
+ list_del(&hook->list);
+ list_add_tail_rcu(&hook->list, to);
+ }
+}
+
static int nf_tables_register_hook(struct net *net,
const struct nft_table *table,
struct nft_chain *chain)
@@ -3162,9 +3178,11 @@ static int nft_delchain_hook(struct nft_ctx *ctx,
const struct nlattr * const *nla = ctx->nla;
struct nft_chain_hook chain_hook = {};
struct nft_hook *this, *hook;
+ struct nft_hook **del_hooks;
LIST_HEAD(chain_del_list);
struct nft_trans *trans;
- int err;
+ int err, n = 0, i;
+ int max_hooks = 0;
if (ctx->table->flags & __NFT_TABLE_F_UPDATE)
return -EOPNOTSUPP;
@@ -3174,19 +3192,38 @@ static int nft_delchain_hook(struct nft_ctx *ctx,
if (err < 0)
return err;
+ list_for_each_entry(this, &chain_hook.list, list)
+ max_hooks++;
+
+ del_hooks = kcalloc(max_hooks, sizeof(*del_hooks), GFP_KERNEL);
+ if (!del_hooks) {
+ nft_chain_release_hook(&chain_hook);
+ return -ENOMEM;
+ }
+
list_for_each_entry(this, &chain_hook.list, list) {
hook = nft_hook_list_find(&basechain->hook_list, this);
if (!hook) {
err = -ENOENT;
goto err_chain_del_hook;
}
- list_move(&hook->list, &chain_del_list);
+ list_del_rcu(&hook->list);
+ del_hooks[n++] = hook;
}
+ /* Wait for any concurrent RCU readers (e.g. GETCHAIN dumps walking
+ * basechain->hook_list) to finish before modifying the removed hooks'
+ * list pointers to link them into the transaction's private list.
+ */
+ synchronize_rcu();
+
+ for (i = 0; i < n; i++)
+ list_add_tail(&del_hooks[i]->list, &chain_del_list);
+
trans = nft_trans_alloc_chain(ctx, NFT_MSG_DELCHAIN);
if (!trans) {
err = -ENOMEM;
- goto err_chain_del_hook;
+ goto err_chain_add_back;
}
nft_trans_basechain(trans) = basechain;
@@ -3194,13 +3231,24 @@ static int nft_delchain_hook(struct nft_ctx *ctx,
INIT_LIST_HEAD(&nft_trans_chain_hooks(trans));
list_splice(&chain_del_list, &nft_trans_chain_hooks(trans));
nft_chain_release_hook(&chain_hook);
+ kfree(del_hooks);
nft_trans_commit_list_add_tail(ctx->net, trans);
return 0;
+err_chain_add_back:
+ for (i = 0; i < n; i++)
+ list_add_tail_rcu(&del_hooks[i]->list, &basechain->hook_list);
+ kfree(del_hooks);
+ nft_chain_release_hook(&chain_hook);
+
+ return err;
+
err_chain_del_hook:
- list_splice(&chain_del_list, &basechain->hook_list);
+ for (i = 0; i < n; i++)
+ list_add_tail_rcu(&del_hooks[i]->list, &basechain->hook_list);
+ kfree(del_hooks);
nft_chain_release_hook(&chain_hook);
return err;
@@ -10912,8 +10960,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
nft_chain_commit_update(nft_trans_container_chain(trans));
nf_tables_chain_notify(&ctx, NFT_MSG_NEWCHAIN,
&nft_trans_chain_hooks(trans));
- list_splice(&nft_trans_chain_hooks(trans),
- &nft_trans_basechain(trans)->hook_list);
+ nft_hook_list_splice_rcu(&nft_trans_chain_hooks(trans),
+ &nft_trans_basechain(trans)->hook_list);
/* trans destroyed after rcu grace period */
} else {
nft_chain_commit_drop_policy(nft_trans_container_chain(trans));
@@ -11231,8 +11279,8 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action)
case NFT_MSG_DELCHAIN:
case NFT_MSG_DESTROYCHAIN:
if (nft_trans_chain_update(trans)) {
- list_splice(&nft_trans_chain_hooks(trans),
- &nft_trans_basechain(trans)->hook_list);
+ nft_hook_list_splice_rcu(&nft_trans_chain_hooks(trans),
+ &nft_trans_basechain(trans)->hook_list);
} else {
nft_use_inc_restore(&table->use);
nft_clear(trans->net, nft_trans_chain(trans));
--
2.43.0
^ permalink raw reply related
* [bug report] ipv4: icmp: fix null-ptr-deref in icmp_build_probe()
From: Dan Carpenter @ 2026-04-10 10:16 UTC (permalink / raw)
To: Yiqi Sun, Fernando Fernandez Mancera; +Cc: Simon Horman, netdev
Hello Yiqi Sun,
Commit fde29fd93493 ("ipv4: icmp: fix null-ptr-deref in
icmp_build_probe()") from Apr 2, 2026 (linux-next), leads to the
following Smatch static checker warning:
net/ipv4/icmp.c:1351 icmp_build_probe()
warn: 'dev' is not an error pointer
net/ipv4/icmp.c
1341 #if IS_ENABLED(CONFIG_IPV6)
1342 case ICMP_AFI_IP6:
1343 if (iio->ident.addr.ctype3_hdr.addrlen != sizeof(struct in6_addr))
1344 goto send_mal_query;
1345 dev = ipv6_dev_find(net, &iio->ident.addr.ip_addr.ipv6_addr, dev);
1346
1347 /*
1348 * If IPv6 identifier lookup is unavailable, silently
1349 * discard the request instead of misreporting NO_IF.
1350 */
--> 1351 if (IS_ERR(dev))
1352 return false;
It looks like there were two patches that went in around the same
time. Commit fde29fd93493 ("ipv4: icmp: fix null-ptr-deref in
icmp_build_probe()") updated the checking for
ipv6_stub->ipv6_dev_find() but d98adfbdd5c0 ("ipv4: drop ipv6_stub usage
and use direct function calls") changed it to not return error pointers.
This IS_ERR() check can be removed.
1353
1354 dev_hold(dev);
1355 break;
1356 #endif
1357 default:
1358 goto send_mal_query;
1359 }
This email is a free service from the Smatch-CI project [smatch.sf.net].
regards,
dan carpenter
^ permalink raw reply
* [bug report] xfrm: fix ip_rt_bug race in icmp_route_lookup reverse path
From: Dan Carpenter @ 2026-04-10 10:16 UTC (permalink / raw)
To: Jiayuan Chen; +Cc: Simon Horman, netdev
Hello Jiayuan Chen,
Commit 81b84de32bb2 ("xfrm: fix ip_rt_bug race in icmp_route_lookup
reverse path") from Feb 6, 2026 (linux-next), leads to the following
Smatch static checker warning:
net/ipv4/icmp.c:587 icmp_route_lookup()
error: we previously assumed 'rt2' could be null (see line 576)
net/ipv4/icmp.c
491 static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4,
492 struct sk_buff *skb_in,
493 const struct iphdr *iph, __be32 saddr,
494 dscp_t dscp, u32 mark, int type,
495 int code, struct icmp_bxm *param)
496 {
497 struct net_device *route_lookup_dev;
498 struct dst_entry *dst, *dst2;
499 struct rtable *rt, *rt2;
500 struct flowi4 fl4_dec;
501 int err;
502
503 memset(fl4, 0, sizeof(*fl4));
504 fl4->daddr = (param->replyopts.opt.srr ?
505 param->replyopts.opt.faddr : iph->saddr);
506 fl4->saddr = saddr;
507 fl4->flowi4_mark = mark;
508 fl4->flowi4_uid = sock_net_uid(net, NULL);
509 fl4->flowi4_dscp = dscp;
510 fl4->flowi4_proto = IPPROTO_ICMP;
511 fl4->fl4_icmp_type = type;
512 fl4->fl4_icmp_code = code;
513 route_lookup_dev = icmp_get_route_lookup_dev(skb_in);
514 fl4->flowi4_oif = l3mdev_master_ifindex(route_lookup_dev);
515
516 security_skb_classify_flow(skb_in, flowi4_to_flowi_common(fl4));
517 rt = ip_route_output_key_hash(net, fl4, skb_in);
518 if (IS_ERR(rt))
519 return rt;
520
521 /* No need to clone since we're just using its address. */
522 rt2 = rt;
523
524 dst = xfrm_lookup(net, &rt->dst,
525 flowi4_to_flowi(fl4), NULL, 0);
526 rt = dst_rtable(dst);
527 if (!IS_ERR(dst)) {
528 if (rt != rt2)
529 return rt;
530 if (inet_addr_type_dev_table(net, route_lookup_dev,
531 fl4->daddr) == RTN_LOCAL)
532 return rt;
533 } else if (PTR_ERR(dst) == -EPERM) {
534 rt = NULL;
535 } else {
536 return rt;
537 }
538 err = xfrm_decode_session_reverse(net, skb_in, flowi4_to_flowi(&fl4_dec), AF_INET);
539 if (err)
540 goto relookup_failed;
541
542 if (inet_addr_type_dev_table(net, route_lookup_dev,
543 fl4_dec.saddr) == RTN_LOCAL) {
544 rt2 = __ip_route_output_key(net, &fl4_dec);
545 if (IS_ERR(rt2))
546 err = PTR_ERR(rt2);
547 } else {
548 struct flowi4 fl4_2 = {};
549 unsigned long orefdst;
550
551 fl4_2.daddr = fl4_dec.saddr;
552 rt2 = ip_route_output_key(net, &fl4_2);
553 if (IS_ERR(rt2)) {
554 err = PTR_ERR(rt2);
555 goto relookup_failed;
556 }
557 /* Ugh! */
558 orefdst = skb_dstref_steal(skb_in);
559 err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr,
560 dscp, rt2->dst.dev) ? -EINVAL : 0;
561
562 dst_release(&rt2->dst);
563 rt2 = skb_rtable(skb_in);
564 /* steal dst entry from skb_in, don't drop refcnt */
565 skb_dstref_steal(skb_in);
566 skb_dstref_restore(skb_in, orefdst);
567
568 /*
569 * At this point, fl4_dec.daddr should NOT be local (we
570 * checked fl4_dec.saddr above). However, a race condition
571 * may occur if the address is added to the interface
572 * concurrently. In that case, ip_route_input() returns a
573 * LOCAL route with dst.output=ip_rt_bug, which must not
574 * be used for output.
575 */
576 if (!err && rt2 && rt2->rt_type == RTN_LOCAL) {
^^^
Can rt2 really be NULL here?
577 net_warn_ratelimited("detected local route for %pI4 during ICMP sending, src %pI4\n",
578 &fl4_dec.daddr, &fl4_dec.saddr);
579 dst_release(&rt2->dst);
580 err = -EINVAL;
581 }
582 }
583
584 if (err)
585 goto relookup_failed;
586
--> 587 dst2 = xfrm_lookup(net, &rt2->dst, flowi4_to_flowi(&fl4_dec), NULL,
^^^^^^^^^
Because, if so, then we are screwed here.
588 XFRM_LOOKUP_ICMP);
589 rt2 = dst_rtable(dst2);
590 if (!IS_ERR(dst2)) {
591 dst_release(&rt->dst);
592 rt = rt2;
593 } else if (PTR_ERR(dst2) == -EPERM) {
594 if (rt)
595 dst_release(&rt->dst);
596 return rt2;
597 } else {
598 err = PTR_ERR(dst2);
599 goto relookup_failed;
600 }
601 return rt;
602
603 relookup_failed:
604 if (rt)
605 return rt;
606 return ERR_PTR(err);
607 }
This email is a free service from the Smatch-CI project [smatch.sf.net].
regards,
dan carpenter
^ permalink raw reply
* Re: [PATCH net-next v3 09/12] net: airoha: Support multiple net_devices for a single FE GDM port
From: Lorenzo Bianconi @ 2026-04-10 10:19 UTC (permalink / raw)
To: Jakub Kicinski
Cc: andrew+netdev, davem, edumazet, pabeni, robh, krzk+dt, conor+dt,
ansuelsmth, benjamin.larsson, linux-arm-kernel, linux-mediatek,
netdev, devicetree
In-Reply-To: <20260410025724.1810007-1-kuba@kernel.org>
[-- Attachment #1: Type: text/plain, Size: 8640 bytes --]
On Apr 09, Jakub Kicinski wrote:
> This is an AI-generated review of your patch. The human sending this
> email says "For your consideration."
> ---
> net: airoha: Support multiple net_devices for a single FE GDM port
>
> This commit introduces support for mapping multiple net_device interfaces
> to a single Frame Engine (FE) GDM port via an external hardware arbiter.
> It updates the rx/tx paths, port allocation, and registration logic to
> handle the new net_device array.
>
> > diff --git a/drivers/net/ethernet/airoha/airoha_eth.c b/drivers/net/ethernet/airoha/airoha_eth.c
> > --- a/drivers/net/ethernet/airoha/airoha_eth.c
> > +++ b/drivers/net/ethernet/airoha/airoha_eth.c
> [ ... ]
> > @@ -615,9 +617,8 @@ static int airoha_qdma_rx_process(struct airoha_queue *q, int budget)
> > u32 hash, reason, msg1 = le32_to_cpu(desc->msg1);
> > struct page *page = virt_to_head_page(e->buf);
> > u32 desc_ctrl = le32_to_cpu(desc->ctrl);
> > - struct airoha_gdm_port *port;
> > - struct net_device *netdev;
> > - int data_len, len, p;
> > + struct airoha_gdm_dev *dev;
> > + int data_len, len;
> >
> > if (!(desc_ctrl & QDMA_DESC_DONE_MASK))
> > break;
>
> While looking at this code, I noticed an issue that predates this patch.
> Can the lack of a memory barrier after checking the descriptor ownership bit
> cause stale reads?
>
> The driver checks QDMA_DESC_DONE_MASK to verify that the hardware has finished
> writing the descriptor. However, it reads desc->msg1 before this check and
> without a dma_rmb():
>
> u32 hash, reason, msg1 = le32_to_cpu(desc->msg1);
> struct page *page = virt_to_head_page(e->buf);
> u32 desc_ctrl = le32_to_cpu(desc->ctrl);
>
> if (!(desc_ctrl & QDMA_DESC_DONE_MASK))
> break;
>
> Since weakly ordered architectures can reorder memory reads, could the CPU
> prefetch msg1 before the DMA engine writes the new descriptor values, leading
> to routing the packet to the wrong gdm_dev?
This issue has already been fixed by the following commit:
commit 4ae0604a0673e11e2075b178387151fcad5111b5
Author: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Tue Apr 7 08:48:04 2026 +0200
net: airoha: Add dma_rmb() and READ_ONCE() in airoha_qdma_rx_process()
Add missing dma_rmb() in airoha_qdma_rx_process routine to make sure the
DMA read operations are completed when the NIC reports the processing on
the current descriptor is done. Moreover, add missing READ_ONCE() in
airoha_qdma_rx_process() for DMA descriptor control fields in order to
avoid any compiler reordering.
>
> [ ... ]
> > @@ -1764,9 +1765,7 @@ static int airoha_set_gdm2_loopback(struct airoha_gdm_dev *dev)
> > airoha_fe_clear(eth, REG_FE_VIP_PORT_EN, BIT(AIROHA_GDM2_IDX));
> > airoha_fe_clear(eth, REG_FE_IFC_PORT_EN, BIT(AIROHA_GDM2_IDX));
> >
> > - /* XXX: handle XSI_USB_PORT and XSI_PCE1_PORT */
> > - nbq = port->id == AIROHA_GDM3_IDX && airoha_is_7581(eth) ? 4 : 0;
> > - src_port = eth->soc->ops.get_src_port_id(port, nbq);
> > + src_port = eth->soc->ops.get_src_port_id(port, dev->nbq);
> > if (src_port < 0)
> > return src_port;
>
> Can configuring the loopback for one device break connectivity for other
> devices sharing the same port?
>
> When multiple net_devices (like PCIE0 and PCIE1 on GDM3) open and call
> airoha_set_gdm2_loopback(), they modify global hardware registers such as
> REG_FE_WAN_PORT.
>
> Since WAN0_MASK is unconditionally updated to track the current device's
> src_port:
>
> airoha_fe_rmw(eth, REG_FE_WAN_PORT,
> WAN1_EN_MASK | WAN1_MASK | WAN0_MASK,
> FIELD_PREP(WAN0_MASK, src_port));
>
> won't the last configured device clobber the hardware state for the others?
This issue is fixed with the following patch in the series:
commit 008995bd8f39c5fdf37a933fa3c962e7ef52e8c6
Author: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Fri Apr 3 12:07:27 2026 +0200
net: airoha: Introduce WAN device flag
Introduce WAN flag to specify if a given device is used to transmit/receive
WAN or LAN traffic. Current codebase supports specifying LAN/WAN device
configuration in ndo_init() callback during device bootstrap.
Please note it is possible to specify multiple LAN devices but just a
single WAN one.
>
> [ ... ]
> > @@ -1987,7 +1986,8 @@ static netdev_tx_t airoha_dev_xmit(struct sk_buff *skb,
> > }
> >
> > fport = airoha_get_fe_port(dev);
> > - msg1 = FIELD_PREP(QDMA_ETH_TXMSG_FPORT_MASK, fport) |
> > + msg1 = FIELD_PREP(QDMA_ETH_TXMSG_NBOQ_MASK, dev->nbq) |
> > + FIELD_PREP(QDMA_ETH_TXMSG_FPORT_MASK, fport) |
> > FIELD_PREP(QDMA_ETH_TXMSG_METER_MASK, 0x7f);
> >
> > q = &qdma->q_tx[qid];
>
> Does sharing the hardware QDMA TX queues among multiple net_device instances
> introduce a permanent transmit stall?
>
> In airoha_dev_xmit(), if a shared hardware queue is full, the driver stops the
> netdev_queue of the specific netdev making the call:
>
> if (q->queued + nr_frags >= q->ndesc) {
> /* not enough space in the queue */
> netif_tx_stop_queue(txq);
> spin_unlock_bh(&q->lock);
> return NETDEV_TX_BUSY;
> }
>
> However, the TX completion handler (airoha_tx_irq_process()) only wakes the
> queue of the netdev attached to the completed packet. If an interface's TX
> queue is stopped but it has no pending packets in the hardware queue (because
> another interface filled it), will it ever get woken up?
This issue is orthogonal to this series and it will be fixed with a dedicated
patch.
>
> [ ... ]
> > @@ -2955,11 +2958,25 @@ static int airoha_alloc_gdm_device(struct airoha_eth *eth,
> > netdev->dev_addr);
> > }
> >
> > + /* Allowed nbq for EN7581 on GDM3 port are 4 and 5 for PCIE0
> > + * and PCIE1 respectively.
> > + */
> > + index = nbq;
> > + if (index && airoha_is_7581(eth) && port->id == AIROHA_GDM3_IDX)
> > + index -= 4;
> > +
> > + if (index >= ARRAY_SIZE(port->devs) || port->devs[index]) {
> > + dev_err(eth->dev, "invalid nbq id: %d\n", nbq);
> > + return -EINVAL;
> > + }
> > +
> > + netdev->dev.of_node = of_node_get(np);
> > dev = netdev_priv(netdev);
>
> Does this code leak the of_node reference?
>
> The node reference is explicitly incremented via of_node_get(np). The networking
> core does not automatically drop explicit of_node references on free_netdev().
> Without a matching of_node_put() in the error paths or the driver's teardown
> function (airoha_remove()), will this reference permanently leak when the device
> is destroyed?
ack, I will fix it in v4.
>
> [ ... ]
> > @@ -3003,7 +3021,43 @@ static int airoha_alloc_gdm_port(struct airoha_eth *eth,
> > if (err)
> > return err;
> >
> > - return airoha_alloc_gdm_device(eth, port, np);
> > + for_each_child_of_node(np, node) {
> > + /* Multiple external serdes connected to the FE GDM port via an
> > + * external arbiter.
> > + */
> > + const __be32 *nbq_ptr;
> > + int nbq;
> > +
> > + if (!of_device_is_compatible(node, "airoha,eth-port"))
> > + continue;
> > +
> > + d++;
> > + if (!of_device_is_available(node))
> > + continue;
> > +
> > + nbq_ptr = of_get_property(node, "reg", NULL);
> > + if (!nbq_ptr) {
> > + dev_err(eth->dev, "missing nbq id\n");
> > + of_node_put(node);
> > + return -EINVAL;
> > + }
> > +
> > + /* Verify the provided nbq parameter is valid */
> > + nbq = be32_to_cpup(nbq_ptr);
> > + err = eth->soc->ops.get_src_port_id(port, nbq);
> > + if (err < 0) {
> > + of_node_put(node);
> > + return err;
> > + }
> > +
> > + err = airoha_alloc_gdm_device(eth, port, nbq, node);
> > + if (err) {
> > + of_node_put(node);
> > + return err;
> > + }
> > + }
> > +
> > + return !d ? airoha_alloc_gdm_device(eth, port, 0, np) : 0;
> > }
>
> Does passing 0 for nbq in the fallback path break older device trees on the
> EN7581 SoC?
>
> Later, when the interface is brought up, airoha_set_gdm2_loopback() calls
> airoha_en7581_get_src_port_id(), which returns -EINVAL for GDM3 if nbq is
> not 4 or 5:
>
> switch (port->id) {
> case AIROHA_GDM3_IDX:
> /* 7581 SoC supports PCIe serdes on GDM3 port */
> if (nbq == 4)
> return HSGMII_LAN_7581_PCIE0_SRCPORT;
> if (nbq == 5)
> return HSGMII_LAN_7581_PCIE1_SRCPORT;
> break;
>
> If nbq is hardcoded to 0 for legacy DTs, won't this cause interface
> initialization to fail unconditionally for GDM3?
ack, I will fix it in v4.
Regards,
Lorenzo
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]
^ permalink raw reply
* Re: [PATCH net] netrom: do some basic forms of validation on incoming frames
From: Simon Horman @ 2026-04-10 10:28 UTC (permalink / raw)
To: Greg Kroah-Hartman
Cc: Jakub Kicinski, netdev, linux-kernel, David S. Miller,
Eric Dumazet, Paolo Abeni, linux-hams, Yizhe Zhuang, stable
In-Reply-To: <2026041026-excuse-slashing-c4ee@gregkh>
On Fri, Apr 10, 2026 at 07:24:36AM +0200, Greg Kroah-Hartman wrote:
> On Thu, Apr 09, 2026 at 08:32:35PM -0700, Jakub Kicinski wrote:
> > On Thu, 9 Apr 2026 20:03:28 +0100 Simon Horman wrote:
> > > I expect that checking skb->len isn't sufficient here
> > > and pskb_may_pull needs to be used to ensure that
> > > the data is also available in the linear section of the skb.
> >
> > Or for simplicity we could also be testing against skb_headlen()
> > since we don't expect any legit non-linear frames here? Dunno.
Sure, that's fine by me if it leads to simpler code than
using pskb_may_pull(). Else I'd lean towards pskb_may_pull()
as it is a more general approach that feels worth proliferating.
> I'll be glad to change this either way, your call. Given that this is
> an obsolete protocol that seems to only be a target for drive-by fuzzers
> to attack, whatever the simplest thing to do to quiet them up I'll be
> glad to implement.
>
> Or can we just delete this stuff entirely? :)
Deleting sounds good to me.
But we likely need a deprecation process.
In which case fixing these bugs still makes sense for the short term.
^ permalink raw reply
* Re: [PATCH nf] netfilter: nf_tables: use RCU-safe list primitives for basechain hook list
From: Florian Westphal @ 2026-04-10 10:31 UTC (permalink / raw)
To: Weiming Shi
Cc: Pablo Neira Ayuso, David S . Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Phil Sutter, Simon Horman, netfilter-devel, coreteam,
netdev, linux-kernel, Xiang Mei
In-Reply-To: <20260410101321.915190-2-bestswngs@gmail.com>
Weiming Shi <bestswngs@gmail.com> wrote:
> NFT_MSG_GETCHAIN runs as an NFNL_CB_RCU callback, so chain dumps
> traverse basechain->hook_list under rcu_read_lock() without holding
> commit_mutex. Meanwhile, nft_delchain_hook() mutates that same live
> hook_list with plain list_move() and list_splice(), and the commit/abort
> paths splice hooks back with plain list_splice(). None of these are
> RCU-safe list operations.
>
> A concurrent GETCHAIN dump can observe partially updated list pointers,
> follow them into stack-local or transaction-private list heads, and
> crash when container_of() produces a bogus struct nft_hook pointer.
Right, but this is broken by design.
> Replace list_move() in nft_delchain_hook() with list_del_rcu() plus an
> intermediate pointer array, followed by synchronize_rcu() before the
> deleted hooks' list pointers are reused to link them into the
> transaction's private list. In the error paths, put hooks back with
> list_add_tail_rcu() which is safe for concurrent RCU readers (they
> either continue to the original successor or see the list head and
> terminate the walk).
I don't understand the existing code.
I don't even understand why
we have a difference between the 'update delete' and chain delete cases.
I think it's wrong to unlink and then relink on abort.
What prevents nft_delchain_hook() from using the normal approach done
by nft_delchain()...?
This existing code appears to be way too complex.
^ permalink raw reply
* Re: [PATCH net-next v3 10/12] net: airoha: Do not stop GDM port if it is shared
From: Lorenzo Bianconi @ 2026-04-10 10:35 UTC (permalink / raw)
To: Jakub Kicinski
Cc: andrew+netdev, davem, edumazet, pabeni, robh, krzk+dt, conor+dt,
ansuelsmth, benjamin.larsson, linux-arm-kernel, linux-mediatek,
netdev, devicetree, Xuegang Lu
In-Reply-To: <20260410025726.1810041-1-kuba@kernel.org>
[-- Attachment #1: Type: text/plain, Size: 4665 bytes --]
> This is an AI-generated review of your patch. The human sending this
> email says "For your consideration."
> ---
> net: airoha: Do not stop GDM port if it is shared
>
> Theoretically, in the current codebase, two independent net_devices can
> be connected to the same GDM port so we need to check the GDM port is not
> used by any other running net_device before setting the forward
> configuration to FE_PSE_PORT_DROP.
>
> Tested-by: Xuegang Lu <xuegang.lu@airoha.com>
>
> > diff --git a/drivers/net/ethernet/airoha/airoha_eth.c b/drivers/net/ethernet/airoha/airoha_eth.c
> > index fd27ba13029bd..5b0cd37b155e4 100644
> > --- a/drivers/net/ethernet/airoha/airoha_eth.c
> > +++ b/drivers/net/ethernet/airoha/airoha_eth.c
>
> [ ... ]
>
> > @@ -1688,18 +1693,18 @@ static int airoha_dev_stop(struct net_device *netdev)
> > struct airoha_gdm_dev *dev = netdev_priv(netdev);
> > struct airoha_gdm_port *port = dev->port;
> > struct airoha_qdma *qdma = dev->qdma;
> > - int i, err;
> > + int i;
> >
> > netif_tx_disable(netdev);
> > - err = airoha_set_vip_for_gdm_port(dev, false);
> > - if (err)
> > - return err;
> > -
> > for (i = 0; i < ARRAY_SIZE(qdma->q_tx); i++)
> > netdev_tx_reset_subqueue(netdev, i);
> >
> > - airoha_set_gdm_port_fwd_cfg(qdma->eth, REG_GDM_FWD_CFG(port->id),
> > - FE_PSE_PORT_DROP);
> > + if (atomic_dec_and_test(&port->users)) {
> > + airoha_set_vip_for_gdm_port(dev, false);
> > + airoha_set_gdm_port_fwd_cfg(qdma->eth,
> > + REG_GDM_FWD_CFG(port->id),
> > + FE_PSE_PORT_DROP);
> > + }
> >
> > if (atomic_dec_and_test(&qdma->users)) {
> > airoha_qdma_clear(qdma, REG_QDMA_GLOBAL_CFG,
> > GLOBAL_CFG_TX_DMA_EN_MASK |
> > GLOBAL_CFG_RX_DMA_EN_MASK);
> >
> > for (i = 0; i < ARRAY_SIZE(qdma->q_tx); i++) {
> > if (!qdma->q_tx[i].ndesc)
> > continue;
> >
> > airoha_qdma_cleanup_tx_queue(&qdma->q_tx[i]);
> > }
> > }
>
> This problem wasn't introduced by this patch, but does the QDMA teardown
> path have a DMA corruption vulnerability here?
>
> When GLOBAL_CFG_TX_DMA_EN_MASK is cleared, it fails to poll
> GLOBAL_CFG_TX_DMA_BUSY_MASK, proceeding to unmap memory while the hardware
> DMA engine may still be processing in-flight descriptors.
>
> Furthermore, airoha_qdma_cleanup_tx_queue() unmaps the DMA addresses and
> frees the SKBs, but it does not appear to clear the physical hardware
> descriptors or reset the hardware ring pointers (REG_TX_CPU_IDX /
> REG_TX_DMA_IDX).
>
> When the device is later reopened in airoha_dev_open(),
> GLOBAL_CFG_TX_DMA_EN_MASK is set again without re-initializing the rings.
> Will the hardware DMA engine resume fetching from its previous index,
> processing stale descriptors whose backing memory was already unmapped
> and freed?
ack, I will work on it with a dedicated patch.
>
> [ ... ]
>
> > @@ -1851,10 +1856,22 @@ static void airoha_dev_get_stats64(struct net_device *netdev,
> > static int airoha_dev_change_mtu(struct net_device *netdev, int mtu)
> > {
> > struct airoha_gdm_dev *dev = netdev_priv(netdev);
> > + u32 cur_len, len = ETH_HLEN + mtu + ETH_FCS_LEN;
> > struct airoha_gdm_port *port = dev->port;
> > - u32 len = ETH_HLEN + mtu + ETH_FCS_LEN;
> > struct airoha_eth *eth = dev->eth;
> >
> > + cur_len = FIELD_GET(GDM_LONG_LEN_MASK,
> > + airoha_fe_rr(eth, REG_GDM_LEN_CFG(port->id)));
> > + if (len < cur_len) {
> > + u8 port_refcnt = atomic_read(&port->users);
> > +
> > + /* We can decrease the device MTU just if the GDM port is
> > + * not shared or if the other device is not running.
> > + */
> > + if (port_refcnt > 1 || (port_refcnt && !netif_running(netdev)))
> > + return -EBUSY;
> > + }
> > +
>
> Does this MTU decrease validation create a state trap on shared ports?
>
> If both eth0 and eth1 (sharing a port) are up, and a user increases eth0 MTU
> to 9000, the hardware MTU updates to 9000. If the user then tries to lower
> eth0 MTU back to 1500, len < cur_len evaluates to true. Because eth1 is also
> up (port_refcnt > 1), this returns -EBUSY.
>
> This would prevent either interface from decreasing its MTU without fully
> bringing down the sibling interface.
>
> Should the software netdev->mtu be decoupled from the hardware validation,
> allowing valid MTU changes in software while dynamically programming the
> hardware MTU to the maximum of all currently up interfaces on the shared
> port?
It seems more correct to me to avoid receiving in the stack packets bigger than the
configured 'SW MTU'.
Regards,
Lorenzo
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]
^ permalink raw reply
* Re: [PATCH net-next v3 00/12] net: airoha: Support multiple net_devices connected to the same GDM port
From: Lorenzo Bianconi @ 2026-04-10 10:39 UTC (permalink / raw)
To: Jakub Kicinski
Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Paolo Abeni,
Rob Herring, Krzysztof Kozlowski, Conor Dooley, Christian Marangi,
Benjamin Larsson, linux-arm-kernel, linux-mediatek, netdev,
devicetree, Xuegang Lu
In-Reply-To: <20260409195645.16c68979@kernel.org>
[-- Attachment #1: Type: text/plain, Size: 629 bytes --]
On Apr 09, Jakub Kicinski wrote:
> On Mon, 06 Apr 2026 12:34:05 +0200 Lorenzo Bianconi wrote:
> > EN7581 or AN7583 SoCs support connecting multiple external SerDes (e.g.
> > Ethernet or USB SerDes) to GDM3 or GDM4 ports via a hw arbiter that
> > manages the traffic in a TDM manner. As a result multiple net_devices can
> > connect to the same GDM{3,4} port and there is a theoretical "1:n"
> > relation between GDM ports and net_devices.
>
> Still waiting for the device tree review. I'm going to blindly send out
> the Sashiko review, please comment if any of it makes sense?
ack, I will do.
Regards,
Lorenzo
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]
^ permalink raw reply
* Re: [bug report] ipv4: icmp: fix null-ptr-deref in icmp_build_probe()
From: Fernando Fernandez Mancera @ 2026-04-10 10:51 UTC (permalink / raw)
To: Dan Carpenter, Yiqi Sun; +Cc: Simon Horman, netdev
In-Reply-To: <adjOCdCW1EpPl8lf@stanley.mountain>
On 4/10/26 12:16 PM, Dan Carpenter wrote:
> Hello Yiqi Sun,
>
> Commit fde29fd93493 ("ipv4: icmp: fix null-ptr-deref in
> icmp_build_probe()") from Apr 2, 2026 (linux-next), leads to the
> following Smatch static checker warning:
>
> net/ipv4/icmp.c:1351 icmp_build_probe()
> warn: 'dev' is not an error pointer
>
> net/ipv4/icmp.c
> 1341 #if IS_ENABLED(CONFIG_IPV6)
> 1342 case ICMP_AFI_IP6:
> 1343 if (iio->ident.addr.ctype3_hdr.addrlen != sizeof(struct in6_addr))
> 1344 goto send_mal_query;
> 1345 dev = ipv6_dev_find(net, &iio->ident.addr.ip_addr.ipv6_addr, dev);
> 1346
> 1347 /*
> 1348 * If IPv6 identifier lookup is unavailable, silently
> 1349 * discard the request instead of misreporting NO_IF.
> 1350 */
> --> 1351 if (IS_ERR(dev))
> 1352 return false;
>
> It looks like there were two patches that went in around the same
> time. Commit fde29fd93493 ("ipv4: icmp: fix null-ptr-deref in
> icmp_build_probe()") updated the checking for
> ipv6_stub->ipv6_dev_find() but d98adfbdd5c0 ("ipv4: drop ipv6_stub usage
> and use direct function calls") changed it to not return error pointers.
>
> This IS_ERR() check can be removed.
>
Yes, I thought it was going to happen during merging but I guess it
makes sense to do it on a separate patch.
I am sending a patch to net-next addressing this.
Thanks!
^ permalink raw reply
* Re: [PATCH net-next v11 03/14] net: Add lease info to queue-get response
From: Daniel Borkmann @ 2026-04-10 11:10 UTC (permalink / raw)
To: Jakub Kicinski
Cc: netdev, bpf, davem, razor, pabeni, willemb, sdf, john.fastabend,
martin.lau, jordan, maciej.fijalkowski, magnus.karlsson, dw, toke,
yangzhenze, wangdongdong.6
In-Reply-To: <20260409185105.721a6465@kernel.org>
On 4/10/26 3:51 AM, Jakub Kicinski wrote:
> On Thu, 9 Apr 2026 17:32:31 +0200 Daniel Borkmann wrote:
>>> I think the test has to be reworked but of the available options seems
>>> like merging it as is and following up quickly is the best. I've only
>>> set up the container testing in our CI yesterday anyway so there may
>>> be more things that need changing in the test as we gain experience :S
>>
>> No objections obviously if you want to land as-is with your refactor on
>> top.
>
> Done, please double check my work, there were some conflicts with net.
Looks good to me, thanks a lot for everything!
^ permalink raw reply
* Re: [Intel-wired-lan] [PATCH net v2 3/4] iavf: send MAC change request synchronously
From: Jose Ignacio Tornos Martinez @ 2026-04-10 11:12 UTC (permalink / raw)
To: przemyslaw.kitszel
Cc: anthony.l.nguyen, davem, edumazet, intel-wired-lan,
jacob.e.keller, jtornosm, kohei.enju, kuba, netdev, pabeni, poros,
stable
In-Reply-To: <89bfd605-1877-4d40-95e1-bfeae6624168@intel.com>
Hello Przemek,
Thank you for your comments.
I will try to include them in a next version.
Best regards
Jose Ignacio
^ permalink raw reply
* Re: [syzbot] [mptcp?] possible deadlock in mptcp_pm_mp_prio_send_ack
From: Matthieu Baerts @ 2026-04-10 11:13 UTC (permalink / raw)
To: syzbot
Cc: davem, edumazet, geliang, horms, kuba, linux-kernel, martineau,
mptcp, netdev, pabeni, syzkaller-bugs
In-Reply-To: <69d7de34.050a0220.3030df.0019.GAE@google.com>
Hello,
On 09/04/2026 19:13, syzbot wrote:
> Hello,
>
> syzbot found the following issue on:
>
> HEAD commit: 1caa871bb061 Merge branch 'net-stmmac-fix-tegra234-mgbe-cl..
> git tree: net
> console output: https://syzkaller.appspot.com/x/log.txt?x=11d74e06580000
> kernel config: https://syzkaller.appspot.com/x/.config?x=6754c86e8d9e4c91
> dashboard link: https://syzkaller.appspot.com/bug?extid=2204dbe6a049b3218db9
> compiler: Debian clang version 21.1.8 (++20251221033036+2078da43e25a-1~exp1~20251221153213.50), Debian LLD 21.1.8
>
> Unfortunately, I don't have any reproducer for this issue yet.
>
> Downloadable assets:
> disk image: https://storage.googleapis.com/syzbot-assets/014aae23b990/disk-1caa871b.raw.xz
> vmlinux: https://storage.googleapis.com/syzbot-assets/c574a710638c/vmlinux-1caa871b.xz
> kernel image: https://storage.googleapis.com/syzbot-assets/b29909f4efc4/bzImage-1caa871b.xz
>
> IMPORTANT: if you fix the issue, please add the following tag to the commit:
> Reported-by: syzbot+2204dbe6a049b3218db9@syzkaller.appspotmail.com
>
> netlink: 8 bytes leftover after parsing attributes in process `syz.2.2034'.
> netlink: 8 bytes leftover after parsing attributes in process `syz.2.2034'.
> ======================================================
> WARNING: possible circular locking dependency detected
> syzkaller #0 Not tainted
> ------------------------------------------------------
> syz.2.2034/13659 is trying to acquire lock:
> ffff888031173560 (k-sk_lock-AF_INET){+.+.}-{0:0}, at: mptcp_pm_mp_prio_send_ack+0xaf8/0xba0 net/mptcp/pm.c:296
>
> but task is already holding lock:
> ffff88807e300ea0 (sk_lock-AF_INET){+.+.}-{0:0}, at: lock_sock include/net/sock.h:1709 [inline]
> ffff88807e300ea0 (sk_lock-AF_INET){+.+.}-{0:0}, at: mptcp_pm_nl_set_flags_all net/mptcp/pm_kernel.c:1482 [inline]
> ffff88807e300ea0 (sk_lock-AF_INET){+.+.}-{0:0}, at: mptcp_pm_nl_set_flags+0x795/0xc90 net/mptcp/pm_kernel.c:1551
>
> which lock already depends on the new lock.
>
>
> the existing dependency chain (in reverse order) is:
>
> -> #7 (sk_lock-AF_INET){+.+.}-{0:0}:
> lock_sock_nested+0x48/0x100 net/core/sock.c:3780
> lock_sock include/net/sock.h:1709 [inline]
> inet_shutdown+0x6a/0x390 net/ipv4/af_inet.c:919
> nbd_mark_nsock_dead+0x2e9/0x560 drivers/block/nbd.c:318
If I'm not mistaken, it looks like this issue is also due to nbd
introducing a lockdep dependency between reclaim and af_socket, and this
is similar to a previous report:
#syz dup: [syzbot] [mptcp?] possible deadlock in mptcp_subflow_create_socket (2)
If that's not correct, please unduplicate it.
Cheers,
Matt
--
Sponsored by the NGI0 Core fund.
^ permalink raw reply
* Re: [PATCH nf] netfilter: nf_tables: use RCU-safe list primitives for basechain hook list
From: Pablo Neira Ayuso @ 2026-04-10 11:14 UTC (permalink / raw)
To: Florian Westphal
Cc: Weiming Shi, David S . Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Phil Sutter, Simon Horman, netfilter-devel, coreteam,
netdev, linux-kernel, Xiang Mei
In-Reply-To: <adjRiG_Bp3WpRYOz@strlen.de>
On Fri, Apr 10, 2026 at 12:31:36PM +0200, Florian Westphal wrote:
> Weiming Shi <bestswngs@gmail.com> wrote:
[...]
> > Replace list_move() in nft_delchain_hook() with list_del_rcu() plus an
> > intermediate pointer array, followed by synchronize_rcu() before the
> > deleted hooks' list pointers are reused to link them into the
> > transaction's private list. In the error paths, put hooks back with
> > list_add_tail_rcu() which is safe for concurrent RCU readers (they
> > either continue to the original successor or see the list head and
> > terminate the walk).
>
> I don't understand the existing code.
I am working on an alternative fix.
^ permalink raw reply
* [PATCH net 1/1] net/sched: act_ct: Only release RCU read lock after ct_ft
From: Jamal Hadi Salim @ 2026-04-10 11:16 UTC (permalink / raw)
To: netdev
Cc: davem, edumazet, kuba, pabeni, horms, jiri, zdi-disclosures,
security, Jamal Hadi Salim, Victor Nogueira
When looking up a flow table in act_ct in tcf_ct_flow_table_get(),
rhashtable_lookup_fast() internally opens and closes an RCU read critical
section before returning ct_ft.
The tcf_ct_flow_table_cleanup_work() can complete before refcount_inc_not_zero()
is invoked on the returned ct_ft resulting in a UAF on the already freed ct_ft
object. This vulnerability can lead to privilege escalation.
Analysis from zdi-disclosures@trendmicro.com:
When initializing act_ct, tcf_ct_init() is called, which internally triggers
tcf_ct_flow_table_get().
static int tcf_ct_flow_table_get(struct net *net, struct tcf_ct_params *params)
{
struct zones_ht_key key = { .net = net, .zone = params->zone };
struct tcf_ct_flow_table *ct_ft;
int err = -ENOMEM;
mutex_lock(&zones_mutex);
ct_ft = rhashtable_lookup_fast(&zones_ht, &key, zones_params); // [1]
if (ct_ft && refcount_inc_not_zero(&ct_ft->ref)) // [2]
goto out_unlock;
...
}
static __always_inline void *rhashtable_lookup_fast(
struct rhashtable *ht, const void *key,
const struct rhashtable_params params)
{
void *obj;
rcu_read_lock();
obj = rhashtable_lookup(ht, key, params);
rcu_read_unlock();
return obj;
}
At [1], rhashtable_lookup_fast() looks up and returns the corresponding ct_ft
from zones_ht. The lookup is performed within an RCU read critical section
through rcu_read_lock() / rcu_read_unlock(), which prevents the object from
being freed. However, at the point of function return, rcu_read_unlock() has
already been called, and there is nothing preventing ct_ft from being freed
before reaching refcount_inc_not_zero(&ct_ft->ref) at [2]. This interval becomes
the race window, during which ct_ft can be freed.
Free Process:
tcf_ct_flow_table_put() is executed through the path tcf_ct_cleanup() -> call_rcu() ->
tcf_ct_params_free_rcu() -> tcf_ct_params_free() -> tcf_ct_flow_table_put().
static void tcf_ct_flow_table_put(struct tcf_ct_flow_table *ct_ft)
{
if (refcount_dec_and_test(&ct_ft->ref)) {
rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work); // [3]
queue_rcu_work(act_ct_wq, &ct_ft->rwork);
}
}
At [3], tcf_ct_flow_table_cleanup_work() is scheduled as RCU work:
static void tcf_ct_flow_table_cleanup_work(struct work_struct *work)
{
struct tcf_ct_flow_table *ct_ft;
struct flow_block *block;
ct_ft = container_of(to_rcu_work(work), struct tcf_ct_flow_table,
rwork);
nf_flow_table_free(&ct_ft->nf_ft);
block = &ct_ft->nf_ft.flow_block;
down_write(&ct_ft->nf_ft.flow_block_lock);
WARN_ON(!list_empty(&block->cb_list));
up_write(&ct_ft->nf_ft.flow_block_lock);
kfree(ct_ft); // [4]
module_put(THIS_MODULE);
}
tcf_ct_flow_table_cleanup_work() frees ct_ft at [4]. When this function executes
between [1] and [2], UAF occurs.
This race condition has a very short race window, making it generally
difficult to trigger. Therefore, to trigger the vulnerability an msleep(100) was
inserted after [1].
Fixes: 138470a9b2cc2 ("net/sched: act_ct: fix lockdep splat in tcf_ct_flow_table_get")
Reported-by: zdi-disclosures@trendmicro.com
Tested-by: Victor Nogueira <victor@mojatatu.com>
Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
---
net/sched/act_ct.c | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 7d5e50c921a0..6158e13c98d3 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -328,9 +328,13 @@ static int tcf_ct_flow_table_get(struct net *net, struct tcf_ct_params *params)
int err = -ENOMEM;
mutex_lock(&zones_mutex);
- ct_ft = rhashtable_lookup_fast(&zones_ht, &key, zones_params);
- if (ct_ft && refcount_inc_not_zero(&ct_ft->ref))
+ rcu_read_lock();
+ ct_ft = rhashtable_lookup(&zones_ht, &key, zones_params);
+ if (ct_ft && refcount_inc_not_zero(&ct_ft->ref)) {
+ rcu_read_unlock();
goto out_unlock;
+ }
+ rcu_read_unlock();
ct_ft = kzalloc_obj(*ct_ft);
if (!ct_ft)
--
2.34.1
^ permalink raw reply related
* Re: [bug report] ipv4: icmp: fix null-ptr-deref in icmp_build_probe()
From: Fernando Fernandez Mancera @ 2026-04-10 11:19 UTC (permalink / raw)
To: Dan Carpenter, Yiqi Sun; +Cc: Simon Horman, netdev
In-Reply-To: <00cba68f-2e37-4ad9-872b-cc41a113de00@suse.de>
On 4/10/26 12:51 PM, Fernando Fernandez Mancera wrote:
> On 4/10/26 12:16 PM, Dan Carpenter wrote:
>> Hello Yiqi Sun,
>>
>> Commit fde29fd93493 ("ipv4: icmp: fix null-ptr-deref in
>> icmp_build_probe()") from Apr 2, 2026 (linux-next), leads to the
>> following Smatch static checker warning:
>>
>> net/ipv4/icmp.c:1351 icmp_build_probe()
>> warn: 'dev' is not an error pointer
>>
>> net/ipv4/icmp.c
>> 1341 #if IS_ENABLED(CONFIG_IPV6)
>> 1342 case ICMP_AFI_IP6:
>> 1343 if (iio-
>> >ident.addr.ctype3_hdr.addrlen != sizeof(struct in6_addr))
>> 1344 goto send_mal_query;
>> 1345 dev = ipv6_dev_find(net, &iio-
>> >ident.addr.ip_addr.ipv6_addr, dev);
>> 1346
>> 1347 /*
>> 1348 * If IPv6 identifier lookup is
>> unavailable, silently
>> 1349 * discard the request instead of
>> misreporting NO_IF.
>> 1350 */
>> --> 1351 if (IS_ERR(dev))
>> 1352 return false;
>>
>> It looks like there were two patches that went in around the same
>> time. Commit fde29fd93493 ("ipv4: icmp: fix null-ptr-deref in
>> icmp_build_probe()") updated the checking for
>> ipv6_stub->ipv6_dev_find() but d98adfbdd5c0 ("ipv4: drop ipv6_stub usage
>> and use direct function calls") changed it to not return error pointers.
>>
>> This IS_ERR() check can be removed.
>>
>
> Yes, I thought it was going to happen during merging but I guess it
> makes sense to do it on a separate patch.
>
Actually, I believe this has been handled during the net merge with
net-next.
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=b6e39e48469e37057fce27a1b87cf6d3e456aa42
It should reach linux-next, so all good.
Thanks,
Fernando.
^ permalink raw reply
* [PATCH net-next 00/11] netfilter: updates for net-next
From: Florian Westphal @ 2026-04-10 11:23 UTC (permalink / raw)
To: netdev
Cc: Paolo Abeni, David S. Miller, Eric Dumazet, Jakub Kicinski,
netfilter-devel, pablo
Hi,
The following patchset contains Netfilter updates for *net-next*:
1-3) IPVS updates from Julian Anastasov to enhance visibility into
IPVS internal state by exposing hash size, load factor etc and
allows userspace to tune the load factor used for resizing hash
tables.
4) reject empty/not nul terminated device names from xt_physdev.
This isn't a bug fix; existing code doesn't require a c-string.
But clean this up anyway because conceptually the interface name
definitely should be a c-string.
5) Switch nfnetlink to skb_mac_header helpers that didn't exist back
when this code was written. This gives us additional debug checks
but is not intended to change functionality.
6) Let the xt ttl/hoplimit match reject unknown operator modes.
This is a cleanup, the evaluation function simply returns false when
the mode is out of range. From Marino Dzalto.
7) xt_socket match should enable defrag after all other checks. This
bug is harmless, historically defrag could not be disabled either
except by rmmod.
8) remove UDP-Lite conntrack support, from Fernando Fernandez Mancera.
9) Avoid a couple -Wflex-array-member-not-at-end warnings in the old
xtables 32bit compat code, from Gustavo A. R. Silva.
10) nftables fwd expression should drop packets when their ttl/hl has
expired. This is a deferred bug fix; it's not deemed important
enough for -rc8.
11) Add additional checks before assuming the mac header is an ethernet
header, from Zhengchuan Liang.
Please, pull these changes from:
The following changes since commit 42f9b4c6ef19e71d2c7d9bfd3c5037d4fe434ad7:
tools: ynl: tests: fix leading space on Makefile target (2026-04-09 20:41:40 -0700)
are available in the Git repository at:
https://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next.git tags/nf-next-26-04-10
for you to fetch changes up to 62443dc21114c0bbc476fa62973db89743f2f137:
netfilter: require Ethernet MAC header before using eth_hdr() (2026-04-10 12:16:27 +0200)
----------------------------------------------------------------
netfilter pull request nf-next-26-04-10
----------------------------------------------------------------
Fernando Fernandez Mancera (1):
netfilter: conntrack: remove UDP-Lite conntrack support
Florian Westphal (4):
netfilter: x_physdev: reject empty or not-nul terminated device names
netfilter: nfnetlink: prefer skb_mac_header helpers
netfilter: xt_socket: enable defrag after all other checks
netfilter: nft_fwd_netdev: check ttl/hl before forwarding
Gustavo A. R. Silva (1):
netfilter: x_tables: Avoid a couple -Wflex-array-member-not-at-end warnings
Julian Anastasov (3):
ipvs: show the current conn_tab size to users
ipvs: add ip_vs_status info
ipvs: add conn_lfactor and svc_lfactor sysctl vars
Marino Dzalto (1):
netfilter: xt_HL: add pr_fmt and checkentry validation
Zhengchuan Liang (1):
netfilter: require Ethernet MAC header before using eth_hdr()
Documentation/networking/ipvs-sysctl.rst | 37 +++
.../net/netfilter/ipv4/nf_conntrack_ipv4.h | 3 -
include/net/netfilter/nf_conntrack_l4proto.h | 7 -
net/ipv6/netfilter/ip6t_eui64.c | 7 +-
net/netfilter/Kconfig | 11 -
net/netfilter/ipset/ip_set_bitmap_ipmac.c | 5 +-
net/netfilter/ipset/ip_set_hash_ipmac.c | 9 +-
net/netfilter/ipset/ip_set_hash_mac.c | 5 +-
net/netfilter/ipvs/ip_vs_ctl.c | 247 +++++++++++++++++-
net/netfilter/nf_conntrack_core.c | 8 -
net/netfilter/nf_conntrack_proto.c | 3 -
net/netfilter/nf_conntrack_proto_udp.c | 108 --------
net/netfilter/nf_conntrack_standalone.c | 2 -
net/netfilter/nf_log_syslog.c | 8 +-
net/netfilter/nf_nat_core.c | 6 -
net/netfilter/nf_nat_proto.c | 20 --
net/netfilter/nfnetlink_cttimeout.c | 1 -
net/netfilter/nfnetlink_log.c | 19 +-
net/netfilter/nfnetlink_queue.c | 25 +-
net/netfilter/nft_ct.c | 1 -
net/netfilter/nft_fwd_netdev.c | 10 +
net/netfilter/x_tables.c | 12 +-
net/netfilter/xt_hl.c | 27 ++
net/netfilter/xt_mac.c | 4 +-
net/netfilter/xt_physdev.c | 22 ++
net/netfilter/xt_socket.c | 23 +-
26 files changed, 399 insertions(+), 231 deletions(-)
--
2.52.0
^ permalink raw reply
* [PATCH net-next 01/11] ipvs: show the current conn_tab size to users
From: Florian Westphal @ 2026-04-10 11:23 UTC (permalink / raw)
To: netdev
Cc: Paolo Abeni, David S. Miller, Eric Dumazet, Jakub Kicinski,
netfilter-devel, pablo
In-Reply-To: <20260410112352.23599-1-fw@strlen.de>
From: Julian Anastasov <ja@ssi.bg>
As conn_tab is per-net, better to show the current hash table size
to users instead of the ip_vs_conn_tab_size (max).
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
net/netfilter/ipvs/ip_vs_ctl.c | 26 ++++++++++++++++++++++----
1 file changed, 22 insertions(+), 4 deletions(-)
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index a1f070cb76c3..1322dd54ed7c 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -281,6 +281,20 @@ static void est_reload_work_handler(struct work_struct *work)
mutex_unlock(&ipvs->est_mutex);
}
+static int get_conn_tab_size(struct netns_ipvs *ipvs)
+{
+ const struct ip_vs_rht *t;
+ int size = 0;
+
+ rcu_read_lock();
+ t = rcu_dereference(ipvs->conn_tab);
+ if (t)
+ size = t->size;
+ rcu_read_unlock();
+
+ return size;
+}
+
int
ip_vs_use_count_inc(void)
{
@@ -2741,10 +2755,13 @@ static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
{
+ struct net *net = seq_file_net(seq);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
if (v == SEQ_START_TOKEN) {
seq_printf(seq,
"IP Virtual Server version %d.%d.%d (size=%d)\n",
- NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
+ NVERSION(IP_VS_VERSION_CODE), get_conn_tab_size(ipvs));
seq_puts(seq,
"Prot LocalAddress:Port Scheduler Flags\n");
seq_puts(seq,
@@ -3425,7 +3442,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
char buf[64];
sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
- NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
+ NVERSION(IP_VS_VERSION_CODE), get_conn_tab_size(ipvs));
if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
ret = -EFAULT;
goto out;
@@ -3437,8 +3454,9 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
case IP_VS_SO_GET_INFO:
{
struct ip_vs_getinfo info;
+
info.version = IP_VS_VERSION_CODE;
- info.size = ip_vs_conn_tab_size;
+ info.size = get_conn_tab_size(ipvs);
info.num_services =
atomic_read(&ipvs->num_services[IP_VS_AF_INET]);
if (copy_to_user(user, &info, sizeof(info)) != 0)
@@ -4447,7 +4465,7 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION,
IP_VS_VERSION_CODE) ||
nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
- ip_vs_conn_tab_size))
+ get_conn_tab_size(ipvs)))
goto nla_put_failure;
break;
}
--
2.52.0
^ permalink raw reply related
* [PATCH net-next 02/11] ipvs: add ip_vs_status info
From: Florian Westphal @ 2026-04-10 11:23 UTC (permalink / raw)
To: netdev
Cc: Paolo Abeni, David S. Miller, Eric Dumazet, Jakub Kicinski,
netfilter-devel, pablo
In-Reply-To: <20260410112352.23599-1-fw@strlen.de>
From: Julian Anastasov <ja@ssi.bg>
Add /proc/net/ip_vs_status to show current state of IPVS.
The motivation for this new /proc interface is to provide the output
for the users to help them decide when to tune the load factor for
hash tables, which is possible with the new sysctl knobs coming in
followup patch.
The output also includes information for the kthreads used for stats.
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
net/netfilter/ipvs/ip_vs_ctl.c | 145 +++++++++++++++++++++++++++++++++
1 file changed, 145 insertions(+)
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 1322dd54ed7c..fb1df61edfdd 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -2924,6 +2924,144 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
return 0;
}
+
+static int ip_vs_status_show(struct seq_file *seq, void *v)
+{
+ struct net *net = seq_file_single_net(seq);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ unsigned int resched_score = 0;
+ struct ip_vs_conn_hnode *hn;
+ struct hlist_bl_head *head;
+ struct ip_vs_service *svc;
+ struct ip_vs_rht *t, *pt;
+ struct hlist_bl_node *e;
+ int old_gen, new_gen;
+ u32 counts[8];
+ u32 bucket;
+ int count;
+ u32 sum1;
+ u32 sum;
+ int i;
+
+ rcu_read_lock();
+
+ t = rcu_dereference(ipvs->conn_tab);
+
+ seq_printf(seq, "Conns:\t%d\n", atomic_read(&ipvs->conn_count));
+ seq_printf(seq, "Conn buckets:\t%d (%d bits, lfactor %d)\n",
+ t ? t->size : 0, t ? t->bits : 0, t ? t->lfactor : 0);
+
+ if (!atomic_read(&ipvs->conn_count))
+ goto after_conns;
+ old_gen = atomic_read(&ipvs->conn_tab_changes);
+
+repeat_conn:
+ smp_rmb(); /* ipvs->conn_tab and conn_tab_changes */
+ memset(counts, 0, sizeof(counts));
+ ip_vs_rht_for_each_table_rcu(ipvs->conn_tab, t, pt) {
+ for (bucket = 0; bucket < t->size; bucket++) {
+ DECLARE_IP_VS_RHT_WALK_BUCKET_RCU();
+
+ count = 0;
+ resched_score++;
+ ip_vs_rht_walk_bucket_rcu(t, bucket, head) {
+ count = 0;
+ hlist_bl_for_each_entry_rcu(hn, e, head, node)
+ count++;
+ }
+ resched_score += count;
+ if (resched_score >= 100) {
+ resched_score = 0;
+ cond_resched_rcu();
+ new_gen = atomic_read(&ipvs->conn_tab_changes);
+ /* New table installed ? */
+ if (old_gen != new_gen) {
+ old_gen = new_gen;
+ goto repeat_conn;
+ }
+ }
+ counts[min(count, (int)ARRAY_SIZE(counts) - 1)]++;
+ }
+ }
+ for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++)
+ sum += counts[i];
+ sum1 = sum - counts[0];
+ seq_printf(seq, "Conn buckets empty:\t%u (%lu%%)\n",
+ counts[0], (unsigned long)counts[0] * 100 / max(sum, 1U));
+ for (i = 1; i < ARRAY_SIZE(counts); i++) {
+ if (!counts[i])
+ continue;
+ seq_printf(seq, "Conn buckets len-%d:\t%u (%lu%%)\n",
+ i, counts[i],
+ (unsigned long)counts[i] * 100 / max(sum1, 1U));
+ }
+
+after_conns:
+ t = rcu_dereference(ipvs->svc_table);
+
+ count = ip_vs_get_num_services(ipvs);
+ seq_printf(seq, "Services:\t%d\n", count);
+ seq_printf(seq, "Service buckets:\t%d (%d bits, lfactor %d)\n",
+ t ? t->size : 0, t ? t->bits : 0, t ? t->lfactor : 0);
+
+ if (!count)
+ goto after_svc;
+ old_gen = atomic_read(&ipvs->svc_table_changes);
+
+repeat_svc:
+ smp_rmb(); /* ipvs->svc_table and svc_table_changes */
+ memset(counts, 0, sizeof(counts));
+ ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, pt) {
+ for (bucket = 0; bucket < t->size; bucket++) {
+ DECLARE_IP_VS_RHT_WALK_BUCKET_RCU();
+
+ count = 0;
+ resched_score++;
+ ip_vs_rht_walk_bucket_rcu(t, bucket, head) {
+ count = 0;
+ hlist_bl_for_each_entry_rcu(svc, e, head,
+ s_list)
+ count++;
+ }
+ resched_score += count;
+ if (resched_score >= 100) {
+ resched_score = 0;
+ cond_resched_rcu();
+ new_gen = atomic_read(&ipvs->svc_table_changes);
+ /* New table installed ? */
+ if (old_gen != new_gen) {
+ old_gen = new_gen;
+ goto repeat_svc;
+ }
+ }
+ counts[min(count, (int)ARRAY_SIZE(counts) - 1)]++;
+ }
+ }
+ for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++)
+ sum += counts[i];
+ sum1 = sum - counts[0];
+ seq_printf(seq, "Service buckets empty:\t%u (%lu%%)\n",
+ counts[0], (unsigned long)counts[0] * 100 / max(sum, 1U));
+ for (i = 1; i < ARRAY_SIZE(counts); i++) {
+ if (!counts[i])
+ continue;
+ seq_printf(seq, "Service buckets len-%d:\t%u (%lu%%)\n",
+ i, counts[i],
+ (unsigned long)counts[i] * 100 / max(sum1, 1U));
+ }
+
+after_svc:
+ seq_printf(seq, "Stats thread slots:\t%d (max %lu)\n",
+ ipvs->est_kt_count, ipvs->est_max_threads);
+ seq_printf(seq, "Stats chain max len:\t%d\n", ipvs->est_chain_max);
+ seq_printf(seq, "Stats thread ests:\t%d\n",
+ ipvs->est_chain_max * IPVS_EST_CHAIN_FACTOR *
+ IPVS_EST_NTICKS);
+
+ rcu_read_unlock();
+ return 0;
+}
+
#endif
/*
@@ -4825,6 +4963,9 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
ipvs->net->proc_net,
ip_vs_stats_percpu_show, NULL))
goto err_percpu;
+ if (!proc_create_net_single("ip_vs_status", 0, ipvs->net->proc_net,
+ ip_vs_status_show, NULL))
+ goto err_status;
#endif
ret = ip_vs_control_net_init_sysctl(ipvs);
@@ -4835,6 +4976,9 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
err:
#ifdef CONFIG_PROC_FS
+ remove_proc_entry("ip_vs_status", ipvs->net->proc_net);
+
+err_status:
remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net);
err_percpu:
@@ -4860,6 +5004,7 @@ void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs)
ip_vs_control_net_cleanup_sysctl(ipvs);
cancel_delayed_work_sync(&ipvs->est_reload_work);
#ifdef CONFIG_PROC_FS
+ remove_proc_entry("ip_vs_status", ipvs->net->proc_net);
remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net);
remove_proc_entry("ip_vs_stats", ipvs->net->proc_net);
remove_proc_entry("ip_vs", ipvs->net->proc_net);
--
2.52.0
^ permalink raw reply related
* [PATCH net-next 03/11] ipvs: add conn_lfactor and svc_lfactor sysctl vars
From: Florian Westphal @ 2026-04-10 11:23 UTC (permalink / raw)
To: netdev
Cc: Paolo Abeni, David S. Miller, Eric Dumazet, Jakub Kicinski,
netfilter-devel, pablo
In-Reply-To: <20260410112352.23599-1-fw@strlen.de>
From: Julian Anastasov <ja@ssi.bg>
Allow the default load factor for the connection and service tables
to be configured.
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
Documentation/networking/ipvs-sysctl.rst | 37 ++++++++++++
net/netfilter/ipvs/ip_vs_ctl.c | 76 ++++++++++++++++++++++++
2 files changed, 113 insertions(+)
diff --git a/Documentation/networking/ipvs-sysctl.rst b/Documentation/networking/ipvs-sysctl.rst
index 3fb5fa142eef..a556439f8be7 100644
--- a/Documentation/networking/ipvs-sysctl.rst
+++ b/Documentation/networking/ipvs-sysctl.rst
@@ -29,6 +29,33 @@ backup_only - BOOLEAN
If set, disable the director function while the server is
in backup mode to avoid packet loops for DR/TUN methods.
+conn_lfactor - INTEGER
+ Possible values: -8 (larger table) .. 8 (smaller table)
+
+ Default: -4
+
+ Controls the sizing of the connection hash table based on the
+ load factor (number of connections per table bucket):
+
+ 2^conn_lfactor = nodes / buckets
+
+ As a result, the table grows if load increases and shrinks when
+ load decreases in the range of 2^8 - 2^conn_tab_bits (module
+ parameter).
+ The value is a shift count where negative values select
+ buckets = (connection hash nodes << -value) while positive
+ values select buckets = (connection hash nodes >> value).
+ Negative values reduce collisions and the time for lookups
+ but increase the table size. Positive values tolerate load
+ above 100% when a smaller table is preferred, at the cost of
+ more collisions. If using NAT connections, consider
+ decreasing the value by one because they add two nodes to
+ the hash table.
+
+ Example:
+ -4: grow if load goes above 6% (buckets = nodes * 16)
+ 2: grow if load goes above 400% (buckets = nodes / 4)
+
conn_reuse_mode - INTEGER
1 - default
@@ -219,6 +246,16 @@ secure_tcp - INTEGER
The value definition is the same as that of drop_entry and
drop_packet.
+svc_lfactor - INTEGER
+ Possible values: -8 (larger table) .. 8 (smaller table)
+
+ Default: -3
+
+ Controls the sizing of the service hash table based on the
+ load factor (number of services per table bucket). The table
+ will grow and shrink in the range of 2^4 - 2^20.
+ See conn_lfactor for explanation.
+
sync_threshold - vector of 2 INTEGERs: sync_threshold, sync_period
default 3 50
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index fb1df61edfdd..6632daa87ded 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -2445,6 +2445,60 @@ static int ipvs_proc_run_estimation(const struct ctl_table *table, int write,
return ret;
}
+static int ipvs_proc_conn_lfactor(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{ /* proc handler for the "conn_lfactor" sysctl: range-checks a written value before storing it */
+ struct netns_ipvs *ipvs = table->extra2; /* extra2 is set to the owning netns_ipvs at sysctl init */
+ int *valp = table->data; /* the stored per-netns setting */
+ int val = *valp; /* local copy, so a rejected write never reaches *valp */
+ int ret;
+
+ struct ctl_table tmp_table = { /* temporary table that points proc_dointvec() at the local copy */
+ .data = &val,
+ .maxlen = sizeof(int),
+ };
+
+ ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); /* reads or writes val, not *valp */
+ if (write && ret >= 0) {
+ if (val < -8 || val > 8) { /* only -8..8 is accepted */
+ ret = -EINVAL;
+ } else {
+ *valp = val; /* commit the validated value */
+ if (rcu_access_pointer(ipvs->conn_tab)) /* only when a connection table pointer is set */
+ mod_delayed_work(system_unbound_wq, /* (re)queue the resize work with zero delay */
+ &ipvs->conn_resize_work, 0);
+ }
+ }
+ return ret; /* proc_dointvec() result, or -EINVAL for an out-of-range write */
+}
+
+static int ipvs_proc_svc_lfactor(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{ /* proc handler for the "svc_lfactor" sysctl: range-checks a written value before storing it */
+ struct netns_ipvs *ipvs = table->extra2; /* extra2 is set to the owning netns_ipvs at sysctl init */
+ int *valp = table->data; /* the stored per-netns setting */
+ int val = *valp; /* local copy, so a rejected write never reaches *valp */
+ int ret;
+
+ struct ctl_table tmp_table = { /* temporary table that points proc_dointvec() at the local copy */
+ .data = &val,
+ .maxlen = sizeof(int),
+ };
+
+ ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); /* reads or writes val, not *valp */
+ if (write && ret >= 0) {
+ if (val < -8 || val > 8) { /* only -8..8 is accepted */
+ ret = -EINVAL;
+ } else {
+ *valp = val; /* commit the validated value */
+ if (rcu_access_pointer(ipvs->svc_table)) /* only when a service table pointer is set */
+ mod_delayed_work(system_unbound_wq, /* (re)queue the resize work with zero delay */
+ &ipvs->svc_resize_work, 0);
+ }
+ }
+ return ret; /* proc_dointvec() result, or -EINVAL for an out-of-range write */
+}
+
/*
* IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
* Do not change order or insert new entries without
@@ -2633,6 +2687,18 @@ static struct ctl_table vs_vars[] = {
.mode = 0644,
.proc_handler = ipvs_proc_est_nice,
},
+ {
+ .procname = "conn_lfactor",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = ipvs_proc_conn_lfactor,
+ },
+ {
+ .procname = "svc_lfactor",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = ipvs_proc_svc_lfactor,
+ },
#ifdef CONFIG_IP_VS_DEBUG
{
.procname = "debug_level",
@@ -4853,6 +4919,16 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
tbl[idx].extra2 = ipvs;
tbl[idx++].data = &ipvs->sysctl_est_nice;
+ if (unpriv)
+ tbl[idx].mode = 0444;
+ tbl[idx].extra2 = ipvs;
+ tbl[idx++].data = &ipvs->sysctl_conn_lfactor;
+
+ if (unpriv)
+ tbl[idx].mode = 0444;
+ tbl[idx].extra2 = ipvs;
+ tbl[idx++].data = &ipvs->sysctl_svc_lfactor;
+
#ifdef CONFIG_IP_VS_DEBUG
/* Global sysctls must be ro in non-init netns */
if (!net_eq(net, &init_net))
--
2.52.0
^ permalink raw reply related
* [PATCH net-next 04/11] netfilter: x_physdev: reject empty or not-nul terminated device names
From: Florian Westphal @ 2026-04-10 11:23 UTC (permalink / raw)
To: netdev
Cc: Paolo Abeni, David S. Miller, Eric Dumazet, Jakub Kicinski,
netfilter-devel, pablo
In-Reply-To: <20260410112352.23599-1-fw@strlen.de>
Reject names that lack a terminating \0 character and reject the empty
string as well. iptables allows this, but it fails to re-parse
iptables-save output that contains such rules.
Signed-off-by: Florian Westphal <fw@strlen.de>
---
net/netfilter/xt_physdev.c | 22 ++++++++++++++++++++++
1 file changed, 22 insertions(+)
diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c
index 343e65f377d4..53997771013f 100644
--- a/net/netfilter/xt_physdev.c
+++ b/net/netfilter/xt_physdev.c
@@ -107,6 +107,28 @@ static int physdev_mt_check(const struct xt_mtchk_param *par)
return -EINVAL;
}
+#define X(memb) strnlen(info->memb, sizeof(info->memb)) >= sizeof(info->memb)
+ if (info->bitmask & XT_PHYSDEV_OP_IN) {
+ if (info->physindev[0] == '\0')
+ return -EINVAL;
+ if (X(physindev))
+ return -ENAMETOOLONG;
+ }
+
+ if (info->bitmask & XT_PHYSDEV_OP_OUT) {
+ if (info->physoutdev[0] == '\0')
+ return -EINVAL;
+
+ if (X(physoutdev))
+ return -ENAMETOOLONG;
+ }
+
+ if (X(in_mask))
+ return -ENAMETOOLONG;
+ if (X(out_mask))
+ return -ENAMETOOLONG;
+#undef X
+
if (!brnf_probed) {
brnf_probed = true;
request_module("br_netfilter");
--
2.52.0
^ permalink raw reply related
* [PATCH net-next 05/11] netfilter: nfnetlink: prefer skb_mac_header helpers
From: Florian Westphal @ 2026-04-10 11:23 UTC (permalink / raw)
To: netdev
Cc: Paolo Abeni, David S. Miller, Eric Dumazet, Jakub Kicinski,
netfilter-devel, pablo
In-Reply-To: <20260410112352.23599-1-fw@strlen.de>
This adds an implicit DEBUG_NET_WARN_ON_ONCE for debug configurations.
No other changes intended.
Signed-off-by: Florian Westphal <fw@strlen.de>
---
net/netfilter/nfnetlink_log.c | 19 ++++++++++---------
net/netfilter/nfnetlink_queue.c | 25 ++++++++++++-------------
2 files changed, 22 insertions(+), 22 deletions(-)
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index b2c24cb919d4..2439cbbd5b26 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -401,7 +401,7 @@ nfulnl_timer(struct timer_list *t)
static u32 nfulnl_get_bridge_size(const struct sk_buff *skb)
{
- u32 size = 0;
+ u32 mac_len, size = 0;
if (!skb_mac_header_was_set(skb))
return 0;
@@ -412,14 +412,17 @@ static u32 nfulnl_get_bridge_size(const struct sk_buff *skb)
size += nla_total_size(sizeof(u16)); /* tag */
}
- if (skb->network_header > skb->mac_header)
- size += nla_total_size(skb->network_header - skb->mac_header);
+ mac_len = skb_mac_header_len(skb);
+ if (mac_len > 0)
+ size += nla_total_size(mac_len);
return size;
}
static int nfulnl_put_bridge(struct nfulnl_instance *inst, const struct sk_buff *skb)
{
+ u32 mac_len;
+
if (!skb_mac_header_was_set(skb))
return 0;
@@ -437,12 +440,10 @@ static int nfulnl_put_bridge(struct nfulnl_instance *inst, const struct sk_buff
nla_nest_end(inst->skb, nest);
}
- if (skb->mac_header < skb->network_header) {
- int len = (int)(skb->network_header - skb->mac_header);
-
- if (nla_put(inst->skb, NFULA_L2HDR, len, skb_mac_header(skb)))
- goto nla_put_failure;
- }
+ mac_len = skb_mac_header_len(skb);
+ if (mac_len > 0 &&
+ nla_put(inst->skb, NFULA_L2HDR, mac_len, skb_mac_header(skb)))
+ goto nla_put_failure;
return 0;
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index c7ee6f6ff725..58304fd1f70f 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -579,6 +579,7 @@ static u32 nfqnl_get_bridge_size(struct nf_queue_entry *entry)
{
struct sk_buff *entskb = entry->skb;
u32 nlalen = 0;
+ u32 mac_len;
if (entry->state.pf != PF_BRIDGE || !skb_mac_header_was_set(entskb))
return 0;
@@ -587,9 +588,9 @@ static u32 nfqnl_get_bridge_size(struct nf_queue_entry *entry)
nlalen += nla_total_size(nla_total_size(sizeof(__be16)) +
nla_total_size(sizeof(__be16)));
- if (entskb->network_header > entskb->mac_header)
- nlalen += nla_total_size((entskb->network_header -
- entskb->mac_header));
+ mac_len = skb_mac_header_len(entskb);
+ if (mac_len > 0)
+ nlalen += nla_total_size(mac_len);
return nlalen;
}
@@ -597,6 +598,7 @@ static u32 nfqnl_get_bridge_size(struct nf_queue_entry *entry)
static int nfqnl_put_bridge(struct nf_queue_entry *entry, struct sk_buff *skb)
{
struct sk_buff *entskb = entry->skb;
+ u32 mac_len;
if (entry->state.pf != PF_BRIDGE || !skb_mac_header_was_set(entskb))
return 0;
@@ -615,12 +617,10 @@ static int nfqnl_put_bridge(struct nf_queue_entry *entry, struct sk_buff *skb)
nla_nest_end(skb, nest);
}
- if (entskb->mac_header < entskb->network_header) {
- int len = (int)(entskb->network_header - entskb->mac_header);
-
- if (nla_put(skb, NFQA_L2HDR, len, skb_mac_header(entskb)))
- goto nla_put_failure;
- }
+ mac_len = skb_mac_header_len(entskb);
+ if (mac_len > 0 &&
+ nla_put(skb, NFQA_L2HDR, mac_len, skb_mac_header(entskb)))
+ goto nla_put_failure;
return 0;
@@ -1004,13 +1004,13 @@ nf_queue_entry_dup(struct nf_queue_entry *e)
static void nf_bridge_adjust_skb_data(struct sk_buff *skb)
{
if (nf_bridge_info_get(skb))
- __skb_push(skb, skb->network_header - skb->mac_header);
+ __skb_push(skb, skb_mac_header_len(skb));
}
static void nf_bridge_adjust_segmented_data(struct sk_buff *skb)
{
if (nf_bridge_info_get(skb))
- __skb_pull(skb, skb->network_header - skb->mac_header);
+ __skb_pull(skb, skb_mac_header_len(skb));
}
#else
#define nf_bridge_adjust_skb_data(s) do {} while (0)
@@ -1469,8 +1469,7 @@ static int nfqa_parse_bridge(struct nf_queue_entry *entry,
}
if (nfqa[NFQA_L2HDR]) {
- int mac_header_len = entry->skb->network_header -
- entry->skb->mac_header;
+ u32 mac_header_len = skb_mac_header_len(entry->skb);
if (mac_header_len != nla_len(nfqa[NFQA_L2HDR]))
return -EINVAL;
--
2.52.0
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox