Netdev List
 help / color / mirror / Atom feed
* [PATCH 6.6.y] net: sched: fix TCF_LAYER_TRANSPORT handling in tcf_get_base_ptr()
From: Chelsy Ratnawat @ 2026-04-15 22:19 UTC (permalink / raw)
  To: stable
  Cc: jhs, jiri, davem, edumazet, kuba, netdev,
	syzbot+f3a497f02c389d86ef16, Chelsy Ratnawat

From: Eric Dumazet <edumazet@google.com>

[Upstream commit 4fe5a00ec70717a7f1002d8913ec6143582b3c8e]

syzbot reported that tcf_get_base_ptr() can be called while transport
header is not set [1].

Instead of returning a dangling pointer, return NULL.

Fix tcf_get_base_ptr() callers to handle this NULL value.

[1]
 WARNING: CPU: 1 PID: 6019 at ./include/linux/skbuff.h:3071 skb_transport_header include/linux/skbuff.h:3071 [inline]
 WARNING: CPU: 1 PID: 6019 at ./include/linux/skbuff.h:3071 tcf_get_base_ptr include/net/pkt_cls.h:539 [inline]
 WARNING: CPU: 1 PID: 6019 at ./include/linux/skbuff.h:3071 em_nbyte_match+0x2d8/0x3f0 net/sched/em_nbyte.c:43
Modules linked in:
CPU: 1 UID: 0 PID: 6019 Comm: syz.0.17 Not tainted syzkaller #0 PREEMPT(full)
Call Trace:
 <TASK>
  tcf_em_match net/sched/ematch.c:494 [inline]
  __tcf_em_tree_match+0x1ac/0x770 net/sched/ematch.c:520
  tcf_em_tree_match include/net/pkt_cls.h:512 [inline]
  basic_classify+0x115/0x2d0 net/sched/cls_basic.c:50
  tc_classify include/net/tc_wrapper.h:197 [inline]
  __tcf_classify net/sched/cls_api.c:1764 [inline]
  tcf_classify+0x4cf/0x1140 net/sched/cls_api.c:1860
  multiq_classify net/sched/sch_multiq.c:39 [inline]
  multiq_enqueue+0xfd/0x4c0 net/sched/sch_multiq.c:66
  dev_qdisc_enqueue+0x4e/0x260 net/core/dev.c:4118
  __dev_xmit_skb net/core/dev.c:4214 [inline]
  __dev_queue_xmit+0xe83/0x3b50 net/core/dev.c:4729
  packet_snd net/packet/af_packet.c:3076 [inline]
  packet_sendmsg+0x3e33/0x5080 net/packet/af_packet.c:3108
  sock_sendmsg_nosec net/socket.c:727 [inline]
  __sock_sendmsg+0x21c/0x270 net/socket.c:742
  ____sys_sendmsg+0x505/0x830 net/socket.c:2630

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Reported-by: syzbot+f3a497f02c389d86ef16@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/netdev/6920855a.a70a0220.2ea503.0058.GAE@google.com/T/#u
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Jamal Hadi Salim <jhs@mojatatu.com>
Link: https://patch.msgid.link/20251121154100.1616228-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Chelsy Ratnawat <chelsyratnawat2001@gmail.com>
---
 include/net/pkt_cls.h |  2 ++
 net/sched/em_cmp.c    |  5 ++++-
 net/sched/em_nbyte.c  |  2 ++
 net/sched/em_text.c   | 11 +++++++++--
 4 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index f308e8268651..ccc1c698ed00 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -525,6 +525,8 @@ static inline unsigned char * tcf_get_base_ptr(struct sk_buff *skb, int layer)
 		case TCF_LAYER_NETWORK:
 			return skb_network_header(skb);
 		case TCF_LAYER_TRANSPORT:
+			if (!skb_transport_header_was_set(skb))
+				break;
 			return skb_transport_header(skb);
 	}
 
diff --git a/net/sched/em_cmp.c b/net/sched/em_cmp.c
index f17b049ea530..71ce113f2d08 100644
--- a/net/sched/em_cmp.c
+++ b/net/sched/em_cmp.c
@@ -22,9 +22,12 @@ static int em_cmp_match(struct sk_buff *skb, struct tcf_ematch *em,
 			struct tcf_pkt_info *info)
 {
 	struct tcf_em_cmp *cmp = (struct tcf_em_cmp *) em->data;
-	unsigned char *ptr = tcf_get_base_ptr(skb, cmp->layer) + cmp->off;
+	unsigned char *ptr = tcf_get_base_ptr(skb, cmp->layer);
 	u32 val = 0;
 
+	if (!ptr)
+		return 0;
+	ptr += cmp->off;
 	if (!tcf_valid_offset(skb, ptr, cmp->align))
 		return 0;
 
diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c
index a83b237cbeb0..2e3c1d58d456 100644
--- a/net/sched/em_nbyte.c
+++ b/net/sched/em_nbyte.c
@@ -42,6 +42,8 @@ static int em_nbyte_match(struct sk_buff *skb, struct tcf_ematch *em,
 	struct nbyte_data *nbyte = (struct nbyte_data *) em->data;
 	unsigned char *ptr = tcf_get_base_ptr(skb, nbyte->hdr.layer);
 
+	if (!ptr)
+		return 0;
 	ptr += nbyte->hdr.off;
 
 	if (!tcf_valid_offset(skb, ptr, nbyte->hdr.len))
diff --git a/net/sched/em_text.c b/net/sched/em_text.c
index f176afb70559..32aae8a9deda 100644
--- a/net/sched/em_text.c
+++ b/net/sched/em_text.c
@@ -29,12 +29,19 @@ static int em_text_match(struct sk_buff *skb, struct tcf_ematch *m,
 			 struct tcf_pkt_info *info)
 {
 	struct text_match *tm = EM_TEXT_PRIV(m);
+	unsigned char *ptr;
 	int from, to;
 
-	from = tcf_get_base_ptr(skb, tm->from_layer) - skb->data;
+	ptr = tcf_get_base_ptr(skb, tm->from_layer);
+	if (!ptr)
+		return 0;
+	from = ptr - skb->data;
 	from += tm->from_offset;
 
-	to = tcf_get_base_ptr(skb, tm->to_layer) - skb->data;
+	ptr = tcf_get_base_ptr(skb, tm->to_layer);
+	if (!ptr)
+		return 0;
+	to = ptr - skb->data;
 	to += tm->to_offset;
 
 	return skb_find_text(skb, from, to, tm->config) != UINT_MAX;
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH net v4] openvswitch: cap upcall PID array size and pre-size vport replies
From: Ilya Maximets @ 2026-04-15 22:41 UTC (permalink / raw)
  To: Weiming Shi, Aaron Conole, Eelco Chaudron, David S . Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni
  Cc: i.maximets, Simon Horman, netdev, dev, Xiang Mei
In-Reply-To: <20260415125121.110874-2-bestswngs@gmail.com>

On 4/15/26 2:51 PM, Weiming Shi wrote:
> The vport netlink reply helpers allocate a fixed-size skb with
> nlmsg_new(NLMSG_DEFAULT_SIZE, ...) but serialize the full upcall PID
> array via ovs_vport_get_upcall_portids().  Since
> ovs_vport_set_upcall_portids() accepts any non-zero multiple of
> sizeof(u32) with no upper bound, a CAP_NET_ADMIN user can install a PID
> array large enough to overflow the reply buffer, causing nla_put() to
> fail with -EMSGSIZE and hitting BUG_ON(err < 0).  On systems with
> unprivileged user namespaces enabled (e.g., Ubuntu default), this is
> reachable via unshare -Urn since OVS vport mutation operations use
> GENL_UNS_ADMIN_PERM.
> 
>  kernel BUG at net/openvswitch/datapath.c:2414!
>  Oops: invalid opcode: 0000 [#1] SMP KASAN NOPTI
>  CPU: 1 UID: 0 PID: 65 Comm: poc Not tainted 7.0.0-rc7-00195-geb216e422044 #1
>  RIP: 0010:ovs_vport_cmd_set+0x34c/0x400
>  Call Trace:
>   <TASK>
>   genl_family_rcv_msg_doit (net/netlink/genetlink.c:1116)
>   genl_rcv_msg (net/netlink/genetlink.c:1194)
>   netlink_rcv_skb (net/netlink/af_netlink.c:2550)
>   genl_rcv (net/netlink/genetlink.c:1219)
>   netlink_unicast (net/netlink/af_netlink.c:1344)
>   netlink_sendmsg (net/netlink/af_netlink.c:1894)
>   __sys_sendto (net/socket.c:2206)
>   __x64_sys_sendto (net/socket.c:2209)
>   do_syscall_64 (arch/x86/entry/syscall_64.c:63)
>   entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130)
>   </TASK>
>  Kernel panic - not syncing: Fatal exception
> 
> Reject attempts to set more PIDs than nr_cpu_ids in
> ovs_vport_set_upcall_portids(), and pre-compute the worst-case reply
> size in ovs_vport_cmd_msg_size() based on that bound, similar to the
> existing ovs_dp_cmd_msg_size().  nr_cpu_ids matches the cap already
> used by the per-CPU dispatch configuration on the datapath side
> (ovs_dp_cmd_fill_info() serialises at most nr_cpu_ids PIDs), so the
> two sides stay consistent.
> 
> Fixes: 5cd667b0a456 ("openvswitch: Allow each vport to have an array of 'port_id's.")
> Reported-by: Xiang Mei <xmei5@asu.edu>
> Signed-off-by: Weiming Shi <bestswngs@gmail.com>
> ---
> v4 (per Ilya):
> - Use nr_cpu_ids instead of num_possible_cpus() for consistency with
>   the per-CPU dispatch on the datapath side.
> - Annotate ovs_vport_cmd_msg_size() per-attribute; split nested sums.
> v3: Cap at num_possible_cpus(); add ovs_vport_cmd_msg_size(); keep
>     BUG_ON(); fix Fixes tag.
> v2: Dynamically size reply skb; drop WARN_ON_ONCE, return plain errors.

Please, don't re-name the patch for every version if there are no changes
that actually invalidate the name.  It was definitely not necessary in the
past few versions of this patch.  Could've even kept the original name
from v1, it was fine.  But please, keep the current v4 name in v5.

These renames are messing up version tracking.  Also, please, add lore links
into the changelog for previous versions, especially if you're renaming the
patch, so reviewers can find the older versions.

In case you're using AI to help with these patches (which would explain the
constant renaming), you should disclose that by adding an Assisted-by tag:
  https://docs.kernel.org/process/coding-assistants.html#attribution

> ---
>  net/openvswitch/datapath.c | 33 +++++++++++++++++++++++++++++++--
>  net/openvswitch/vport.c    |  3 +++
>  2 files changed, 34 insertions(+), 2 deletions(-)
> 
> diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
> index e209099218b4..35e67e51b0d2 100644
> --- a/net/openvswitch/datapath.c
> +++ b/net/openvswitch/datapath.c
> @@ -2184,9 +2184,38 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
>  	return err;
>  }
>  
> +static size_t ovs_vport_cmd_msg_size(void)
> +{
> +	size_t msgsize = NLMSG_ALIGN(sizeof(struct ovs_header));
> +
> +	msgsize += nla_total_size(sizeof(u32)); /* OVS_VPORT_ATTR_PORT_NO */
> +	msgsize += nla_total_size(sizeof(u32)); /* OVS_VPORT_ATTR_TYPE */
> +	msgsize += nla_total_size(IFNAMSIZ);    /* OVS_VPORT_ATTR_NAME */
> +	msgsize += nla_total_size(sizeof(u32)); /* OVS_VPORT_ATTR_IFINDEX */
> +	msgsize += nla_total_size(sizeof(s32)); /* OVS_VPORT_ATTR_NETNSID */

Add an empty line here, it's hard to read when comments are sandwiched
between the code.  Same for all the blocks below (empty line before the
comment line).

> +	/* OVS_VPORT_ATTR_STATS */
> +	msgsize += nla_total_size_64bit(sizeof(struct ovs_vport_stats));
> +	/* OVS_VPORT_ATTR_UPCALL_STATS(OVS_VPORT_UPCALL_ATTR_SUCCESS +
> +	 *                             OVS_VPORT_UPCALL_ATTR_FAIL)
> +	 */
> +	msgsize += nla_total_size(nla_total_size_64bit(sizeof(u64)) +
> +				  nla_total_size_64bit(sizeof(u64)));
> +	/* OVS_VPORT_ATTR_UPCALL_PID (capped at nr_cpu_ids by
> +	 * ovs_vport_set_upcall_portids())

The explanation inside the parentheses is not needed, IMO.

The rest seems fine to me.

Best regards, Ilya Maximets.

^ permalink raw reply

* [net,PATCH v4 1/2] net: ks8851: Reinstate disabling of BHs around IRQ handler
From: Marek Vasut @ 2026-04-15 23:09 UTC (permalink / raw)
  To: netdev
  Cc: Marek Vasut, Sebastian Andrzej Siewior, stable, David S. Miller,
	Andrew Lunn, Eric Dumazet, Jakub Kicinski, Nicolai Buchwitz,
	Paolo Abeni, Ronald Wahl, Yicong Hui, linux-kernel

If the driver executes ks8851_irq() AND a TX packet has been sent, then
the driver enables TX queue via netif_wake_queue() which schedules TX
softirq to queue packets for this device.

If CONFIG_PREEMPT_RT=y is set AND a packet has also been received by
the MAC, then ks8851_rx_pkts() calls netdev_alloc_skb_ip_align() to
allocate SKBs for the received packets. If netdev_alloc_skb_ip_align()
is called with BH enabled, then local_bh_enable() at the end of
netdev_alloc_skb_ip_align() will trigger the pending softirq processing,
which may ultimately call the .xmit callback ks8851_start_xmit_par().
The ks8851_start_xmit_par() will try to lock struct ks8851_net_par
.lock spinlock, which is already locked by ks8851_irq() from which
ks8851_start_xmit_par() was called. This leads to a deadlock, which
is reported by the kernel, including a trace listed below.

If CONFIG_PREEMPT_RT is not set, then since commit 0913ec336a6c0
("net: ks8851: Fix deadlock with the SPI chip variant") the deadlock
can also be triggered without received packet in the RX FIFO. The
pending softirqs will be processed on return from
spin_unlock_bh(&ks->statelock) in ks8851_irq(), which triggers the
deadlock as well.

Fix the problem by disabling BH around critical sections, including the
IRQ handler, thus preventing the net_tx_action() softirq from triggering
during these critical sections. The net_tx_action() softirq is triggered
once BH are re-enabled and at the end of the IRQ handler, once all the
other IRQ handler actions have been completed.

 __schedule from schedule_rtlock+0x1c/0x34
 schedule_rtlock from rtlock_slowlock_locked+0x548/0x904
 rtlock_slowlock_locked from rt_spin_lock+0x60/0x9c
 rt_spin_lock from ks8851_start_xmit_par+0x74/0x1a8
 ks8851_start_xmit_par from netdev_start_xmit+0x20/0x44
 netdev_start_xmit from dev_hard_start_xmit+0xd0/0x188
 dev_hard_start_xmit from sch_direct_xmit+0xb8/0x25c
 sch_direct_xmit from __qdisc_run+0x1f8/0x4ec
 __qdisc_run from qdisc_run+0x1c/0x28
 qdisc_run from net_tx_action+0x1f0/0x268
 net_tx_action from handle_softirqs+0x1a4/0x270
 handle_softirqs from __local_bh_enable_ip+0xcc/0xe0
 __local_bh_enable_ip from __alloc_skb+0xd8/0x128
 __alloc_skb from __netdev_alloc_skb+0x3c/0x19c
 __netdev_alloc_skb from ks8851_irq+0x388/0x4d4
 ks8851_irq from irq_thread_fn+0x24/0x64
 irq_thread_fn from irq_thread+0x178/0x28c
 irq_thread from kthread+0x12c/0x138
 kthread from ret_from_fork+0x14/0x28

Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Fixes: e0863634bf9f ("net: ks8851: Queue RX packets in IRQ handler instead of disabling BHs")
Cc: stable@vger.kernel.org
Signed-off-by: Marek Vasut <marex@nabladev.com>
---
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Andrew Lunn <andrew+netdev@lunn.ch>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Nicolai Buchwitz <nb@tipi-net.de>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Ronald Wahl <ronald.wahl@raritan.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Yicong Hui <yiconghui@gmail.com>
Cc: linux-kernel@vger.kernel.org
Cc: netdev@vger.kernel.org
---
V2: Register dedicated IRQ handler wrapper which disables BH for the
    parallel variant of the MAC, the variant which uses spinlocks as
    the locking primitive. Use stock IRQ handler with BH unchanged
    for the SPI variant, which uses mutexes as locking primitive.
V3: Switch to spin_lock_bh(), update commit message
V4: Extend the commit message
    Add RB from Sebastian
---
 drivers/net/ethernet/micrel/ks8851.h        |  6 +-
 drivers/net/ethernet/micrel/ks8851_common.c | 64 +++++++++------------
 drivers/net/ethernet/micrel/ks8851_par.c    | 15 ++---
 drivers/net/ethernet/micrel/ks8851_spi.c    | 11 ++--
 4 files changed, 38 insertions(+), 58 deletions(-)

diff --git a/drivers/net/ethernet/micrel/ks8851.h b/drivers/net/ethernet/micrel/ks8851.h
index 31f75b4a67fd7..b795a3a605711 100644
--- a/drivers/net/ethernet/micrel/ks8851.h
+++ b/drivers/net/ethernet/micrel/ks8851.h
@@ -408,10 +408,8 @@ struct ks8851_net {
 	struct gpio_desc	*gpio;
 	struct mii_bus		*mii_bus;
 
-	void			(*lock)(struct ks8851_net *ks,
-					unsigned long *flags);
-	void			(*unlock)(struct ks8851_net *ks,
-					  unsigned long *flags);
+	void			(*lock)(struct ks8851_net *ks);
+	void			(*unlock)(struct ks8851_net *ks);
 	unsigned int		(*rdreg16)(struct ks8851_net *ks,
 					   unsigned int reg);
 	void			(*wrreg16)(struct ks8851_net *ks,
diff --git a/drivers/net/ethernet/micrel/ks8851_common.c b/drivers/net/ethernet/micrel/ks8851_common.c
index 8048770958d60..6c375647b24de 100644
--- a/drivers/net/ethernet/micrel/ks8851_common.c
+++ b/drivers/net/ethernet/micrel/ks8851_common.c
@@ -28,25 +28,23 @@
 /**
  * ks8851_lock - register access lock
  * @ks: The chip state
- * @flags: Spinlock flags
  *
  * Claim chip register access lock
  */
-static void ks8851_lock(struct ks8851_net *ks, unsigned long *flags)
+static void ks8851_lock(struct ks8851_net *ks)
 {
-	ks->lock(ks, flags);
+	ks->lock(ks);
 }
 
 /**
  * ks8851_unlock - register access unlock
  * @ks: The chip state
- * @flags: Spinlock flags
  *
  * Release chip register access lock
  */
-static void ks8851_unlock(struct ks8851_net *ks, unsigned long *flags)
+static void ks8851_unlock(struct ks8851_net *ks)
 {
-	ks->unlock(ks, flags);
+	ks->unlock(ks);
 }
 
 /**
@@ -129,11 +127,10 @@ static void ks8851_set_powermode(struct ks8851_net *ks, unsigned pwrmode)
 static int ks8851_write_mac_addr(struct net_device *dev)
 {
 	struct ks8851_net *ks = netdev_priv(dev);
-	unsigned long flags;
 	u16 val;
 	int i;
 
-	ks8851_lock(ks, &flags);
+	ks8851_lock(ks);
 
 	/*
 	 * Wake up chip in case it was powered off when stopped; otherwise,
@@ -149,7 +146,7 @@ static int ks8851_write_mac_addr(struct net_device *dev)
 	if (!netif_running(dev))
 		ks8851_set_powermode(ks, PMECR_PM_SOFTDOWN);
 
-	ks8851_unlock(ks, &flags);
+	ks8851_unlock(ks);
 
 	return 0;
 }
@@ -163,12 +160,11 @@ static int ks8851_write_mac_addr(struct net_device *dev)
 static void ks8851_read_mac_addr(struct net_device *dev)
 {
 	struct ks8851_net *ks = netdev_priv(dev);
-	unsigned long flags;
 	u8 addr[ETH_ALEN];
 	u16 reg;
 	int i;
 
-	ks8851_lock(ks, &flags);
+	ks8851_lock(ks);
 
 	for (i = 0; i < ETH_ALEN; i += 2) {
 		reg = ks8851_rdreg16(ks, KS_MAR(i));
@@ -177,7 +173,7 @@ static void ks8851_read_mac_addr(struct net_device *dev)
 	}
 	eth_hw_addr_set(dev, addr);
 
-	ks8851_unlock(ks, &flags);
+	ks8851_unlock(ks);
 }
 
 /**
@@ -312,11 +308,10 @@ static irqreturn_t ks8851_irq(int irq, void *_ks)
 {
 	struct ks8851_net *ks = _ks;
 	struct sk_buff_head rxq;
-	unsigned long flags;
 	unsigned int status;
 	struct sk_buff *skb;
 
-	ks8851_lock(ks, &flags);
+	ks8851_lock(ks);
 
 	status = ks8851_rdreg16(ks, KS_ISR);
 	ks8851_wrreg16(ks, KS_ISR, status);
@@ -373,7 +368,7 @@ static irqreturn_t ks8851_irq(int irq, void *_ks)
 		ks8851_wrreg16(ks, KS_RXCR1, rxc->rxcr1);
 	}
 
-	ks8851_unlock(ks, &flags);
+	ks8851_unlock(ks);
 
 	if (status & IRQ_LCI)
 		mii_check_link(&ks->mii);
@@ -405,7 +400,6 @@ static void ks8851_flush_tx_work(struct ks8851_net *ks)
 static int ks8851_net_open(struct net_device *dev)
 {
 	struct ks8851_net *ks = netdev_priv(dev);
-	unsigned long flags;
 	int ret;
 
 	ret = request_threaded_irq(dev->irq, NULL, ks8851_irq,
@@ -418,7 +412,7 @@ static int ks8851_net_open(struct net_device *dev)
 
 	/* lock the card, even if we may not actually be doing anything
 	 * else at the moment */
-	ks8851_lock(ks, &flags);
+	ks8851_lock(ks);
 
 	netif_dbg(ks, ifup, ks->netdev, "opening\n");
 
@@ -471,7 +465,7 @@ static int ks8851_net_open(struct net_device *dev)
 
 	netif_dbg(ks, ifup, ks->netdev, "network device up\n");
 
-	ks8851_unlock(ks, &flags);
+	ks8851_unlock(ks);
 	mii_check_link(&ks->mii);
 	return 0;
 }
@@ -487,23 +481,22 @@ static int ks8851_net_open(struct net_device *dev)
 static int ks8851_net_stop(struct net_device *dev)
 {
 	struct ks8851_net *ks = netdev_priv(dev);
-	unsigned long flags;
 
 	netif_info(ks, ifdown, dev, "shutting down\n");
 
 	netif_stop_queue(dev);
 
-	ks8851_lock(ks, &flags);
+	ks8851_lock(ks);
 	/* turn off the IRQs and ack any outstanding */
 	ks8851_wrreg16(ks, KS_IER, 0x0000);
 	ks8851_wrreg16(ks, KS_ISR, 0xffff);
-	ks8851_unlock(ks, &flags);
+	ks8851_unlock(ks);
 
 	/* stop any outstanding work */
 	ks8851_flush_tx_work(ks);
 	flush_work(&ks->rxctrl_work);
 
-	ks8851_lock(ks, &flags);
+	ks8851_lock(ks);
 	/* shutdown RX process */
 	ks8851_wrreg16(ks, KS_RXCR1, 0x0000);
 
@@ -512,7 +505,7 @@ static int ks8851_net_stop(struct net_device *dev)
 
 	/* set powermode to soft power down to save power */
 	ks8851_set_powermode(ks, PMECR_PM_SOFTDOWN);
-	ks8851_unlock(ks, &flags);
+	ks8851_unlock(ks);
 
 	/* ensure any queued tx buffers are dumped */
 	while (!skb_queue_empty(&ks->txq)) {
@@ -566,14 +559,13 @@ static netdev_tx_t ks8851_start_xmit(struct sk_buff *skb,
 static void ks8851_rxctrl_work(struct work_struct *work)
 {
 	struct ks8851_net *ks = container_of(work, struct ks8851_net, rxctrl_work);
-	unsigned long flags;
 
-	ks8851_lock(ks, &flags);
+	ks8851_lock(ks);
 
 	/* need to shutdown RXQ before modifying filter parameters */
 	ks8851_wrreg16(ks, KS_RXCR1, 0x00);
 
-	ks8851_unlock(ks, &flags);
+	ks8851_unlock(ks);
 }
 
 static void ks8851_set_rx_mode(struct net_device *dev)
@@ -780,7 +772,6 @@ static int ks8851_set_eeprom(struct net_device *dev,
 {
 	struct ks8851_net *ks = netdev_priv(dev);
 	int offset = ee->offset;
-	unsigned long flags;
 	int len = ee->len;
 	u16 tmp;
 
@@ -794,7 +785,7 @@ static int ks8851_set_eeprom(struct net_device *dev,
 	if (!(ks->rc_ccr & CCR_EEPROM))
 		return -ENOENT;
 
-	ks8851_lock(ks, &flags);
+	ks8851_lock(ks);
 
 	ks8851_eeprom_claim(ks);
 
@@ -817,7 +808,7 @@ static int ks8851_set_eeprom(struct net_device *dev,
 	eeprom_93cx6_wren(&ks->eeprom, false);
 
 	ks8851_eeprom_release(ks);
-	ks8851_unlock(ks, &flags);
+	ks8851_unlock(ks);
 
 	return 0;
 }
@@ -827,7 +818,6 @@ static int ks8851_get_eeprom(struct net_device *dev,
 {
 	struct ks8851_net *ks = netdev_priv(dev);
 	int offset = ee->offset;
-	unsigned long flags;
 	int len = ee->len;
 
 	/* must be 2 byte aligned */
@@ -837,7 +827,7 @@ static int ks8851_get_eeprom(struct net_device *dev,
 	if (!(ks->rc_ccr & CCR_EEPROM))
 		return -ENOENT;
 
-	ks8851_lock(ks, &flags);
+	ks8851_lock(ks);
 
 	ks8851_eeprom_claim(ks);
 
@@ -845,7 +835,7 @@ static int ks8851_get_eeprom(struct net_device *dev,
 
 	eeprom_93cx6_multiread(&ks->eeprom, offset/2, (__le16 *)data, len/2);
 	ks8851_eeprom_release(ks);
-	ks8851_unlock(ks, &flags);
+	ks8851_unlock(ks);
 
 	return 0;
 }
@@ -904,7 +894,6 @@ static int ks8851_phy_reg(int reg)
 static int ks8851_phy_read_common(struct net_device *dev, int phy_addr, int reg)
 {
 	struct ks8851_net *ks = netdev_priv(dev);
-	unsigned long flags;
 	int result;
 	int ksreg;
 
@@ -912,9 +901,9 @@ static int ks8851_phy_read_common(struct net_device *dev, int phy_addr, int reg)
 	if (ksreg < 0)
 		return ksreg;
 
-	ks8851_lock(ks, &flags);
+	ks8851_lock(ks);
 	result = ks8851_rdreg16(ks, ksreg);
-	ks8851_unlock(ks, &flags);
+	ks8851_unlock(ks);
 
 	return result;
 }
@@ -949,14 +938,13 @@ static void ks8851_phy_write(struct net_device *dev,
 			     int phy, int reg, int value)
 {
 	struct ks8851_net *ks = netdev_priv(dev);
-	unsigned long flags;
 	int ksreg;
 
 	ksreg = ks8851_phy_reg(reg);
 	if (ksreg >= 0) {
-		ks8851_lock(ks, &flags);
+		ks8851_lock(ks);
 		ks8851_wrreg16(ks, ksreg, value);
-		ks8851_unlock(ks, &flags);
+		ks8851_unlock(ks);
 	}
 }
 
diff --git a/drivers/net/ethernet/micrel/ks8851_par.c b/drivers/net/ethernet/micrel/ks8851_par.c
index 78695be2570bf..9f1c33f6ddec0 100644
--- a/drivers/net/ethernet/micrel/ks8851_par.c
+++ b/drivers/net/ethernet/micrel/ks8851_par.c
@@ -55,29 +55,27 @@ struct ks8851_net_par {
 /**
  * ks8851_lock_par - register access lock
  * @ks: The chip state
- * @flags: Spinlock flags
  *
  * Claim chip register access lock
  */
-static void ks8851_lock_par(struct ks8851_net *ks, unsigned long *flags)
+static void ks8851_lock_par(struct ks8851_net *ks)
 {
 	struct ks8851_net_par *ksp = to_ks8851_par(ks);
 
-	spin_lock_irqsave(&ksp->lock, *flags);
+	spin_lock_bh(&ksp->lock);
 }
 
 /**
  * ks8851_unlock_par - register access unlock
  * @ks: The chip state
- * @flags: Spinlock flags
  *
  * Release chip register access lock
  */
-static void ks8851_unlock_par(struct ks8851_net *ks, unsigned long *flags)
+static void ks8851_unlock_par(struct ks8851_net *ks)
 {
 	struct ks8851_net_par *ksp = to_ks8851_par(ks);
 
-	spin_unlock_irqrestore(&ksp->lock, *flags);
+	spin_unlock_bh(&ksp->lock);
 }
 
 /**
@@ -233,7 +231,6 @@ static netdev_tx_t ks8851_start_xmit_par(struct sk_buff *skb,
 {
 	struct ks8851_net *ks = netdev_priv(dev);
 	netdev_tx_t ret = NETDEV_TX_OK;
-	unsigned long flags;
 	unsigned int txqcr;
 	u16 txmir;
 	int err;
@@ -241,7 +238,7 @@ static netdev_tx_t ks8851_start_xmit_par(struct sk_buff *skb,
 	netif_dbg(ks, tx_queued, ks->netdev,
 		  "%s: skb %p, %d@%p\n", __func__, skb, skb->len, skb->data);
 
-	ks8851_lock_par(ks, &flags);
+	ks8851_lock_par(ks);
 
 	txmir = ks8851_rdreg16_par(ks, KS_TXMIR) & 0x1fff;
 
@@ -262,7 +259,7 @@ static netdev_tx_t ks8851_start_xmit_par(struct sk_buff *skb,
 		ret = NETDEV_TX_BUSY;
 	}
 
-	ks8851_unlock_par(ks, &flags);
+	ks8851_unlock_par(ks);
 
 	return ret;
 }
diff --git a/drivers/net/ethernet/micrel/ks8851_spi.c b/drivers/net/ethernet/micrel/ks8851_spi.c
index a161ae45743ab..b9e68520278d0 100644
--- a/drivers/net/ethernet/micrel/ks8851_spi.c
+++ b/drivers/net/ethernet/micrel/ks8851_spi.c
@@ -71,11 +71,10 @@ struct ks8851_net_spi {
 /**
  * ks8851_lock_spi - register access lock
  * @ks: The chip state
- * @flags: Spinlock flags
  *
  * Claim chip register access lock
  */
-static void ks8851_lock_spi(struct ks8851_net *ks, unsigned long *flags)
+static void ks8851_lock_spi(struct ks8851_net *ks)
 {
 	struct ks8851_net_spi *kss = to_ks8851_spi(ks);
 
@@ -85,11 +84,10 @@ static void ks8851_lock_spi(struct ks8851_net *ks, unsigned long *flags)
 /**
  * ks8851_unlock_spi - register access unlock
  * @ks: The chip state
- * @flags: Spinlock flags
  *
  * Release chip register access lock
  */
-static void ks8851_unlock_spi(struct ks8851_net *ks, unsigned long *flags)
+static void ks8851_unlock_spi(struct ks8851_net *ks)
 {
 	struct ks8851_net_spi *kss = to_ks8851_spi(ks);
 
@@ -309,7 +307,6 @@ static void ks8851_tx_work(struct work_struct *work)
 	struct ks8851_net_spi *kss;
 	unsigned short tx_space;
 	struct ks8851_net *ks;
-	unsigned long flags;
 	struct sk_buff *txb;
 	bool last;
 
@@ -317,7 +314,7 @@ static void ks8851_tx_work(struct work_struct *work)
 	ks = &kss->ks8851;
 	last = skb_queue_empty(&ks->txq);
 
-	ks8851_lock_spi(ks, &flags);
+	ks8851_lock_spi(ks);
 
 	while (!last) {
 		txb = skb_dequeue(&ks->txq);
@@ -343,7 +340,7 @@ static void ks8851_tx_work(struct work_struct *work)
 	ks->tx_space = tx_space;
 	spin_unlock_bh(&ks->statelock);
 
-	ks8851_unlock_spi(ks, &flags);
+	ks8851_unlock_spi(ks);
 }
 
 /**
-- 
2.53.0


^ permalink raw reply related

* [net,PATCH v4 2/2] net: ks8851: Avoid excess softirq scheduling
From: Marek Vasut @ 2026-04-15 23:09 UTC (permalink / raw)
  To: netdev
  Cc: Marek Vasut, Sebastian Andrzej Siewior, stable, David S. Miller,
	Andrew Lunn, Eric Dumazet, Jakub Kicinski, Nicolai Buchwitz,
	Paolo Abeni, Ronald Wahl, Yicong Hui, linux-kernel
In-Reply-To: <20260415231020.455298-1-marex@nabladev.com>

The code injects a packet into netif_rx() repeatedly, which will add
it to its internal NAPI and schedule a softirq, and process it. It is
more efficient to queue multiple packets and process them all at the
local_bh_enable() time.

Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Fixes: e0863634bf9f ("net: ks8851: Queue RX packets in IRQ handler instead of disabling BHs")
Cc: stable@vger.kernel.org
Signed-off-by: Marek Vasut <marex@nabladev.com>
---
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Andrew Lunn <andrew+netdev@lunn.ch>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Nicolai Buchwitz <nb@tipi-net.de>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Ronald Wahl <ronald.wahl@raritan.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Yicong Hui <yiconghui@gmail.com>
Cc: linux-kernel@vger.kernel.org
Cc: netdev@vger.kernel.org
---
V3: New patch
V4: Add RB from Sebastian
---
 drivers/net/ethernet/micrel/ks8851_common.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/micrel/ks8851_common.c b/drivers/net/ethernet/micrel/ks8851_common.c
index 6c375647b24de..4afbb40bc0e4a 100644
--- a/drivers/net/ethernet/micrel/ks8851_common.c
+++ b/drivers/net/ethernet/micrel/ks8851_common.c
@@ -373,9 +373,12 @@ static irqreturn_t ks8851_irq(int irq, void *_ks)
 	if (status & IRQ_LCI)
 		mii_check_link(&ks->mii);
 
-	if (status & IRQ_RXI)
+	if (status & IRQ_RXI) {
+		local_bh_disable();
 		while ((skb = __skb_dequeue(&rxq)))
 			netif_rx(skb);
+		local_bh_enable();
+	}
 
 	return IRQ_HANDLED;
 }
-- 
2.53.0


^ permalink raw reply related

* Re: [net,PATCH v3 1/2] net: ks8851: Reinstate disabling of BHs around IRQ handler
From: Marek Vasut @ 2026-04-15 23:14 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: netdev, stable, David S. Miller, Andrew Lunn, Eric Dumazet,
	Jakub Kicinski, Nicolai Buchwitz, Paolo Abeni, Ronald Wahl,
	Yicong Hui, linux-kernel
In-Reply-To: <20260414145218.lsNpdAJI@linutronix.de>

On 4/14/26 4:52 PM, Sebastian Andrzej Siewior wrote:
> On 2026-04-14 16:20:46 [+0200], Marek Vasut wrote:
>>> This is what happens since commit 0913ec336a6c0 ("net: ks8851: Fix
>>> deadlock with the SPI chip variant"). Before that commit the softirq
>>> execution will be picked up by netdev_alloc_skb_ip_align() and requires
>>> PREEMPT_RT and a RX packet in #1 to trigger the deadlock.
>>
>> Do you want me to add this into the V4 commit message ?
> 
> The description does not match the code since the commit mentioned
> above.

I hope the V4 commit message is a bit better.

>>> The backtrace here and the description is based on an older kernel.
>>> However
>> I actually did update the backtrace in V3 with the one from current next
>> 20260413 .
> 
> That would be from yesterday and the change is merged since v6.10. But
> why is the softirq starting from __netdev_alloc_skb() instead of
> spin_unlock_bh(&ks->statelock)? After that unlock, the softirq must be
> processed and __netdev_alloc_skb() _could_ observe pending softirqs but
> not from ks8851.
Because __netdev_alloc_skb() also enables/disables BH , see the "else" 
branch:

  759 struct sk_buff *__netdev_alloc_skb(struct net_device *dev, 
unsigned int len,
  760                                    gfp_t gfp_mask)
  761 {
...
  786         if (in_hardirq() || irqs_disabled()) {
  787                 nc = this_cpu_ptr(&netdev_alloc_cache);
  788                 data = page_frag_alloc(nc, len, gfp_mask);
  789                 pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
  790         } else {
  791                 local_bh_disable();
  792                 local_lock_nested_bh(&napi_alloc_cache.bh_lock);
...
  798                 local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
  799                 local_bh_enable();
...

^ permalink raw reply

* Re: [PATCH net-next v18 00/15] Begin upstreaming Homa transport protocol
From: John Ousterhout @ 2026-04-15 23:21 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: netdev, pabeni, edumazet, horms
In-Reply-To: <CAGXJAmxCsBPJttXdvK=s2SLxFUBzTDvf6PewPo+6QkxZbuES2Q@mail.gmail.com>

For the record, I have fixed all of the issues reported by the AI
review of this patch series, except for the following comments:

(the following comment was on patch 8/15)

> In v17, Paolo followed up asking about kmem_cache performance and whether
> optimization makes sense. The author responded that kmem_cache was actually
> slower than kzalloc (400 vs 380 cycles). While the author provided
> justification, Paolo's v17 response suggests this remains a point of
> discussion about understanding the performance characteristics.

I have not attempted to track down why this optimization didn't help
for Homa. Let me know if that needs to be done as part of the
acceptance process for this patch series.

(the following comment was on patch 11/15)

> Eric Dumazet raised a performance concern during the September 2025 review
> (https://lore.kernel.org/netdev/CANn89i+r6ogvutW1+C7ZDhEYKZN7WU=OVONpgnoX4EgYX=BF4A@mail.gmail.com/):

> "Also if homa does not have yet GRO support, I doubt
> skb_attempt_defer_free() is a big win."

> Is there evidence that skb_attempt_defer_free provides meaningful
> performance benefits for Homa without GRO support? The concern appears
> unaddressed in subsequent patch versions.

I implemented skb_attempt_defer_free at Paolo's suggestion. My
performance measurements indicated that it does not reduce overall CPU
usage (it actually increases it slightly), but it moves CPU time off
the critical path for receiving long messages. Thus I'm inclined to
retain it. Maybe the performance benefits will increase when I
upstream Homa's GRO support?

-John-

^ permalink raw reply

* [BUG] net: caif: potential UAF in cfusbl_device_notify()
From: Wxm-233 @ 2026-04-15 23:35 UTC (permalink / raw)
  To: kuba, marcel, netdev, linux-bluetooth; +Cc: horms, linux-kernel

Hello,

we hit a KASAN slab-use-after-free read in
cfusbl_device_notify() under syzkaller-style workloads.

We reproduced this on 7.0.0-rc2-g0031c06807cf.

The crash happens in the NETDEV_REGISTER notifier path:

  bnep_add_connection()
    -> register_netdev()
      -> register_netdevice()
        -> call_netdevice_notifiers(NETDEV_REGISTER, dev)
          -> cfusbl_device_notify()

The faulting access is the parent/driver-name check in
net/caif/caif_usb.c:

  if (!(dev->dev.parent && dev->dev.parent->driver &&
        strcmp(dev->dev.parent->driver->name, "cdc_ncm") == 0))
          return 0;

From the report, the freed object appears to be the device behind
dev->dev.parent, not the net_device itself.

My current reading is that this is a cross-subsystem lifetime issue:

- the CAIF USB code registers a global netdevice notifier and
  inspects every new netdev
- BNEP sets dev->dev.parent via
  SET_NETDEV_DEV(dev, bnep_get_device(s))
- SET_NETDEV_DEV() only stores a raw pointer
- under concurrent Bluetooth teardown / suspend-related activity,
  that parent device can be released before cfusbl_device_notify()
  dereferences dev->dev.parent->driver

So this does not look like a CAIF modem functional regression in
ordinary use, but it does look like a real UAF in the notifier-side
filtering logic.

The trigger is clearly fuzzing-oriented and cross-subsystem, so I
would treat it as low urgency, but it still seems worth fixing in
net/caif/caif_usb.c, likely by avoiding naked parent dereferences in
the notifier path or by using a safer or lower-scope identification
path for relevant devices.

If useful, I can also send the full KASAN report separately.

Thanks.

^ permalink raw reply

* Re: [PATCH net-next] net: stmmac: enable RPS and RBU interrupts
From: Russell King (Oracle) @ 2026-04-16  0:02 UTC (permalink / raw)
  To: Sam Edwards
  Cc: Jakub Kicinski, Andrew Lunn, Alexandre Torgue, Andrew Lunn,
	David S. Miller, Eric Dumazet,
	moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE,
	linux-stm32, Linux Network Development Mailing List, Paolo Abeni
In-Reply-To: <CAH5Ym4j3GePEMEMmg1Z27gYfQ0N8Sc1BMW1rnvNZ4aLQ+cfFyQ@mail.gmail.com>

On Wed, Apr 15, 2026 at 01:50:53PM -0700, Sam Edwards wrote:
> On Wed, Apr 15, 2026 at 12:37 PM Russell King (Oracle)
> <linux@armlinux.org.uk> wrote:
> >
> > It's not a question about how I define RBU - this is defined by Synopsys
> > and I'm using it *exactly* that way as stated in the documentation.
> >
> > "This bit indicates that the host owns the Next Descriptor in the
> > Receive List and the DMA cannot acquire it. The Receive Process is
> > suspended. ... This bit is set only when the previous Receive
> > Descriptor is owned by the DMA."
> >
> > In other words, DMA has processed the previous receive descriptor which
> > _was_ owned by the hardware, written back to clear the OWN bit, and
> > then fetches the next descriptor and finds that the OWN bit is also
> > clear.
> 
> I'm only trying to leave open the possibility that the Synopsys
> technical writer and the hardware implementation team weren't
> communicating clearly. We already have a situation where RPS isn't
> behaving as documented (even if that's likely just hardware
> misconfiguration), so while I'm currently pretty sure RBU carries no
> other (actual) meaning than "DMA caught up to OWN=0," I'm only about
> 75% confident.

It doesn't make sense for RPS to be set though. RPS is "Receive Process
Stopped" and it's documented as being raised when the receive process
enters the stopped state.

If we look at the DMA Debug Status 0 register at 0x100c, then this
gives us a four bit bitfield for channels 0, 1 and 2. Further channels
are in 0x1010. I've added code to dump these when RBU occurs:

dwc-eth-dwmac 2490000.ethernet eth0: debug status: 0x00006400 0x00000000

bits 11:8 are RPS0, which indiciates that the DMA channel 0 receive
process state is "Suspended (Rx Descriptor Unavailable)". If this were
0, then it would be "Stopped (Reset or Stop Receive Command issued)".

So, RPS isn't being raised because the process state isn't entering
the stopped state, which makes sense - because we haven't issued a
stop command, nor have we caused a reset, and the documented recovery
from this condition is to merely advance the tail pointer, rather than
issuing a command to re-start the receive process.

When this is done (because stmmac_rx() continues to periodically run
because of NAPI) RPS0 does change back to 3 "Running (Waiting for Rx
packet)" but it seems that although there are packets waiting to be
written out, that never happens (the Queue 0 Receive Debug register
indicates that there are packets in the receive queue, the receive
queue fill level is above the flow control activate threshold, and
the MAC itself hammers the network with pause frames as a result.)

Thus, I think that the fact that RPS isn't being signalled is entirely
reasonable and consistent with the available documentation.

-- 
RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
FTTP is here! 80Mbps down 10Mbps up. Decent connectivity at last!

^ permalink raw reply

* Re: [PATCH bpf v4 0/5] bpf, sockmap: Fix af_unix null-ptr-deref in proto update
From: patchwork-bot+netdevbpf @ 2026-04-16  0:30 UTC (permalink / raw)
  To: Michal Luczaj
  Cc: john.fastabend, jakub, edumazet, kuniyu, pabeni, willemb, davem,
	kuba, horms, yhs, andrii, ast, daniel, martin.lau, eddyz87, song,
	yonghong.song, kpsingh, sdf, haoluo, jolsa, shuah, cong.wang,
	netdev, bpf, linux-kernel, linux-kselftest, jiayuan.chen,
	yimingqian591
In-Reply-To: <20260414-unix-proto-update-null-ptr-deref-v4-0-2af6fe97918e@rbox.co>

Hello:

This series was applied to bpf/bpf.git (master)
by Martin KaFai Lau <martin.lau@kernel.org>:

On Tue, 14 Apr 2026 16:13:14 +0200 you wrote:
> Updating sockmap/sockhash using a unix sock races unix_stream_connect():
> when sock_map_sk_state_allowed() passes (sk_state == TCP_ESTABLISHED),
> unix_peer(sk) in unix_stream_bpf_update_proto() may still return NULL.
> 
> Signed-off-by: Michal Luczaj <mhal@rbox.co>
> ---
> Changes in v4:
> - Circle back to v1 approach
> - More details in commit messages [Martin]
> - Make unix iter take the state lock [Kaniyuki]
> - Link to v3: https://lore.kernel.org/r/20260306-unix-proto-update-null-ptr-deref-v3-0-2f0c7410c523@rbox.co
> 
> [...]

Here is the summary with links:
  - [bpf,v4,1/5] bpf, sockmap: Annotate af_unix sock::sk_state data-races
    https://git.kernel.org/bpf/bpf/c/a25566084e39
  - [bpf,v4,2/5] bpf, sockmap: Fix af_unix iter deadlock
    https://git.kernel.org/bpf/bpf/c/4d328dd69538
  - [bpf,v4,3/5] selftests/bpf: Extend bpf_iter_unix to attempt deadlocking
    https://git.kernel.org/bpf/bpf/c/997b8483d44c
  - [bpf,v4,4/5] bpf, sockmap: Fix af_unix null-ptr-deref in proto update
    https://git.kernel.org/bpf/bpf/c/dca38b7734d2
  - [bpf,v4,5/5] bpf, sockmap: Take state lock for af_unix iter
    https://git.kernel.org/bpf/bpf/c/64c2f93fc325

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply

* Re: [PATCH v3 1/4] rust: netlink: add raw netlink abstraction
From: Andrew Lunn @ 2026-04-16  0:42 UTC (permalink / raw)
  To: Alice Ryhl
  Cc: Miguel Ojeda, Boqun Feng, Gary Guo, Björn Roy Baron,
	Benno Lossin, Andreas Hindborg, Trevor Gross, Danilo Krummrich,
	Donald Hunter, Jakub Kicinski, David S. Miller, Eric Dumazet,
	Paolo Abeni, Simon Horman, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Christian Brauner,
	Carlos Llamas, linux-kernel, rust-for-linux, netdev
In-Reply-To: <20260415-binder-netlink-v3-1-84be9ba63ee2@google.com>

> diff --git a/rust/helpers/genetlink.c b/rust/helpers/genetlink.c
> new file mode 100644
> index 000000000000..3530b69f6cf7
> --- /dev/null
> +++ b/rust/helpers/genetlink.c
> @@ -0,0 +1,46 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +/*
> + * Copyright (C) 2026 Google LLC.
> + */
> +
> +#include <net/genetlink.h>
> +
> +#ifdef CONFIG_NET
> +
> +__rust_helper struct sk_buff *rust_helper_genlmsg_new(size_t payload, gfp_t flags)
> +{
> +	return genlmsg_new(payload, flags);
> +}
> +
> +__rust_helper
> +int rust_helper_genlmsg_multicast(const struct genl_family *family,
> +				  struct sk_buff *skb, u32 portid,
> +				  unsigned int group, gfp_t flags)
> +{
> +	return genlmsg_multicast(family, skb, portid, group, flags);
> +}
> +
> +__rust_helper void rust_helper_genlmsg_cancel(struct sk_buff *skb, void *hdr)
> +{
> +	genlmsg_cancel(skb, hdr);
> +}
> +
> +__rust_helper void rust_helper_genlmsg_end(struct sk_buff *skb, void *hdr)
> +{
> +	genlmsg_end(skb, hdr);
> +}
> +
> +__rust_helper void rust_helper_nlmsg_free(struct sk_buff *skb)
> +{
> +	nlmsg_free(skb);
> +}
> +
> +__rust_helper
> +int rust_helper_genl_has_listeners(const struct genl_family *family,
> +				   struct net *net, unsigned int group)
> +{
> +	return genl_has_listeners(family, net, group);
> +}

The C part looks sensible now. For this bit only:

Reviewed-by: Andrew Lunn <andrew@lunn.ch>

    Andrew

^ permalink raw reply

* Re: [PATCH v3 4/4] rust_binder: report netlink transactions
From: Andrew Lunn @ 2026-04-16  0:46 UTC (permalink / raw)
  To: Alice Ryhl
  Cc: Miguel Ojeda, Boqun Feng, Gary Guo, Björn Roy Baron,
	Benno Lossin, Andreas Hindborg, Trevor Gross, Danilo Krummrich,
	Donald Hunter, Jakub Kicinski, David S. Miller, Eric Dumazet,
	Paolo Abeni, Simon Horman, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Christian Brauner,
	Carlos Llamas, linux-kernel, rust-for-linux, netdev
In-Reply-To: <20260415-binder-netlink-v3-4-84be9ba63ee2@google.com>

On Wed, Apr 15, 2026 at 09:37:54AM +0000, Alice Ryhl wrote:
> From: Carlos Llamas <cmllamas@google.com>
> 
> The Android Binder driver supports a netlink API that reports
> transaction *failures* to a userapce daemon. This allows devices to

userspace ?

	Andrew

^ permalink raw reply

* Re: [PATCH net-next v9 04/10] net: phy: Create SFP phy_port before registering upstream
From: Andrew Lunn @ 2026-04-16  0:59 UTC (permalink / raw)
  To: Maxime Chevallier
  Cc: davem, Jakub Kicinski, Eric Dumazet, Paolo Abeni, Russell King,
	Heiner Kallweit, netdev, linux-kernel, thomas.petazzoni,
	Christophe Leroy, Herve Codina, Florian Fainelli, Vladimir Oltean,
	Köry Maincent, Marek Behún, Oleksij Rempel,
	Nicolò Veronese, Simon Horman, mwojtas, Romain Gantois,
	Daniel Golle, Dimitri Fedrau
In-Reply-To: <e4f6503c-6514-46f1-bf4f-a38c12903c16@bootlin.com>

On Wed, Apr 15, 2026 at 09:19:45AM +0200, Maxime Chevallier wrote:
> Hi Andrew,
> 
> On 15/04/2026 01:46, Andrew Lunn wrote:
> > On Fri, Apr 03, 2026 at 02:37:48PM +0200, Maxime Chevallier wrote:
> >> When dealing with PHY-driven SFP, we create a phy_port representing the
> >> SFP bus when we know we have such a bus.
> > 
> > I'm missing the big picture here.
> > 
> > Do we have three different things represented in the topology:
> > 
> > SFP bus-> SFP cage-> SFP module
> > 
> > 	Andrew
> 
> Ah by bad, this is a wording issue, this is the port for the cage indeed.
> 
> The model I ended-up with is to represent the SFP cage itself as a PHY
> port, but I've been calling that the "sfp bus port" in the code so far :/

I know we have sfp-bus.c, but i think most people think in terms of
cage and module. So unless we need some concepts from sfp-bus, i think
it would be better if we use phylink->sfp_cage_port and
phylink->mod_port. 

	Andrew

^ permalink raw reply

* [PATCH net] selftests: net: add missing CMAC to tcp_ao config
From: Jakub Kicinski @ 2026-04-16  1:04 UTC (permalink / raw)
  To: davem
  Cc: netdev, edumazet, pabeni, andrew+netdev, horms, Jakub Kicinski,
	shuah, fw, antonio, matttbe, phil, linux-kselftest

Recent changes to crypto and wifi made CMAC no longer
selected by default on x86 and tcp_ao needs it.
Add the missing config.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
CC: shuah@kernel.org
CC: fw@strlen.de
CC: antonio@openvpn.net
CC: matttbe@kernel.org
CC: phil@nwl.cc
CC: linux-kselftest@vger.kernel.org
---
 tools/testing/selftests/net/tcp_ao/config | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/net/tcp_ao/config b/tools/testing/selftests/net/tcp_ao/config
index 971cb6fa2d63..f22148512365 100644
--- a/tools/testing/selftests/net/tcp_ao/config
+++ b/tools/testing/selftests/net/tcp_ao/config
@@ -1,3 +1,4 @@
+CONFIG_CRYPTO_CMAC=y
 CONFIG_CRYPTO_HMAC=y
 CONFIG_CRYPTO_RMD160=y
 CONFIG_CRYPTO_SHA1=y
-- 
2.53.0


^ permalink raw reply related

* [PATCH net 00/14] Netfilter/IPVS fixes for net
From: Pablo Neira Ayuso @ 2026-04-16  1:30 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms

Hi,

The following patchset contains Netfilter/IPVS fixes for net: Mostly
addressing very old bugs in the SIP conntrack helper string parser,
unsafe arp_tables match support with legacy IEEE1394, restrict xt_realm
to IPv4 and incorrect use of RCU lists in nat core and nftables. This
batch also includes one IPVS MTU fix. The exception is a fix for a
recent issue related to broken double-tagged vlan in the flowtable.

1) Fix possible stack recursion in nft_fwd_netdev from egress path,
   from Weiming Shi.

2) Fix unsafe port parser in SIP helper, from Jenny Guanni Qu.

3) Fix arp_tables match with IEEE1394 ARP payload, allowing to
   reach bytes off the skb boundary, from Weiming Shi.

4) Reject unsafe nfnetlink_osf configurations from control plane,
   this is addressing a possible division by zero, from Xiang Mei.

5) nft_osf actually only supports IPv4, restrict it.

6) Fix double-tagged-vlan support (again) in the flowtable, from
   Eric Woudstra.

7) Remove unsafe use of sprintf to fix possible buffer overflow
   in the SIP NAT helper, from Florian Westphal.

8) Restrict xt_mac, xt_owner and xt_physdev to inet families only;
   xt_realm is only for ipv4, otherwise null-pointer-deref is possible.

9) Use kfree_rcu() in nat core to release hooks, this can be an issue
   once nfnetlink_hook gets support to dump NAT hook information,
   not currently a real issue but better fix it now.

10) Fix MTU checks in IPVS, from Yingnan Zhang.

11) Use list_del_rcu() in chain and flowtable hook unregistration,
    concurrent RCU reader could be walking over the hook list,
    from Florian Westphal.

12) Add list_splice_rcu(), this is required to fix unsafe
    splice to RCU protected hook list. Reviewed by Paul McKenney.

13) Use list_splice_rcu() to splice new chain and flowtable hooks.

14) Add shim nft_trans_hook object to track chain and flowtable
    hook deletions and flag them as removed, instead of unsafely
    moving around hooks in the RCU-protected hook list. This allows
    to restore the previous state from the abort path.

Please, pull these changes from:

  git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf.git nf-26-04-16

Thanks.

----------------------------------------------------------------

The following changes since commit 2dddb34dd0d07b01fa770eca89480a4da4f13153:

  net: ethernet: mtk_eth_soc: initialize PPE per-tag-layer MTU registers (2026-04-12 15:22:58 -0700)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf.git tags/nf-26-04-16

for you to fetch changes up to e349f90da812aeddd22c3914a2cc639b51e4eb48:

  netfilter: nf_tables: add hook transactions for device deletions (2026-04-16 02:47:58 +0200)

----------------------------------------------------------------
netfilter pull request 26-04-16

----------------------------------------------------------------
Eric Woudstra (1):
      netfilter: nf_flow_table_ip: Introduce nf_flow_vlan_push()

Florian Westphal (2):
      netfilter: conntrack: remove sprintf usage
      netfilter: nf_tables: use list_del_rcu for netlink hooks

Jenny Guanni Qu (1):
      netfilter: nf_conntrack_sip: add bounds-checked port parsing helper

Pablo Neira Ayuso (6):
      netfilter: nft_osf: restrict it to ipv4
      netfilter: xtables: restrict several matches to inet family
      netfilter: nat: use kfree_rcu to release ops
      rculist: add list_splice_rcu() for private lists
      netfilter: nf_tables: join hook list via splice_list_rcu() in commit phase
      netfilter: nf_tables: add hook transactions for device deletions

Weiming Shi (2):
      netfilter: nft_fwd_netdev: use recursion counter in neigh egress path
      netfilter: arp_tables: fix IEEE1394 ARP payload parsing in arp_packet_match()

Xiang Mei (1):
      netfilter: nfnetlink_osf: fix divide-by-zero in OSF_WSS_MODULO

Yingnan Zhang (1):
      ipvs: fix MTU check for GSO packets in tunnel mode

 include/linux/rculist.h               |  29 ++++++
 include/net/netfilter/nf_dup_netdev.h |  13 +++
 include/net/netfilter/nf_tables.h     |  13 +++
 net/ipv4/netfilter/arp_tables.c       |  14 ++-
 net/ipv4/netfilter/iptable_nat.c      |   2 +-
 net/ipv6/netfilter/ip6table_nat.c     |   2 +-
 net/netfilter/ipvs/ip_vs_xmit.c       |  19 +++-
 net/netfilter/nf_conntrack_sip.c      |  80 +++++++++++-----
 net/netfilter/nf_dup_netdev.c         |  16 ----
 net/netfilter/nf_flow_table_ip.c      |  25 ++++-
 net/netfilter/nf_nat_amanda.c         |   2 +-
 net/netfilter/nf_nat_core.c           |  10 +-
 net/netfilter/nf_nat_sip.c            |  33 ++++---
 net/netfilter/nf_tables_api.c         | 168 ++++++++++++++++++++++++----------
 net/netfilter/nfnetlink_osf.c         |   4 +
 net/netfilter/nft_fwd_netdev.c        |   7 ++
 net/netfilter/nft_osf.c               |   6 +-
 net/netfilter/xt_mac.c                |  34 ++++---
 net/netfilter/xt_owner.c              |  37 +++++---
 net/netfilter/xt_physdev.c            |  29 ++++--
 net/netfilter/xt_realm.c              |   2 +-
 21 files changed, 393 insertions(+), 152 deletions(-)

^ permalink raw reply

* [PATCH net 01/14] netfilter: nft_fwd_netdev: use recursion counter in neigh egress path
From: Pablo Neira Ayuso @ 2026-04-16  1:30 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
In-Reply-To: <20260416013101.221555-1-pablo@netfilter.org>

From: Weiming Shi <bestswngs@gmail.com>

nft_fwd_neigh can be used in egress chains (NF_NETDEV_EGRESS). When the
forwarding rule targets the same device or two devices forward to each
other, neigh_xmit() triggers dev_queue_xmit() which re-enters
nf_hook_egress(), causing infinite recursion and stack overflow.

Move the nf_get_nf_dup_skb_recursion() accessor and NF_RECURSION_LIMIT
to the shared header nf_dup_netdev.h as a static inline, so that
nft_fwd_netdev can use the recursion counter directly without exported
function call overhead. Guard neigh_xmit() with the same recursion
limit already used in nf_do_netdev_egress().

Fixes: f87b9464d152 ("netfilter: nft_fwd_netdev: Support egress hook")
Reported-by: Xiang Mei <xmei5@asu.edu>
Signed-off-by: Weiming Shi <bestswngs@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_dup_netdev.h | 13 +++++++++++++
 net/netfilter/nf_dup_netdev.c         | 16 ----------------
 net/netfilter/nft_fwd_netdev.c        |  7 +++++++
 3 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/include/net/netfilter/nf_dup_netdev.h b/include/net/netfilter/nf_dup_netdev.h
index b175d271aec9..609bcf422a9b 100644
--- a/include/net/netfilter/nf_dup_netdev.h
+++ b/include/net/netfilter/nf_dup_netdev.h
@@ -3,10 +3,23 @@
 #define _NF_DUP_NETDEV_H_
 
 #include <net/netfilter/nf_tables.h>
+#include <linux/netdevice.h>
+#include <linux/sched.h>
 
 void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif);
 void nf_fwd_netdev_egress(const struct nft_pktinfo *pkt, int oif);
 
+#define NF_RECURSION_LIMIT	2
+
+static inline u8 *nf_get_nf_dup_skb_recursion(void)
+{
+#ifndef CONFIG_PREEMPT_RT
+	return this_cpu_ptr(&softnet_data.xmit.nf_dup_skb_recursion);
+#else
+	return &current->net_xmit.nf_dup_skb_recursion;
+#endif
+}
+
 struct nft_offload_ctx;
 struct nft_flow_rule;
 
diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c
index fab8b9011098..a958a1b0c5be 100644
--- a/net/netfilter/nf_dup_netdev.c
+++ b/net/netfilter/nf_dup_netdev.c
@@ -13,22 +13,6 @@
 #include <net/netfilter/nf_tables_offload.h>
 #include <net/netfilter/nf_dup_netdev.h>
 
-#define NF_RECURSION_LIMIT	2
-
-#ifndef CONFIG_PREEMPT_RT
-static u8 *nf_get_nf_dup_skb_recursion(void)
-{
-	return this_cpu_ptr(&softnet_data.xmit.nf_dup_skb_recursion);
-}
-#else
-
-static u8 *nf_get_nf_dup_skb_recursion(void)
-{
-	return &current->net_xmit.nf_dup_skb_recursion;
-}
-
-#endif
-
 static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev,
 				enum nf_dev_hooks hook)
 {
diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c
index 152a9fb4d23a..492bb599a499 100644
--- a/net/netfilter/nft_fwd_netdev.c
+++ b/net/netfilter/nft_fwd_netdev.c
@@ -141,13 +141,20 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr,
 		goto out;
 	}
 
+	if (*nf_get_nf_dup_skb_recursion() > NF_RECURSION_LIMIT) {
+		verdict = NF_DROP;
+		goto out;
+	}
+
 	dev = dev_get_by_index_rcu(nft_net(pkt), oif);
 	if (dev == NULL)
 		return;
 
 	skb->dev = dev;
 	skb_clear_tstamp(skb);
+	(*nf_get_nf_dup_skb_recursion())++;
 	neigh_xmit(neigh_table, dev, addr, skb);
+	(*nf_get_nf_dup_skb_recursion())--;
 out:
 	regs->verdict.code = verdict;
 }
-- 
2.47.3


^ permalink raw reply related

* [PATCH net 02/14] netfilter: nf_conntrack_sip: add bounds-checked port parsing helper
From: Pablo Neira Ayuso @ 2026-04-16  1:30 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
In-Reply-To: <20260416013101.221555-1-pablo@netfilter.org>

From: Jenny Guanni Qu <qguanni@gmail.com>

Replace unsafe port parsing in epaddr_len(), ct_sip_parse_header_uri(),
and ct_sip_parse_request() with a new sip_parse_port() helper that
validates each digit against the buffer limit, eliminating the use of
simple_strtoul() which assumes NUL-terminated strings.

The previous code dereferenced pointers without bounds checks after
sip_parse_addr() and relied on simple_strtoul() on non-NUL-terminated
skb data. A port that reaches the buffer limit without a trailing
character is also rejected as malformed.

Based on a suggestion by Florian Westphal.

[ fw@strlen.de: make port range check unconditional ]

Fixes: 05e3ced297fe ("[NETFILTER]: nf_conntrack_sip: introduce SIP-URI parsing helper")
Reported-by: Klaudia Kloc <klaudia@vidocsecurity.com>
Reported-by: Dawid Moczadło <dawid@vidocsecurity.com>
Suggested-by: Florian Westphal <fw@strlen.de>
Tested-by: Jenny Guanni Qu <qguanni@gmail.com>
Tested-by: Weiming Shi <bestswngs@gmail.com>
Signed-off-by: Jenny Guanni Qu <qguanni@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_sip.c | 80 +++++++++++++++++++++++---------
 1 file changed, 57 insertions(+), 23 deletions(-)

diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 939502ff7c87..38a55a3b3d19 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -181,6 +181,57 @@ static int sip_parse_addr(const struct nf_conn *ct, const char *cp,
 	return 1;
 }
 
+/* Parse optional port number after IP address.
+ * Returns false on malformed input, true otherwise.
+ * If port is non-NULL, stores parsed port in network byte order.
+ * If no port is present, sets *port to default SIP port.
+ */
+static bool sip_parse_port(const char *dptr, const char **endp,
+			   const char *limit, __be16 *port)
+{
+	unsigned int p = 0;
+	int len = 0;
+
+	if (dptr >= limit)
+		return false;
+
+	if (*dptr != ':') {
+		if (port)
+			*port = htons(SIP_PORT);
+		if (endp)
+			*endp = dptr;
+		return true;
+	}
+
+	dptr++; /* skip ':' */
+
+	while (dptr < limit && isdigit(*dptr)) {
+		p = p * 10 + (*dptr - '0');
+		dptr++;
+		len++;
+		if (len > 5) /* max "65535" */
+			return false;
+	}
+
+	if (len == 0)
+		return false;
+
+	/* reached limit while parsing port */
+	if (dptr >= limit)
+		return false;
+
+	if (p < 1024 || p > 65535)
+		return false;
+
+	if (port)
+		*port = htons(p);
+
+	if (endp)
+		*endp = dptr;
+
+	return true;
+}
+
 /* skip ip address. returns its length. */
 static int epaddr_len(const struct nf_conn *ct, const char *dptr,
 		      const char *limit, int *shift)
@@ -193,11 +244,8 @@ static int epaddr_len(const struct nf_conn *ct, const char *dptr,
 		return 0;
 	}
 
-	/* Port number */
-	if (*dptr == ':') {
-		dptr++;
-		dptr += digits_len(ct, dptr, limit, shift);
-	}
+	if (!sip_parse_port(dptr, &dptr, limit, NULL))
+		return 0;
 	return dptr - aux;
 }
 
@@ -241,7 +289,6 @@ int ct_sip_parse_request(const struct nf_conn *ct,
 {
 	const char *start = dptr, *limit = dptr + datalen, *end;
 	unsigned int mlen;
-	unsigned int p;
 	int shift = 0;
 
 	/* Skip method and following whitespace */
@@ -267,14 +314,8 @@ int ct_sip_parse_request(const struct nf_conn *ct,
 
 	if (!sip_parse_addr(ct, dptr, &end, addr, limit, true))
 		return -1;
-	if (end < limit && *end == ':') {
-		end++;
-		p = simple_strtoul(end, (char **)&end, 10);
-		if (p < 1024 || p > 65535)
-			return -1;
-		*port = htons(p);
-	} else
-		*port = htons(SIP_PORT);
+	if (!sip_parse_port(end, &end, limit, port))
+		return -1;
 
 	if (end == dptr)
 		return 0;
@@ -509,7 +550,6 @@ int ct_sip_parse_header_uri(const struct nf_conn *ct, const char *dptr,
 			    union nf_inet_addr *addr, __be16 *port)
 {
 	const char *c, *limit = dptr + datalen;
-	unsigned int p;
 	int ret;
 
 	ret = ct_sip_walk_headers(ct, dptr, dataoff ? *dataoff : 0, datalen,
@@ -520,14 +560,8 @@ int ct_sip_parse_header_uri(const struct nf_conn *ct, const char *dptr,
 
 	if (!sip_parse_addr(ct, dptr + *matchoff, &c, addr, limit, true))
 		return -1;
-	if (*c == ':') {
-		c++;
-		p = simple_strtoul(c, (char **)&c, 10);
-		if (p < 1024 || p > 65535)
-			return -1;
-		*port = htons(p);
-	} else
-		*port = htons(SIP_PORT);
+	if (!sip_parse_port(c, &c, limit, port))
+		return -1;
 
 	if (dataoff)
 		*dataoff = c - dptr;
-- 
2.47.3


^ permalink raw reply related

* [PATCH net 03/14] netfilter: arp_tables: fix IEEE1394 ARP payload parsing in arp_packet_match()
From: Pablo Neira Ayuso @ 2026-04-16  1:30 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
In-Reply-To: <20260416013101.221555-1-pablo@netfilter.org>

From: Weiming Shi <bestswngs@gmail.com>

arp_packet_match() unconditionally parses the ARP payload assuming two
hardware addresses are present (source and target). However,
IPv4-over-IEEE1394 ARP (RFC 2734) omits the target hardware address
field, and arp_hdr_len() already accounts for this by returning a
shorter length for ARPHRD_IEEE1394 devices.

As a result, on IEEE1394 interfaces arp_packet_match() advances past a
nonexistent target hardware address and reads the wrong bytes for both
the target device address comparison and the target IP address. This
causes arptables rules to match against garbage data, leading to
incorrect filtering decisions: packets that should be accepted may be
dropped and vice versa.

The ARP stack in net/ipv4/arp.c (arp_create and arp_process) already
handles this correctly by skipping the target hardware address for
ARPHRD_IEEE1394. Apply the same pattern to arp_packet_match().

[ Pablo has mangled this patch to include Simon Horman's suggestions ]

Fixes: 6752c8db8e0c ("firewire net, ipv4 arp: Extend hardware address and remove driver-level packet inspection.")
Reported-by: Xiang Mei <xmei5@asu.edu>
Signed-off-by: Weiming Shi <bestswngs@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/arp_tables.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 1cdd9c28ab2d..a7a56890b5b5 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -110,13 +110,21 @@ static inline int arp_packet_match(const struct arphdr *arphdr,
 	arpptr += dev->addr_len;
 	memcpy(&src_ipaddr, arpptr, sizeof(u32));
 	arpptr += sizeof(u32);
-	tgt_devaddr = arpptr;
-	arpptr += dev->addr_len;
+
+	if (IS_ENABLED(CONFIG_FIREWIRE_NET) && dev->type == ARPHRD_IEEE1394) {
+		tgt_devaddr = NULL;
+	} else {
+		tgt_devaddr = arpptr;
+		arpptr += dev->addr_len;
+	}
 	memcpy(&tgt_ipaddr, arpptr, sizeof(u32));
 
 	if (NF_INVF(arpinfo, ARPT_INV_SRCDEVADDR,
 		    arp_devaddr_compare(&arpinfo->src_devaddr, src_devaddr,
-					dev->addr_len)) ||
+					dev->addr_len)))
+		return 0;
+
+	if (tgt_devaddr &&
 	    NF_INVF(arpinfo, ARPT_INV_TGTDEVADDR,
 		    arp_devaddr_compare(&arpinfo->tgt_devaddr, tgt_devaddr,
 					dev->addr_len)))
-- 
2.47.3


^ permalink raw reply related

* [PATCH net 04/14] netfilter: nfnetlink_osf: fix divide-by-zero in OSF_WSS_MODULO
From: Pablo Neira Ayuso @ 2026-04-16  1:30 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
In-Reply-To: <20260416013101.221555-1-pablo@netfilter.org>

From: Xiang Mei <xmei5@asu.edu>

nf_osf_match_one() computes ctx->window % f->wss.val in the
OSF_WSS_MODULO branch with no guard for f->wss.val == 0. A
CAP_NET_ADMIN user can add such a fingerprint via nfnetlink; a
subsequent matching TCP SYN divides by zero and panics the kernel.

Reject the bogus fingerprint in nfnl_osf_add_callback() above the
per-option for-loop. f->wss is per-fingerprint, not per-option, so
the check must run regardless of f->opt_num (including 0). Also
reject wss.wc >= OSF_WSS_MAX; nf_osf_match_one() already treats that
as "should not happen".

Crash:
 Oops: divide error: 0000 [#1] SMP KASAN NOPTI
 RIP: 0010:nf_osf_match_one (net/netfilter/nfnetlink_osf.c:98)
 Call Trace:
 <IRQ>
  nf_osf_match (net/netfilter/nfnetlink_osf.c:220)
  xt_osf_match_packet (net/netfilter/xt_osf.c:32)
  ipt_do_table (net/ipv4/netfilter/ip_tables.c:348)
  nf_hook_slow (net/netfilter/core.c:622)
  ip_local_deliver (net/ipv4/ip_input.c:265)
  ip_rcv (include/linux/skbuff.h:1162)
  __netif_receive_skb_one_core (net/core/dev.c:6181)
  process_backlog (net/core/dev.c:6642)
  __napi_poll (net/core/dev.c:7710)
  net_rx_action (net/core/dev.c:7945)
  handle_softirqs (kernel/softirq.c:622)

Fixes: 11eeef41d5f6 ("netfilter: passive OS fingerprint xtables match")
Reported-by: Weiming Shi <bestswngs@gmail.com>
Suggested-by: Florian Westphal <fw@strlen.de>
Suggested-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Xiang Mei <xmei5@asu.edu>
Reviewed-by: Fernando Fernandez Mancera <fmancera@suse.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nfnetlink_osf.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c
index 45d9ad231a92..70172ca07858 100644
--- a/net/netfilter/nfnetlink_osf.c
+++ b/net/netfilter/nfnetlink_osf.c
@@ -320,6 +320,10 @@ static int nfnl_osf_add_callback(struct sk_buff *skb,
 	if (f->opt_num > ARRAY_SIZE(f->opt))
 		return -EINVAL;
 
+	if (f->wss.wc >= OSF_WSS_MAX ||
+	    (f->wss.wc == OSF_WSS_MODULO && f->wss.val == 0))
+		return -EINVAL;
+
 	for (i = 0; i < f->opt_num; i++) {
 		if (!f->opt[i].length || f->opt[i].length > MAX_IPOPTLEN)
 			return -EINVAL;
-- 
2.47.3


^ permalink raw reply related

* [PATCH net 05/14] netfilter: nft_osf: restrict it to ipv4
From: Pablo Neira Ayuso @ 2026-04-16  1:30 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
In-Reply-To: <20260416013101.221555-1-pablo@netfilter.org>

This expression only supports for ipv4, restrict it.

Fixes: b96af92d6eaf ("netfilter: nf_tables: implement Passive OS fingerprint module in nft_osf")
Acked-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Fernando Fernandez Mancera <fmancera@suse.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_osf.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c
index 1c0b493ef0a9..bdc2f6c90e2f 100644
--- a/net/netfilter/nft_osf.c
+++ b/net/netfilter/nft_osf.c
@@ -28,6 +28,11 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs,
 	struct nf_osf_data data;
 	struct tcphdr _tcph;
 
+	if (nft_pf(pkt) != NFPROTO_IPV4) {
+		regs->verdict.code = NFT_BREAK;
+		return;
+	}
+
 	if (pkt->tprot != IPPROTO_TCP) {
 		regs->verdict.code = NFT_BREAK;
 		return;
@@ -114,7 +119,6 @@ static int nft_osf_validate(const struct nft_ctx *ctx,
 
 	switch (ctx->family) {
 	case NFPROTO_IPV4:
-	case NFPROTO_IPV6:
 	case NFPROTO_INET:
 		hooks = (1 << NF_INET_LOCAL_IN) |
 			(1 << NF_INET_PRE_ROUTING) |
-- 
2.47.3


^ permalink raw reply related

* [PATCH net 06/14] netfilter: nf_flow_table_ip: Introduce nf_flow_vlan_push()
From: Pablo Neira Ayuso @ 2026-04-16  1:30 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
In-Reply-To: <20260416013101.221555-1-pablo@netfilter.org>

From: Eric Woudstra <ericwouds@gmail.com>

Calling skb_reset_mac_header() before calling skb_vlan_push() does
remove the error:

"skb_vlan_push got skb with skb->data not at mac header (offset 18)"

But the inner vlan tag is still not inserted correctly.

skb_vlan_push() uses __vlan_insert_inner_tag() to insert the tag
at offset ETH_HLEN. But the inner tag should only be pushed, without
offset, similar to nf_flow_pppoe_push().

Fixes: c653d5a78f34 ("netfilter: flowtable: inline vlan encapsulation in xmit path")
Fixes: a3aca98aec9a ("netfilter: nf_flow_table_ip: reset mac header before vlan push")
Signed-off-by: Eric Woudstra <ericwouds@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_flow_table_ip.c | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index fd56d663cb5b..0086f8a1a0d6 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -544,6 +544,26 @@ static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx,
 	return 1;
 }
 
+static int nf_flow_vlan_push(struct sk_buff *skb, __be16 proto, u16 id)
+{
+	if (skb_vlan_tag_present(skb)) {
+		struct vlan_hdr *vhdr;
+
+		if (skb_cow_head(skb, VLAN_HLEN))
+			return -1;
+
+		__skb_push(skb, VLAN_HLEN);
+		skb_reset_network_header(skb);
+		vhdr = (struct vlan_hdr *)(skb->data);
+		vhdr->h_vlan_TCI = htons(id);
+		vhdr->h_vlan_encapsulated_proto = skb->protocol;
+		skb->protocol = proto;
+	} else {
+		__vlan_hwaccel_put_tag(skb, proto, id);
+	}
+	return 0;
+}
+
 static int nf_flow_pppoe_push(struct sk_buff *skb, u16 id)
 {
 	int data_len = skb->len + sizeof(__be16);
@@ -738,9 +758,8 @@ static int nf_flow_encap_push(struct sk_buff *skb,
 		switch (tuple->encap[i].proto) {
 		case htons(ETH_P_8021Q):
 		case htons(ETH_P_8021AD):
-			skb_reset_mac_header(skb);
-			if (skb_vlan_push(skb, tuple->encap[i].proto,
-					  tuple->encap[i].id) < 0)
+			if (nf_flow_vlan_push(skb, tuple->encap[i].proto,
+					      tuple->encap[i].id) < 0)
 				return -1;
 			break;
 		case htons(ETH_P_PPP_SES):
-- 
2.47.3


^ permalink raw reply related

* [PATCH net 07/14] netfilter: conntrack: remove sprintf usage
From: Pablo Neira Ayuso @ 2026-04-16  1:30 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
In-Reply-To: <20260416013101.221555-1-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

Replace it with scnprintf, the buffer sizes are expected to be large enough
to hold the result, no need for snprintf+overflow check.

Increase buffer size in mangle_content_len() while at it.

BUG: KASAN: stack-out-of-bounds in vsnprintf+0xea5/0x1270
Write of size 1 at addr [..]
 vsnprintf+0xea5/0x1270
 sprintf+0xb1/0xe0
 mangle_content_len+0x1ac/0x280
 nf_nat_sdp_session+0x1cc/0x240
 process_sdp+0x8f8/0xb80
 process_invite_request+0x108/0x2b0
 process_sip_msg+0x5da/0xf50
 sip_help_tcp+0x45e/0x780
 nf_confirm+0x34d/0x990
 [..]

Fixes: 9fafcd7b2032 ("[NETFILTER]: nf_conntrack/nf_nat: add SIP helper port")
Reported-by: Yiming Qian <yimingqian591@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_nat_amanda.c |  2 +-
 net/netfilter/nf_nat_sip.c    | 33 ++++++++++++++++++---------------
 2 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/net/netfilter/nf_nat_amanda.c b/net/netfilter/nf_nat_amanda.c
index 98deef6cde69..8f1054920a85 100644
--- a/net/netfilter/nf_nat_amanda.c
+++ b/net/netfilter/nf_nat_amanda.c
@@ -50,7 +50,7 @@ static unsigned int help(struct sk_buff *skb,
 		return NF_DROP;
 	}
 
-	sprintf(buffer, "%u", port);
+	snprintf(buffer, sizeof(buffer), "%u", port);
 	if (!nf_nat_mangle_udp_packet(skb, exp->master, ctinfo,
 				      protoff, matchoff, matchlen,
 				      buffer, strlen(buffer))) {
diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c
index cf4aeb299bde..c845b6d1a2bd 100644
--- a/net/netfilter/nf_nat_sip.c
+++ b/net/netfilter/nf_nat_sip.c
@@ -68,25 +68,27 @@ static unsigned int mangle_packet(struct sk_buff *skb, unsigned int protoff,
 }
 
 static int sip_sprintf_addr(const struct nf_conn *ct, char *buffer,
+			    size_t size,
 			    const union nf_inet_addr *addr, bool delim)
 {
 	if (nf_ct_l3num(ct) == NFPROTO_IPV4)
-		return sprintf(buffer, "%pI4", &addr->ip);
+		return scnprintf(buffer, size, "%pI4", &addr->ip);
 	else {
 		if (delim)
-			return sprintf(buffer, "[%pI6c]", &addr->ip6);
+			return scnprintf(buffer, size, "[%pI6c]", &addr->ip6);
 		else
-			return sprintf(buffer, "%pI6c", &addr->ip6);
+			return scnprintf(buffer, size, "%pI6c", &addr->ip6);
 	}
 }
 
 static int sip_sprintf_addr_port(const struct nf_conn *ct, char *buffer,
+				 size_t size,
 				 const union nf_inet_addr *addr, u16 port)
 {
 	if (nf_ct_l3num(ct) == NFPROTO_IPV4)
-		return sprintf(buffer, "%pI4:%u", &addr->ip, port);
+		return scnprintf(buffer, size, "%pI4:%u", &addr->ip, port);
 	else
-		return sprintf(buffer, "[%pI6c]:%u", &addr->ip6, port);
+		return scnprintf(buffer, size, "[%pI6c]:%u", &addr->ip6, port);
 }
 
 static int map_addr(struct sk_buff *skb, unsigned int protoff,
@@ -119,7 +121,7 @@ static int map_addr(struct sk_buff *skb, unsigned int protoff,
 	if (nf_inet_addr_cmp(&newaddr, addr) && newport == port)
 		return 1;
 
-	buflen = sip_sprintf_addr_port(ct, buffer, &newaddr, ntohs(newport));
+	buflen = sip_sprintf_addr_port(ct, buffer, sizeof(buffer), &newaddr, ntohs(newport));
 	return mangle_packet(skb, protoff, dataoff, dptr, datalen,
 			     matchoff, matchlen, buffer, buflen);
 }
@@ -212,7 +214,7 @@ static unsigned int nf_nat_sip(struct sk_buff *skb, unsigned int protoff,
 					       &addr, true) > 0 &&
 		    nf_inet_addr_cmp(&addr, &ct->tuplehash[dir].tuple.src.u3) &&
 		    !nf_inet_addr_cmp(&addr, &ct->tuplehash[!dir].tuple.dst.u3)) {
-			buflen = sip_sprintf_addr(ct, buffer,
+			buflen = sip_sprintf_addr(ct, buffer, sizeof(buffer),
 					&ct->tuplehash[!dir].tuple.dst.u3,
 					true);
 			if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
@@ -229,7 +231,7 @@ static unsigned int nf_nat_sip(struct sk_buff *skb, unsigned int protoff,
 					       &addr, false) > 0 &&
 		    nf_inet_addr_cmp(&addr, &ct->tuplehash[dir].tuple.dst.u3) &&
 		    !nf_inet_addr_cmp(&addr, &ct->tuplehash[!dir].tuple.src.u3)) {
-			buflen = sip_sprintf_addr(ct, buffer,
+			buflen = sip_sprintf_addr(ct, buffer, sizeof(buffer),
 					&ct->tuplehash[!dir].tuple.src.u3,
 					false);
 			if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
@@ -247,7 +249,7 @@ static unsigned int nf_nat_sip(struct sk_buff *skb, unsigned int protoff,
 		    htons(n) == ct->tuplehash[dir].tuple.dst.u.udp.port &&
 		    htons(n) != ct->tuplehash[!dir].tuple.src.u.udp.port) {
 			__be16 p = ct->tuplehash[!dir].tuple.src.u.udp.port;
-			buflen = sprintf(buffer, "%u", ntohs(p));
+			buflen = scnprintf(buffer, sizeof(buffer), "%u", ntohs(p));
 			if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
 					   poff, plen, buffer, buflen)) {
 				nf_ct_helper_log(skb, ct, "cannot mangle rport");
@@ -418,7 +420,8 @@ static unsigned int nf_nat_sip_expect(struct sk_buff *skb, unsigned int protoff,
 
 	if (!nf_inet_addr_cmp(&exp->tuple.dst.u3, &exp->saved_addr) ||
 	    exp->tuple.dst.u.udp.port != exp->saved_proto.udp.port) {
-		buflen = sip_sprintf_addr_port(ct, buffer, &newaddr, port);
+		buflen = sip_sprintf_addr_port(ct, buffer, sizeof(buffer),
+					       &newaddr, port);
 		if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
 				   matchoff, matchlen, buffer, buflen)) {
 			nf_ct_helper_log(skb, ct, "cannot mangle packet");
@@ -438,8 +441,8 @@ static int mangle_content_len(struct sk_buff *skb, unsigned int protoff,
 {
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	char buffer[sizeof("4294967295")];
 	unsigned int matchoff, matchlen;
-	char buffer[sizeof("65536")];
 	int buflen, c_len;
 
 	/* Get actual SDP length */
@@ -454,7 +457,7 @@ static int mangle_content_len(struct sk_buff *skb, unsigned int protoff,
 			      &matchoff, &matchlen) <= 0)
 		return 0;
 
-	buflen = sprintf(buffer, "%u", c_len);
+	buflen = scnprintf(buffer, sizeof(buffer), "%u", c_len);
 	return mangle_packet(skb, protoff, dataoff, dptr, datalen,
 			     matchoff, matchlen, buffer, buflen);
 }
@@ -491,7 +494,7 @@ static unsigned int nf_nat_sdp_addr(struct sk_buff *skb, unsigned int protoff,
 	char buffer[INET6_ADDRSTRLEN];
 	unsigned int buflen;
 
-	buflen = sip_sprintf_addr(ct, buffer, addr, false);
+	buflen = sip_sprintf_addr(ct, buffer, sizeof(buffer), addr, false);
 	if (mangle_sdp_packet(skb, protoff, dataoff, dptr, datalen,
 			      sdpoff, type, term, buffer, buflen))
 		return 0;
@@ -509,7 +512,7 @@ static unsigned int nf_nat_sdp_port(struct sk_buff *skb, unsigned int protoff,
 	char buffer[sizeof("nnnnn")];
 	unsigned int buflen;
 
-	buflen = sprintf(buffer, "%u", port);
+	buflen = scnprintf(buffer, sizeof(buffer), "%u", port);
 	if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
 			   matchoff, matchlen, buffer, buflen))
 		return 0;
@@ -529,7 +532,7 @@ static unsigned int nf_nat_sdp_session(struct sk_buff *skb, unsigned int protoff
 	unsigned int buflen;
 
 	/* Mangle session description owner and contact addresses */
-	buflen = sip_sprintf_addr(ct, buffer, addr, false);
+	buflen = sip_sprintf_addr(ct, buffer, sizeof(buffer), addr, false);
 	if (mangle_sdp_packet(skb, protoff, dataoff, dptr, datalen, sdpoff,
 			      SDP_HDR_OWNER, SDP_HDR_MEDIA, buffer, buflen))
 		return 0;
-- 
2.47.3


^ permalink raw reply related

* [PATCH net 08/14] netfilter: xtables: restrict several matches to inet family
From: Pablo Neira Ayuso @ 2026-04-16  1:30 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
In-Reply-To: <20260416013101.221555-1-pablo@netfilter.org>

This is a partial revert of:

  commit ab4f21e6fb1c ("netfilter: xtables: use NFPROTO_UNSPEC in more extensions")

to allow ipv4 and ipv6 only.

- xt_mac
- xt_owner
- xt_physdev

These extensions are not used by ebtables in userspace.

Moreover, xt_realm is only for ipv4, since dst->tclassid is ipv4
specific.

Fixes: ab4f21e6fb1c ("netfilter: xtables: use NFPROTO_UNSPEC in more extensions")
Reported-by: "Kito Xu (veritas501)" <hxzene@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_mac.c     | 34 +++++++++++++++++++++++-----------
 net/netfilter/xt_owner.c   | 37 +++++++++++++++++++++++++------------
 net/netfilter/xt_physdev.c | 29 +++++++++++++++++++----------
 net/netfilter/xt_realm.c   |  2 +-
 4 files changed, 68 insertions(+), 34 deletions(-)

diff --git a/net/netfilter/xt_mac.c b/net/netfilter/xt_mac.c
index 81649da57ba5..bd2354760895 100644
--- a/net/netfilter/xt_mac.c
+++ b/net/netfilter/xt_mac.c
@@ -38,25 +38,37 @@ static bool mac_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	return ret;
 }
 
-static struct xt_match mac_mt_reg __read_mostly = {
-	.name      = "mac",
-	.revision  = 0,
-	.family    = NFPROTO_UNSPEC,
-	.match     = mac_mt,
-	.matchsize = sizeof(struct xt_mac_info),
-	.hooks     = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) |
-	             (1 << NF_INET_FORWARD),
-	.me        = THIS_MODULE,
+static struct xt_match mac_mt_reg[] __read_mostly = {
+	{
+		.name		= "mac",
+		.family		= NFPROTO_IPV4,
+		.match		= mac_mt,
+		.matchsize	= sizeof(struct xt_mac_info),
+		.hooks		= (1 << NF_INET_PRE_ROUTING) |
+				  (1 << NF_INET_LOCAL_IN) |
+				  (1 << NF_INET_FORWARD),
+		.me		= THIS_MODULE,
+	},
+	{
+		.name		= "mac",
+		.family		= NFPROTO_IPV6,
+		.match		= mac_mt,
+		.matchsize	= sizeof(struct xt_mac_info),
+		.hooks		= (1 << NF_INET_PRE_ROUTING) |
+				  (1 << NF_INET_LOCAL_IN) |
+				  (1 << NF_INET_FORWARD),
+		.me		= THIS_MODULE,
+	},
 };
 
 static int __init mac_mt_init(void)
 {
-	return xt_register_match(&mac_mt_reg);
+	return xt_register_matches(mac_mt_reg, ARRAY_SIZE(mac_mt_reg));
 }
 
 static void __exit mac_mt_exit(void)
 {
-	xt_unregister_match(&mac_mt_reg);
+	xt_unregister_matches(mac_mt_reg, ARRAY_SIZE(mac_mt_reg));
 }
 
 module_init(mac_mt_init);
diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c
index 50332888c8d2..7be2fe22b067 100644
--- a/net/netfilter/xt_owner.c
+++ b/net/netfilter/xt_owner.c
@@ -127,26 +127,39 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	return true;
 }
 
-static struct xt_match owner_mt_reg __read_mostly = {
-	.name       = "owner",
-	.revision   = 1,
-	.family     = NFPROTO_UNSPEC,
-	.checkentry = owner_check,
-	.match      = owner_mt,
-	.matchsize  = sizeof(struct xt_owner_match_info),
-	.hooks      = (1 << NF_INET_LOCAL_OUT) |
-	              (1 << NF_INET_POST_ROUTING),
-	.me         = THIS_MODULE,
+static struct xt_match owner_mt_reg[] __read_mostly = {
+	{
+		.name       = "owner",
+		.revision   = 1,
+		.family     = NFPROTO_IPV4,
+		.checkentry = owner_check,
+		.match      = owner_mt,
+		.matchsize  = sizeof(struct xt_owner_match_info),
+		.hooks      = (1 << NF_INET_LOCAL_OUT) |
+			      (1 << NF_INET_POST_ROUTING),
+		.me         = THIS_MODULE,
+	},
+	{
+		.name       = "owner",
+		.revision   = 1,
+		.family     = NFPROTO_IPV6,
+		.checkentry = owner_check,
+		.match      = owner_mt,
+		.matchsize  = sizeof(struct xt_owner_match_info),
+		.hooks      = (1 << NF_INET_LOCAL_OUT) |
+			      (1 << NF_INET_POST_ROUTING),
+		.me         = THIS_MODULE,
+	}
 };
 
 static int __init owner_mt_init(void)
 {
-	return xt_register_match(&owner_mt_reg);
+	return xt_register_matches(owner_mt_reg, ARRAY_SIZE(owner_mt_reg));
 }
 
 static void __exit owner_mt_exit(void)
 {
-	xt_unregister_match(&owner_mt_reg);
+	xt_unregister_matches(owner_mt_reg, ARRAY_SIZE(owner_mt_reg));
 }
 
 module_init(owner_mt_init);
diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c
index 343e65f377d4..130842c35c6f 100644
--- a/net/netfilter/xt_physdev.c
+++ b/net/netfilter/xt_physdev.c
@@ -115,24 +115,33 @@ static int physdev_mt_check(const struct xt_mtchk_param *par)
 	return 0;
 }
 
-static struct xt_match physdev_mt_reg __read_mostly = {
-	.name       = "physdev",
-	.revision   = 0,
-	.family     = NFPROTO_UNSPEC,
-	.checkentry = physdev_mt_check,
-	.match      = physdev_mt,
-	.matchsize  = sizeof(struct xt_physdev_info),
-	.me         = THIS_MODULE,
+static struct xt_match physdev_mt_reg[] __read_mostly = {
+	{
+		.name		= "physdev",
+		.family		= NFPROTO_IPV4,
+		.checkentry	= physdev_mt_check,
+		.match		= physdev_mt,
+		.matchsize	= sizeof(struct xt_physdev_info),
+		.me		= THIS_MODULE,
+	},
+	{
+		.name		= "physdev",
+		.family		= NFPROTO_IPV6,
+		.checkentry	= physdev_mt_check,
+		.match		= physdev_mt,
+		.matchsize	= sizeof(struct xt_physdev_info),
+		.me		= THIS_MODULE,
+	},
 };
 
 static int __init physdev_mt_init(void)
 {
-	return xt_register_match(&physdev_mt_reg);
+	return xt_register_matches(physdev_mt_reg, ARRAY_SIZE(physdev_mt_reg));
 }
 
 static void __exit physdev_mt_exit(void)
 {
-	xt_unregister_match(&physdev_mt_reg);
+	xt_unregister_matches(physdev_mt_reg, ARRAY_SIZE(physdev_mt_reg));
 }
 
 module_init(physdev_mt_init);
diff --git a/net/netfilter/xt_realm.c b/net/netfilter/xt_realm.c
index 6df485f4403d..61b2f1e58d15 100644
--- a/net/netfilter/xt_realm.c
+++ b/net/netfilter/xt_realm.c
@@ -33,7 +33,7 @@ static struct xt_match realm_mt_reg __read_mostly = {
 	.matchsize	= sizeof(struct xt_realm_info),
 	.hooks		= (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_FORWARD) |
 			  (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_LOCAL_IN),
-	.family		= NFPROTO_UNSPEC,
+	.family		= NFPROTO_IPV4,
 	.me		= THIS_MODULE
 };
 
-- 
2.47.3


^ permalink raw reply related

* [PATCH net 09/14] netfilter: nat: use kfree_rcu to release ops
From: Pablo Neira Ayuso @ 2026-04-16  1:30 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
In-Reply-To: <20260416013101.221555-1-pablo@netfilter.org>

Florian Westphal says:

"Historically this is not an issue, even for normal base hooks: the data
path doesn't use the original nf_hook_ops that are used to register the
callbacks.

However, in v5.14 I added the ability to dump the active netfilter
hooks from userspace.

This code will peek back into the nf_hook_ops that are available
at the tail of the pointer-array blob used by the datapath.

The nat hooks are special, because they are called indirectly from
the central nat dispatcher hook. They are currently invisible to
the nfnl hook dump subsystem though.

But once that changes the nat ops structures have to be deferred too."

Update nf_nat_register_fn() to deal with partial exposition of the hooks
from error path which can be also an issue for nfnetlink_hook.

Fixes: e2cf17d3774c ("netfilter: add new hook nfnl subsystem")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/iptable_nat.c  |  2 +-
 net/ipv6/netfilter/ip6table_nat.c |  2 +-
 net/netfilter/nf_nat_core.c       | 10 ++++++----
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index a5db7c67d61b..3b1de7f82bf8 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -100,7 +100,7 @@ static void ipt_nat_unregister_lookups(struct net *net)
 	for (i = 0; i < ARRAY_SIZE(nf_nat_ipv4_ops); i++)
 		nf_nat_ipv4_unregister_fn(net, &ops[i]);
 
-	kfree(ops);
+	kfree_rcu(ops, rcu);
 }
 
 static int iptable_nat_table_init(struct net *net)
diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c
index e119d4f090cc..9adfbfeaab0c 100644
--- a/net/ipv6/netfilter/ip6table_nat.c
+++ b/net/ipv6/netfilter/ip6table_nat.c
@@ -102,7 +102,7 @@ static void ip6t_nat_unregister_lookups(struct net *net)
 	for (i = 0; i < ARRAY_SIZE(nf_nat_ipv6_ops); i++)
 		nf_nat_ipv6_unregister_fn(net, &ops[i]);
 
-	kfree(ops);
+	kfree_rcu(ops, rcu);
 }
 
 static int ip6table_nat_table_init(struct net *net)
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 3b5434e4ec9c..b30ca94c2bb7 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -1228,9 +1228,11 @@ int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops,
 		ret = nf_register_net_hooks(net, nat_ops, ops_count);
 		if (ret < 0) {
 			mutex_unlock(&nf_nat_proto_mutex);
-			for (i = 0; i < ops_count; i++)
-				kfree(nat_ops[i].priv);
-			kfree(nat_ops);
+			for (i = 0; i < ops_count; i++) {
+				priv = nat_ops[i].priv;
+				kfree_rcu(priv, rcu_head);
+			}
+			kfree_rcu(nat_ops, rcu);
 			return ret;
 		}
 
@@ -1294,7 +1296,7 @@ void nf_nat_unregister_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops,
 		}
 
 		nat_proto_net->nat_hook_ops = NULL;
-		kfree(nat_ops);
+		kfree_rcu(nat_ops, rcu);
 	}
 unlock:
 	mutex_unlock(&nf_nat_proto_mutex);
-- 
2.47.3


^ permalink raw reply related

* [PATCH net 10/14] ipvs: fix MTU check for GSO packets in tunnel mode
From: Pablo Neira Ayuso @ 2026-04-16  1:30 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
In-Reply-To: <20260416013101.221555-1-pablo@netfilter.org>

From: Yingnan Zhang <342144303@qq.com>

Currently, IPVS skips MTU checks for GSO packets by excluding them with
the !skb_is_gso(skb) condition. This creates problems when IPVS tunnel
mode encapsulates GSO packets with IPIP headers.

The issue manifests in two ways:

1. MTU violation after encapsulation:
   When a GSO packet passes through IPVS tunnel mode, the original MTU
   check is bypassed. After adding the IPIP tunnel header, the packet
   size may exceed the outgoing interface MTU, leading to unexpected
   fragmentation at the IP layer.

2. Fragmentation with problematic IP IDs:
   When net.ipv4.vs.pmtu_disc=1 and a GSO packet with multiple segments
   is fragmented after encapsulation, each segment gets a sequentially
   incremented IP ID (0, 1, 2, ...). This happens because:

   a) The GSO packet bypasses MTU check and gets encapsulated
   b) At __ip_finish_output, the oversized GSO packet is split into
      separate SKBs (one per segment), with IP IDs incrementing
   c) Each SKB is then fragmented again based on the actual MTU

   This sequential IP ID allocation differs from the expected behavior
   and can cause issues with fragment reassembly and packet tracking.

Fix this by properly validating GSO packets using
skb_gso_validate_network_len(). This function correctly validates
whether the GSO segments will fit within the MTU after segmentation. If
validation fails, send an ICMP Fragmentation Needed message to enable
proper PMTU discovery.

Fixes: 4cdd34084d53 ("netfilter: nf_conntrack_ipv6: improve fragmentation handling")
Signed-off-by: Yingnan Zhang <342144303@qq.com>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipvs/ip_vs_xmit.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 3601eb86d025..7c570f48ade2 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -102,6 +102,18 @@ __ip_vs_dst_check(struct ip_vs_dest *dest)
 	return dest_dst;
 }
 
+/* Based on ip_exceeds_mtu(). */
+static bool ip_vs_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
+{
+	if (skb->len <= mtu)
+		return false;
+
+	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
+		return false;
+
+	return true;
+}
+
 static inline bool
 __mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu)
 {
@@ -111,10 +123,9 @@ __mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu)
 		 */
 		if (IP6CB(skb)->frag_max_size > mtu)
 			return true; /* largest fragment violate MTU */
-	}
-	else if (skb->len > mtu && !skb_is_gso(skb)) {
+	} else if (ip_vs_exceeds_mtu(skb, mtu))
 		return true; /* Packet size violate MTU size */
-	}
+
 	return false;
 }
 
@@ -232,7 +243,7 @@ static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af,
 			return true;
 
 		if (unlikely(ip_hdr(skb)->frag_off & htons(IP_DF) &&
-			     skb->len > mtu && !skb_is_gso(skb) &&
+			     ip_vs_exceeds_mtu(skb, mtu) &&
 			     !ip_vs_iph_icmp(ipvsh))) {
 			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
 				  htonl(mtu));
-- 
2.47.3


^ permalink raw reply related

* [PATCH net 11/14] netfilter: nf_tables: use list_del_rcu for netlink hooks
From: Pablo Neira Ayuso @ 2026-04-16  1:30 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
In-Reply-To: <20260416013101.221555-1-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

nft_netdev_unregister_hooks and __nft_unregister_flowtable_net_hooks need
to use list_del_rcu(), this list can be walked by concurrent dumpers.

Add a new helper and use it consistently.

Fixes: f9a43007d3f7 ("netfilter: nf_tables: double hook unregistration in netns path")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 44 ++++++++++++++---------------------
 1 file changed, 18 insertions(+), 26 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 8c42247a176c..090d4d688a33 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -374,6 +374,12 @@ static void nft_netdev_hook_free_rcu(struct nft_hook *hook)
 	call_rcu(&hook->rcu, __nft_netdev_hook_free_rcu);
 }
 
+static void nft_netdev_hook_unlink_free_rcu(struct nft_hook *hook)
+{
+	list_del_rcu(&hook->list);
+	nft_netdev_hook_free_rcu(hook);
+}
+
 static void nft_netdev_unregister_hooks(struct net *net,
 					struct list_head *hook_list,
 					bool release_netdev)
@@ -384,10 +390,8 @@ static void nft_netdev_unregister_hooks(struct net *net,
 	list_for_each_entry_safe(hook, next, hook_list, list) {
 		list_for_each_entry(ops, &hook->ops_list, list)
 			nf_unregister_net_hook(net, ops);
-		if (release_netdev) {
-			list_del(&hook->list);
-			nft_netdev_hook_free_rcu(hook);
-		}
+		if (release_netdev)
+			nft_netdev_hook_unlink_free_rcu(hook);
 	}
 }
 
@@ -2323,10 +2327,8 @@ void nf_tables_chain_destroy(struct nft_chain *chain)
 
 		if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) {
 			list_for_each_entry_safe(hook, next,
-						 &basechain->hook_list, list) {
-				list_del_rcu(&hook->list);
-				nft_netdev_hook_free_rcu(hook);
-			}
+						 &basechain->hook_list, list)
+				nft_netdev_hook_unlink_free_rcu(hook);
 		}
 		module_put(basechain->type->owner);
 		if (rcu_access_pointer(basechain->stats)) {
@@ -3026,6 +3028,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
 				list_for_each_entry(ops, &h->ops_list, list)
 					nf_unregister_net_hook(ctx->net, ops);
 			}
+			/* hook.list is on stack, no need for list_del_rcu() */
 			list_del(&h->list);
 			nft_netdev_hook_free_rcu(h);
 		}
@@ -8903,10 +8906,8 @@ static void __nft_unregister_flowtable_net_hooks(struct net *net,
 	list_for_each_entry_safe(hook, next, hook_list, list) {
 		list_for_each_entry(ops, &hook->ops_list, list)
 			nft_unregister_flowtable_ops(net, flowtable, ops);
-		if (release_netdev) {
-			list_del(&hook->list);
-			nft_netdev_hook_free_rcu(hook);
-		}
+		if (release_netdev)
+			nft_netdev_hook_unlink_free_rcu(hook);
 	}
 }
 
@@ -8977,8 +8978,7 @@ static int nft_register_flowtable_net_hooks(struct net *net,
 
 			nft_unregister_flowtable_ops(net, flowtable, ops);
 		}
-		list_del_rcu(&hook->list);
-		nft_netdev_hook_free_rcu(hook);
+		nft_netdev_hook_unlink_free_rcu(hook);
 	}
 
 	return err;
@@ -8988,10 +8988,8 @@ static void nft_hooks_destroy(struct list_head *hook_list)
 {
 	struct nft_hook *hook, *next;
 
-	list_for_each_entry_safe(hook, next, hook_list, list) {
-		list_del_rcu(&hook->list);
-		nft_netdev_hook_free_rcu(hook);
-	}
+	list_for_each_entry_safe(hook, next, hook_list, list)
+		nft_netdev_hook_unlink_free_rcu(hook);
 }
 
 static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh,
@@ -9079,8 +9077,7 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh,
 				nft_unregister_flowtable_ops(ctx->net,
 							     flowtable, ops);
 		}
-		list_del_rcu(&hook->list);
-		nft_netdev_hook_free_rcu(hook);
+		nft_netdev_hook_unlink_free_rcu(hook);
 	}
 
 	return err;
@@ -9586,13 +9583,8 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx,
 
 static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable)
 {
-	struct nft_hook *hook, *next;
-
 	flowtable->data.type->free(&flowtable->data);
-	list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) {
-		list_del_rcu(&hook->list);
-		nft_netdev_hook_free_rcu(hook);
-	}
+	nft_hooks_destroy(&flowtable->hook_list);
 	kfree(flowtable->name);
 	module_put(flowtable->data.type->owner);
 	kfree(flowtable);
-- 
2.47.3


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox