Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH net-next v3 12/12] selftests: net: Add a test for BIG TCP in UDP tunnels
From: Willem de Bruijn @ 2026-04-16 12:06 UTC (permalink / raw)
  To: Alice Mikityanska, Daniel Borkmann, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Xin Long, Willem de Bruijn,
	David Ahern, Nikolay Aleksandrov
  Cc: Shuah Khan, Stanislav Fomichev, Andrew Lunn, Simon Horman,
	Florian Westphal, netdev, Alice Mikityanska
In-Reply-To: <20260410150943.993350-13-alice.kernel@fastmail.im>

Alice Mikityanska wrote:
> From: Alice Mikityanska <alice@isovalent.com>
> 
> The test sets up VXLAN and GENEVE tunnels over IPv4 and IPv6 and runs
> IPv4 and IPv6 traffic through them with BIG TCP enabled. It checks that
> a non-negligible amount of big aggregated packets are seen in tcpdump.
> 
> Signed-off-by: Alice Mikityanska <alice@isovalent.com>
> ---
>  tools/testing/selftests/net/Makefile          |   1 +
>  .../testing/selftests/net/big_tcp_tunnels.sh  | 145 ++++++++++++++++++
>  2 files changed, 146 insertions(+)
>  create mode 100755 tools/testing/selftests/net/big_tcp_tunnels.sh
> 
> diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
> index cab74ebdaced..c8ea9d4bb94f 100644
> --- a/tools/testing/selftests/net/Makefile
> +++ b/tools/testing/selftests/net/Makefile
> @@ -13,6 +13,7 @@ TEST_PROGS := \
>  	arp_ndisc_untracked_subnets.sh \
>  	bareudp.sh \
>  	big_tcp.sh \
> +	big_tcp_tunnels.sh \
>  	bind_bhash.sh \
>  	bpf_offload.py \
>  	bridge_vlan_dump.sh \
> diff --git a/tools/testing/selftests/net/big_tcp_tunnels.sh b/tools/testing/selftests/net/big_tcp_tunnels.sh
> new file mode 100755
> index 000000000000..b819911519ac
> --- /dev/null
> +++ b/tools/testing/selftests/net/big_tcp_tunnels.sh
> @@ -0,0 +1,145 @@
> +#!/usr/bin/env bash
> +# SPDX-License-Identifier: GPL-2.0
> +#
> +# Testing for IPv4 and IPv6 BIG TCP over VXLAN and GENEVE tunnels.
> +
> +SERVER_NS=$(mktemp -u server-XXXXXXXX)
> +SERVER_IP4="192.168.1.1"
> +SERVER_IP6="2001:db8::1:1"
> +SERVER_IP4_TUN="192.168.2.1"
> +SERVER_IP6_TUN="2001:db8::2:1"
> +
> +CLIENT_NS=$(mktemp -u client-XXXXXXXX)
> +CLIENT_IP4="192.168.1.2"
> +CLIENT_IP6="2001:db8::1:2"
> +CLIENT_IP4_TUN="192.168.2.2"
> +CLIENT_IP6_TUN="2001:db8::2:2"
> +
> +PACKETS_THRESHOLD=10000
> +
> +# Kselftest framework requirement - SKIP code is 4.
> +ksft_skip=4
> +
> +setup() {
> +	ip netns add "$SERVER_NS"
> +	ip netns add "$CLIENT_NS"
> +	ip -netns "$SERVER_NS" link add link1 type veth peer name link0 netns "$CLIENT_NS"
> +
> +	ip -netns "$CLIENT_NS" link set link0 up
> +	ip -netns "$CLIENT_NS" addr replace "$CLIENT_IP4/24" dev link0
> +	ip -netns "$CLIENT_NS" addr replace "$CLIENT_IP6/112" dev link0 nodad
> +	ip -netns "$CLIENT_NS" link set link0 \
> +		gso_max_size 196608 gso_ipv4_max_size 196608 \
> +		gro_max_size 196608 gro_ipv4_max_size 196608
> +	ip -netns "$SERVER_NS" link set link1 up
> +	ip -netns "$SERVER_NS" addr replace "$SERVER_IP4/24" dev link1
> +	ip -netns "$SERVER_NS" addr replace "$SERVER_IP6/112" dev link1 nodad
> +	ip -netns "$SERVER_NS" link set link1 \
> +		gso_max_size 196608 gso_ipv4_max_size 196608 \
> +		gro_max_size 196608 gro_ipv4_max_size 196608
> +
> +	ip netns exec "$SERVER_NS" netserver >/dev/null
> +}
> +
> +setup_tunnel() {
> +	if [ "$2" = 4 ]; then
> +		SERVER_IP="$SERVER_IP4"
> +		CLIENT_IP="$CLIENT_IP4"
> +		echo "Setting up ${1^^} over IPv4"
> +	else
> +		SERVER_IP="$SERVER_IP6"
> +		CLIENT_IP="$CLIENT_IP6"
> +		echo "Setting up ${1^^} over IPv6"
> +	fi
> +
> +	if [ "$1" = vxlan ]; then
> +		ip -netns "$CLIENT_NS" link add tun0 type vxlan \
> +			id 5001 remote "$SERVER_IP" local "$CLIENT_IP" dev link0 dstport 4789
> +	else
> +		ip -netns "$CLIENT_NS" link add tun0 type geneve \
> +			id 5001 remote "$SERVER_IP"
> +	fi
> +	ip -netns "$CLIENT_NS" link set tun0 up
> +	ip -netns "$CLIENT_NS" addr replace "$CLIENT_IP4_TUN/24" dev tun0
> +	ip -netns "$CLIENT_NS" addr replace "$CLIENT_IP6_TUN/112" dev tun0 nodad
> +	ip -netns "$CLIENT_NS" link set tun0 \
> +		gso_max_size 196608 gso_ipv4_max_size 196608 \
> +		gro_max_size 196608 gro_ipv4_max_size 196608
> +	if [ "$1" = vxlan ]; then
> +		ip -netns "$SERVER_NS" link add tun1 type vxlan \
> +			id 5001 remote "$CLIENT_IP" local "$SERVER_IP" dev link1 dstport 4789
> +	else
> +		ip -netns "$SERVER_NS" link add tun1 type geneve \
> +			id 5001 remote "$CLIENT_IP"
> +	fi
> +	ip -netns "$SERVER_NS" link set tun1 up
> +	ip -netns "$SERVER_NS" addr replace "$SERVER_IP4_TUN/24" dev tun1
> +	ip -netns "$SERVER_NS" addr replace "$SERVER_IP6_TUN/112" dev tun1 nodad
> +	ip -netns "$SERVER_NS" link set tun1 \
> +		gso_max_size 196608 gso_ipv4_max_size 196608 \
> +		gro_max_size 196608 gro_ipv4_max_size 196608
> +}
> +
> +cleanup_tunnel() {
> +	ip -netns "$CLIENT_NS" link del tun0
> +	ip -netns "$SERVER_NS" link del tun1
> +}
> +
> +cleanup() {
> +	ip netns exec "$SERVER_NS" killall netserver
> +	ip netns del "$SERVER_NS"
> +	ip netns del "$CLIENT_NS"
> +}
> +
> +do_test() {
> +	exec 3< <(ip netns exec "$SERVER_NS" tcpdump -nn -i link1 greater 65536 2> /dev/null)
> +	TCPDUMP_SERVER_PID="$!"
> +	exec 4< <(wc -l <&3)
> +	exec 5< <(ip netns exec "$CLIENT_NS" tcpdump -nn -i link0 greater 65536 2> /dev/null)
> +	TCPDUMP_CLIENT_PID="$!"
> +	exec 6< <(wc -l <&5)
> +
> +	if [ "$1" = 4 ]; then
> +		SERVER_IP="$SERVER_IP4_TUN"
> +		echo "Running IPv4 traffic in the tunnel"
> +	else
> +		SERVER_IP="$SERVER_IP6_TUN"
> +		echo "Running IPv6 traffic in the tunnel"
> +	fi
> +
> +	ip netns exec "$CLIENT_NS" netperf -t TCP_STREAM -l 5 -H "$SERVER_IP" -- \
> +		-r 80000:80000 > /dev/null

Is -r valid for TCP_STREAM?

> +	kill "$TCPDUMP_SERVER_PID" "$TCPDUMP_CLIENT_PID"
> +	wait "$TCPDUMP_SERVER_PID" "$TCPDUMP_CLIENT_PID"
> +	PACKETS_SERVER=$(cat <&4)
> +	PACKETS_CLIENT=$(cat <&6)
> +	exec 3>&- 4>&- 5>&- 6>&-
> +
> +	# One line is empty, each packet is two lines (inner and outer).
> +	echo "Captured BIG TCP GRO packets: $(((PACKETS_SERVER - 1) / 2))"
> +	echo "Captured BIG TCP GSO packets: $(((PACKETS_CLIENT - 1) / 2))"
> +	[ "$PACKETS_SERVER" -gt "$(( PACKETS_THRESHOLD * 2 + 1))" ] || return 1
> +	[ "$PACKETS_CLIENT" -gt "$(( PACKETS_THRESHOLD * 2 + 1))" ] || return 1
> +}
> +
> +if ! netperf -V &> /dev/null; then
> +	echo "SKIP: Could not run test without netperf tool"
> +	exit "$ksft_skip"
> +fi
> +
> +if ! ip link help 2>&1 | grep gso_ipv4_max_size &> /dev/null; then
> +	echo "SKIP: Could not run test without gso/gro_ipv4_max_size supported in ip-link"
> +	exit "$ksft_skip"
> +fi
> +
> +trap cleanup EXIT
> +setup
> +for tunnel in vxlan geneve; do
> +	for tun_family in 4 6; do
> +		for traffic_family in 4 6; do
> +			setup_tunnel "$tunnel" "$tun_family" || exit "$?"
> +			do_test "$traffic_family" || exit "$?"
> +			cleanup_tunnel
> +		done
> +	done
> +done
> -- 
> 2.53.0
> 



^ permalink raw reply

* Re: [PATCH net-next v3 00/12] BIG TCP for UDP tunnels
From: Willem de Bruijn @ 2026-04-16 12:07 UTC (permalink / raw)
  To: Alice Mikityanska, Jakub Kicinski
  Cc: Alice Mikityanska, Daniel Borkmann, David S. Miller, Eric Dumazet,
	Paolo Abeni, Xin Long, Willem de Bruijn, David Ahern,
	Nikolay Aleksandrov, Shuah Khan, Stanislav Fomichev, Andrew Lunn,
	Simon Horman, Florian Westphal, netdev
In-Reply-To: <CAD0BsJU_Q6O6MpOMvD7dv6tPYsB8w8SFc-=2mEGpVJtUJW2u-A@mail.gmail.com>

Alice Mikityanska wrote:
> On Tue, 14 Apr 2026 at 01:55, Jakub Kicinski <kuba@kernel.org> wrote:
> >
> > On Fri, 10 Apr 2026 18:09:31 +0300 Alice Mikityanska wrote:
> > > This series is a follow-up to "BIG TCP without HBH in IPv6", and it adds
> > > support for BIG TCP IPv4/IPv6 workloads in vxlan and geneve. Now that
> > > IPv6 BIG TCP doesn't require stripping the HBH in all various
> > > combinations in tunneled traffic, adding BIG TCP becomes feasible.
> >
> > No longer applies, sorry :(
> 
> That's a pity :(. I see that the only conflict is because udplite
> parts have been removed from net/netfilter/nf_conntrack_proto_udp.c,
> so I just need to drop my change that touches udplite.
> 
> > We'll have to revisit after the merge window.
> 
> OK, I'll resubmit after the merge window. I'd appreciate it if I can
> still collect review comments in the meanwhile.

Sashiko has some points. Predominantly false positives at this point, I
think, but do take a look.

With that aside, for the series

Reviewed-by: Willem de Bruijn <willemb@google.com>



^ permalink raw reply

* Re: [PATCH net-next 5/6] net: stmmac: move PHY handling out of __stmmac_open()/release()
From: Russell King (Oracle) @ 2026-04-16 12:13 UTC (permalink / raw)
  To: Alexander Stein
  Cc: Andrew Lunn, Heiner Kallweit, Alexandre Torgue, Andrew Lunn,
	David S. Miller, Eric Dumazet, Jakub Kicinski, linux-arm-kernel,
	linux-stm32, Maxime Coquelin, netdev, Paolo Abeni
In-Reply-To: <2410317.ElGaqSPkdT@steina-w>

On Thu, Apr 16, 2026 at 02:02:53PM +0200, Alexander Stein wrote:
> Hi Russel,
> 
> Am Donnerstag, 16. April 2026, 12:49:25 CEST schrieb Russell King (Oracle):
> > On Thu, Apr 16, 2026 at 08:20:13AM +0200, Alexander Stein wrote:
> > > Am Mittwoch, 15. April 2026, 14:59:32 CEST schrieb Russell King (Oracle):
> > > > On Wed, Apr 15, 2026 at 08:08:40AM +0200, Alexander Stein wrote:
> > > > > Hi,
> > > > > 
> > > > > Am Dienstag, 23. September 2025, 13:26:19 CEST schrieb Russell King (Oracle):
> > > > > > Move the PHY attachment/detachment from the network driver out of
> > > > > > __stmmac_open() and __stmmac_release() into stmmac_open() and
> > > > > > stmmac_release() where these actions will only happen when the
> > > > > > interface is administratively brought up or down. It does not make
> > > > > > sense to detach and re-attach the PHY during a change of MTU.
> > > > > 
> > > > > Sorry for coming up now. But I recently noticed this commit breaks changing
> > > > > the MTU on i.MX8MP. Once I simply change the MTU I run into some DMA error:
> > > > > $ ip link set dev end1 mtu 1400
> > > > > imx-dwmac 30bf0000.ethernet end1: Register MEM_TYPE_PAGE_POOL RxQ-0
> > > > > imx-dwmac 30bf0000.ethernet end1: Register MEM_TYPE_PAGE_POOL RxQ-1
> > > > > imx-dwmac 30bf0000.ethernet end1: Register MEM_TYPE_PAGE_POOL RxQ-2
> > > > > imx-dwmac 30bf0000.ethernet end1: Register MEM_TYPE_PAGE_POOL RxQ-3
> > > > > imx-dwmac 30bf0000.ethernet end1: Register MEM_TYPE_PAGE_POOL RxQ-4
> > > > > imx-dwmac 30bf0000.ethernet end1: Link is Down
> > > > > imx-dwmac 30bf0000.ethernet end1: Failed to reset the dma
> > > > > imx-dwmac 30bf0000.ethernet end1: stmmac_hw_setup: DMA engine initialization failed
> > > > 
> > > > This basically means that a clock is missing. Please provide more
> > > > information:
> > > > 
> > > > - what kernel version are you using?
> > > 
> > > Currently I am using v6.18.22.
> > > $ ethtool -i end1
> > > driver: st_gmac
> > > version: 6.18.22
> > > firmware-version: 
> > > expansion-rom-version: 
> > > bus-info: 30bf0000.ethernet
> > > supports-statistics: yes
> > > supports-test: no
> > > supports-eeprom-access: no
> > > supports-register-dump: yes
> > > supports-priv-flags: no
> > > 
> > > > - has EEE been negotiated?
> > > 
> > > No. It is marked as not supported
> > > 
> > > $ ethtool --show-eee end1
> > > EEE settings for end1:
> > >         EEE status: not supported
> > > 
> > > > - does the problem persist when EEE is disabled?
> > > 
> > > As EEE is not supported the problem occurs even with EEE disabled.
> > > 
> > > > - which PHY is attached to stmmac?
> > > 
> > > It is a TI DP83867.
> > > 
> > > imx-dwmac 30bf0000.ethernet eth1: PHY [stmmac-1:03] driver [TI DP83867] (irq=136)
> > > 
> > > > - which PHY interface mode is being used to connect the PHY to stmmac?
> > > 
> > > For this interface
> > > > phy-mode = "rgmii-id";
> > > is set.
> > > 
> > > In case it is helpful. My platform is arch/arm64/boot/dts/freescale/imx8mp-tqma8mpql-mba8mpxl.dts
> > > Thanks for assisting. If there a further questions, don't hesitate to ask.
> > 
> > Thanks.
> > 
> > So, as best I can determine at the moment, we end up with the following
> > sequence:
> > 
> > stmmac_change_mtu()
> >  __stmmac_release()
> >   phylink_stop()
> >    phy_stop()
> >     phy->state = PHY_HALTED
> >     _phy_state_machine() returns PHY_STATE_WORK_SUSPEND
> >     _phy_state_machine_post_work()
> >      phy_suspend()
> >       genphy_suspend()
> >        phy_set_bits(phydev, MII_BMCR, BMCR_PDOWN)
> > 
> > With the DP83867, this causes most of the PHY to be powered down, thus
> > stopping the clocks, and this causes the stmmac reset to time out.
> > 
> > Prior to this commit, we would have called phylink_disconnect_phy()
> > immediately after phylink_stop(), but I can see nothing that would
> > be affected by this change there (since that also calls
> > phy_suspend(), but as the PHY is already suspended, this becomes a
> > no-op.)
> > 
> > However, __stmmac_open() would have called stmmac_init_phy(), which
> > would reattach the PHY. This would have called phy_init_hw(), 
> > resetting the PHY, and phy_resume() which would ensure that the
> > PDOWN bit is clear - thus clocks would be running.
> > 
> > As a hack, please can you try calling phylink_prepare_resume()
> > between the __stmmac_release() and __stmmac_open() in
> > stmmac_change_mtu(). This should resume the PHY, thus restoring the
> > clocks necessary for stmmac to reset.
> 
> I tried the following patch. This works as you suspected.

Brilliant, thanks for proving the theory why it broke.

I'll have a think about the best way to solve this, because
phylink_prepare_resume() is supposed to be paired with phylink_resume()
and that isn't the case here.

Please bear with me as my availability for looking at the kernel is
very unpredictable at present (family health issues.)

-- 
RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
FTTP is here! 80Mbps down 10Mbps up. Decent connectivity at last!

^ permalink raw reply

* Re: [PATCH 55/61] interconnect: Prefer IS_ERR_OR_NULL over manual NULL check
From: Krzysztof Kozlowski @ 2026-04-16 12:24 UTC (permalink / raw)
  To: Philipp Hahn, amd-gfx, apparmor, bpf, ceph-devel, cocci, dm-devel,
	dri-devel, gfs2, intel-gfx, intel-wired-lan, iommu, kvm,
	linux-arm-kernel, linux-block, linux-bluetooth, linux-btrfs,
	linux-cifs, linux-clk, linux-erofs, linux-ext4, linux-fsdevel,
	linux-gpio, linux-hyperv, linux-input, linux-kernel, linux-leds,
	linux-media, linux-mips, linux-mm, linux-modules, linux-mtd,
	linux-nfs, linux-omap, linux-phy, linux-pm, linux-rockchip,
	linux-s390, linux-scsi, linux-sctp, linux-security-module,
	linux-sh, linux-sound, linux-stm32, linux-trace-kernel, linux-usb,
	linux-wireless, netdev, ntfs3, samba-technical, sched-ext,
	target-devel, tipc-discussion, v9fs
  Cc: Georgi Djakov
In-Reply-To: <20260310-b4-is_err_or_null-v1-55-bd63b656022d@avm.de>

On 10/03/2026 12:49, Philipp Hahn wrote:
> Prefer using IS_ERR_OR_NULL() over using IS_ERR() and a manual NULL
> check.
> 
> Semantic change: Previously the code only printed the warning on error,
> but not when the pointer was NULL. Now the warning is printed in both
> cases!

NAK, read the code

> 
> Change found with coccinelle.
> 
> To: Georgi Djakov <djakov@kernel.org>
> Cc: linux-pm@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org
> Signed-off-by: Philipp Hahn <phahn-oss@avm.de>
> ---
>  drivers/interconnect/core.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/interconnect/core.c b/drivers/interconnect/core.c
> index 8569b78a18517b33abeafac091978b25cbc1acc7..22e92b30f73853d5bd2e05b4f52cb5aa22556468 100644
> --- a/drivers/interconnect/core.c
> +++ b/drivers/interconnect/core.c
> @@ -790,7 +790,7 @@ void icc_put(struct icc_path *path)
>  	size_t i;
>  	int ret;
>  
> -	if (!path || WARN_ON(IS_ERR(path)))
> +	if (WARN_ON(IS_ERR_OR_NULL(path)))

IS_ERR_OR_NULL is simply discouraged, but code preference aside, you
just added a bug here. This is clearly not equivalent, and you emit a
warning in a perfectly valid case!

Best regards,
Krzysztof

^ permalink raw reply

* Re: [PATCH v3 1/3] net: dsa: microchip: implement KSZ87xx Module 3 low-loss cable errata
From: Andrew Lunn @ 2026-04-16 12:25 UTC (permalink / raw)
  To: Fidelio LAWSON
  Cc: Marek Vasut, Woojung Huh, UNGLinuxDriver, Vladimir Oltean,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Maxime Chevallier, Simon Horman, Heiner Kallweit, Russell King,
	netdev, linux-kernel, Fidelio Lawson
In-Reply-To: <84e24758-2f59-44ca-a9b8-a46094578f83@gmail.com>

> Yes, I think a reasonable compromise could be to expose three tunables:
> 
> - a boolean "short-cable" tunable, which applies the known good settings
>   (LPF 62 MHz BW, DSP EQ initial value 0).
> 
> - an integer LPF bandwidth tunable, for advanced use cases where further
>   tuning is needed;
> 
> - an integer DSP EQ initial value tunable, for the same advanced cases.
> 
> The boolean tunable would follow the KISS principle and cover the common
> scenario, while the more granular controls would remain optional.

How do the three interact? Do you need to first enable short-cable
before you set LPF bandwidth or DSP EQ? If it is not enabled, do you
get -EINVAL?

It seems like having extack would be useful to return informative
error messages to user space, however, that requires netlink
ethtool. And ETHTOOL_PHY_STUNABLE has not been added to netlink
ethtool yet :-(

	Andrew

^ permalink raw reply

* [PATCH bpf] bpf: Fix precedence bug in convert_bpf_ld_abs alignment check
From: Daniel Borkmann @ 2026-04-16 12:27 UTC (permalink / raw)
  To: bpf; +Cc: netdev, edumazet, willemdebruijn.kernel

Fix an operator precedence issue in convert_bpf_ld_abs() where the
expression offset + ip_align % size evaluates as offset + (ip_align % size)
due to % having higher precedence than +. That latter evaluation does
not make any sense. The intended check is (offset + ip_align) % size == 0
to verify that the packet load offset is properly aligned for direct
access.

With NET_IP_ALIGN == 2, the bug causes the inline fast-path for direct
packet loads to almost never be taken on !CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
platforms. This forces nearly all cBPF BPF_LD_ABS packet loads through
the bpf_skb_load_helper slow path on the affected archs.

Fixes: e0cea7ce988c ("bpf: implement ld_abs/ld_ind in native bpf")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/filter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index a10cdcb7103e..35f5ee7af64e 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -508,7 +508,7 @@ static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp)
 	    ((unaligned_ok && offset >= 0) ||
 	     (!unaligned_ok && offset >= 0 &&
 	      offset + ip_align >= 0 &&
-	      offset + ip_align % size == 0))) {
+	      (offset + ip_align) % size == 0))) {
 		bool ldx_off_ok = offset <= S16_MAX;
 
 		*insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H);
-- 
2.43.0


^ permalink raw reply related

* Re: [RFC PATCH 1/2] kernel/notifier: replace single-linked list with double-linked list for reverse traversal
From: David Laight @ 2026-04-16 12:30 UTC (permalink / raw)
  To: chensong_2000
  Cc: rafael, lenb, mturquette, sboyd, viresh.kumar, agk, snitzer,
	mpatocka, bmarzins, song, yukuai, linan122, jason.wessel, danielt,
	dianders, horms, davem, edumazet, kuba, pabeni, paulmck, frederic,
	mcgrof, petr.pavlu, da.gomez, samitolvanen, atomlin, jpoimboe,
	jikos, mbenes, pmladek, joe.lawrence, rostedt, mhiramat,
	mark.rutland, mathieu.desnoyers, linux-modules, linux-kernel,
	linux-trace-kernel, linux-acpi, linux-clk, linux-pm,
	live-patching, dm-devel, linux-raid, kgdb-bugreport, netdev
In-Reply-To: <20260415070137.17860-1-chensong_2000@189.cn>

On Wed, 15 Apr 2026 15:01:37 +0800
chensong_2000@189.cn wrote:

> From: Song Chen <chensong_2000@189.cn>
> 
> The current notifier chain implementation uses a single-linked list
> (struct notifier_block *next), which only supports forward traversal
> in priority order. This makes it difficult to handle cleanup/teardown
> scenarios that require notifiers to be called in reverse priority order.

If it is only cleanup/teardown then the list can be order-reversed
as part of that process at the same time as the list is deleted.

	David



^ permalink raw reply

* Re: [PATCH 01/61] Coccinelle: Prefer IS_ERR_OR_NULL over manual NULL check
From: Krzysztof Kozlowski @ 2026-04-16 12:30 UTC (permalink / raw)
  To: Philipp Hahn, amd-gfx, apparmor, bpf, ceph-devel, cocci, dm-devel,
	dri-devel, gfs2, intel-gfx, intel-wired-lan, iommu, kvm,
	linux-arm-kernel, linux-block, linux-bluetooth, linux-btrfs,
	linux-cifs, linux-clk, linux-erofs, linux-ext4, linux-fsdevel,
	linux-gpio, linux-hyperv, linux-input, linux-kernel, linux-leds,
	linux-media, linux-mips, linux-mm, linux-modules, linux-mtd,
	linux-nfs, linux-omap, linux-phy, linux-pm, linux-rockchip,
	linux-s390, linux-scsi, linux-sctp, linux-security-module,
	linux-sh, linux-sound, linux-stm32, linux-trace-kernel, linux-usb,
	linux-wireless, netdev, ntfs3, samba-technical, sched-ext,
	target-devel, tipc-discussion, v9fs
  Cc: Julia Lawall, Nicolas Palix
In-Reply-To: <20260310-b4-is_err_or_null-v1-1-bd63b656022d@avm.de>

On 10/03/2026 12:48, Philipp Hahn wrote:
> Find and convert uses of IS_ERR() plus NULL check to IS_ERR_OR_NULL().
> 
> There are several cases where `!ptr && WARN_ON[_ONCE](IS_ERR(ptr))` is
> used:
> - arch/x86/kernel/callthunks.c:215 WARN_ON_ONCE
> - drivers/clk/clk.c:4561 WARN_ON_ONCE
> - drivers/interconnect/core.c:793 WARN_ON
> - drivers/reset/core.c:718 WARN_ON
> The change is not 100% semantical equivalent as the warning will now
> also happen when the pointer is NULL.
> 
> To: Julia Lawall <Julia.Lawall@inria.fr>
> To: Nicolas Palix <nicolas.palix@imag.fr>
> Cc: cocci@inria.fr
> Cc: linux-kernel@vger.kernel.org
> 
> ---
> drivers/clocksource/mips-gic-timer.c:283 looks suspicious: ret != clk,
> but Daniel Lezcano verified it as correct.
> 
> There are some cases where the checks are part of a larger expression:
> - mm/kmemleak.c:1095
> - mm/kmemleak.c:1155
> - mm/kmemleak.c:1173
> - mm/kmemleak.c:1290
> - mm/kmemleak.c:1328
> - mm/kmemleak.c:1241
> - mm/kmemleak.c:1310
> - mm/kmemleak.c:1258
> - net/netlink/af_netlink.c:2670
> Thanks to Julia Lawall for the help to also handle them.
> 
> Signed-off-by: Philipp Hahn <phahn-oss@avm.de>
> ---
>  scripts/coccinelle/api/is_err_or_null.cocci | 125 ++++++++++++++++++++++++++++
>  1 file changed, 125 insertions(+)
> 

Neither this, nor the attempt from 2011, nor any future attempt should be
accepted, because it creates the impression that IS_ERR_OR_NULL is somehow
okay. No, it is not okay; it is a discouraged pattern leading to less
readable and maintainable code. We should therefore not have any tools
suggesting usage of IS_ERR_OR_NULL, because people will be converting poor
code into that, instead of fixing that poor code.

Best regards,
Krzysztof

^ permalink raw reply

* Re: [PATCH bpf-next v4 5/6] bpf: clear decap tunnel GSO state in skb_adjust_room
From: Willem de Bruijn @ 2026-04-16 12:32 UTC (permalink / raw)
  To: Nick Hudson, bpf, netdev, Willem de Bruijn, Martin KaFai Lau
  Cc: Nick Hudson, Max Tottenham, Anna Glasgall, Daniel Borkmann,
	Alexei Starovoitov, Andrii Nakryiko, Eduard Zingerman,
	Kumar Kartikeya Dwivedi, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, linux-kernel
In-Reply-To: <20260416075514.927101-6-nhudson@akamai.com>

Nick Hudson wrote:
> On shrink in bpf_skb_adjust_room(), clear tunnel-specific GSO flags
> according to the decapsulation flags:
> 
> - BPF_F_ADJ_ROOM_DECAP_L4_UDP clears SKB_GSO_UDP_TUNNEL{,_CSUM}
> - BPF_F_ADJ_ROOM_DECAP_L4_GRE clears SKB_GSO_GRE{,_CSUM}
> - BPF_F_ADJ_ROOM_DECAP_IPXIP4 clears SKB_GSO_IPXIP4
> - BPF_F_ADJ_ROOM_DECAP_IPXIP6 clears SKB_GSO_IPXIP6
> 
> When all tunnel-related GSO bits are cleared, also clear
> skb->encapsulation.
> 
> Handle the ESP inside a UDP tunnel case where encapsulation should remain
> set.
> 
> If UDP decap is performed, clear encap_hdr_csum and remcsum_offload.
> 
> Co-developed-by: Max Tottenham <mtottenh@akamai.com>
> Signed-off-by: Max Tottenham <mtottenh@akamai.com>
> Co-developed-by: Anna Glasgall <aglasgal@akamai.com>
> Signed-off-by: Anna Glasgall <aglasgal@akamai.com>
> Signed-off-by: Nick Hudson <nhudson@akamai.com>
> ---
>  net/core/filter.c | 38 ++++++++++++++++++++++++++++++++++++++
>  1 file changed, 38 insertions(+)
> 
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 7f8d43420afb..e113ae2f3f14 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -3667,6 +3667,44 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff,
>  		if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
>  			skb_increase_gso_size(shinfo, len_diff);
>  
> +		/* Selective GSO flag clearing based on decap type.
> +		 * Only clear the flags for the tunnel layer being removed.
> +		 */
> +		if ((flags & BPF_F_ADJ_ROOM_DECAP_L4_UDP) &&
> +		    (shinfo->gso_type & (SKB_GSO_UDP_TUNNEL |
> +					 SKB_GSO_UDP_TUNNEL_CSUM)))
> +			shinfo->gso_type &= ~(SKB_GSO_UDP_TUNNEL |
> +					      SKB_GSO_UDP_TUNNEL_CSUM);
> +		if ((flags & BPF_F_ADJ_ROOM_DECAP_L4_GRE) &&
> +		    (shinfo->gso_type & (SKB_GSO_GRE | SKB_GSO_GRE_CSUM)))
> +			shinfo->gso_type &= ~(SKB_GSO_GRE |
> +					      SKB_GSO_GRE_CSUM);
> +		if ((flags & BPF_F_ADJ_ROOM_DECAP_IPXIP4) &&
> +		    (shinfo->gso_type & SKB_GSO_IPXIP4))
> +			shinfo->gso_type &= ~SKB_GSO_IPXIP4;
> +		if ((flags & BPF_F_ADJ_ROOM_DECAP_IPXIP6) &&
> +		    (shinfo->gso_type & SKB_GSO_IPXIP6))
> +			shinfo->gso_type &= ~SKB_GSO_IPXIP6;
> +
> +		/* Clear encapsulation flag only when no tunnel GSO flags remain */
> +		if (flags & (BPF_F_ADJ_ROOM_DECAP_L4_MASK |
> +			     BPF_F_ADJ_ROOM_DECAP_IPXIP_MASK)) {
> +			if (!(shinfo->gso_type & (SKB_GSO_UDP_TUNNEL |
> +						  SKB_GSO_UDP_TUNNEL_CSUM |
> +						  SKB_GSO_GRE |
> +						  SKB_GSO_GRE_CSUM |
> +						  SKB_GSO_IPXIP4 |
> +						  SKB_GSO_IPXIP6 |
> +						  SKB_GSO_ESP)))
> +				if (skb->encapsulation)
> +					skb->encapsulation = 0;
> +
> +			if (flags & BPF_F_ADJ_ROOM_DECAP_L4_UDP) {
> +				skb->encap_hdr_csum = 0;

This field is not used with UDP_L4.

Similar to remcsum, I'd ignore it entirely in this series.

> +				skb->remcsum_offload = 0;

Why still include remote checksum handling?

> +			}
> +		}
> +
>  		/* Header must be checked, and gso_segs recomputed. */
>  		shinfo->gso_type |= SKB_GSO_DODGY;
>  		shinfo->gso_segs = 0;
> -- 
> 2.34.1
> 



^ permalink raw reply

* Re: [PATCH bpf-next v4 6/6] selftests/bpf: tc_tunnel validate decap GSO state
From: Willem de Bruijn @ 2026-04-16 12:33 UTC (permalink / raw)
  To: Nick Hudson, bpf, netdev, Willem de Bruijn, Martin KaFai Lau
  Cc: Nick Hudson, Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Eduard Zingerman, Kumar Kartikeya Dwivedi, Shuah Khan,
	linux-kselftest, linux-kernel
In-Reply-To: <20260416075514.927101-7-nhudson@akamai.com>

Nick Hudson wrote:
> Require BPF_F_ADJ_ROOM_DECAP_L4_UDP and BPF_F_ADJ_ROOM_DECAP_L4_GRE enum
> values at runtime using CO-RE enum existence checks so missing kernel
> support fails fast instead of silently proceeding.
> 
> After bpf_skb_adjust_room() decapsulation, inspect skb_shared_info and
> sk_buff state for GSO packets and assert that the expected tunnel GSO
> bits are cleared and encapsulation matches the remaining tunnel state.
> 
> Signed-off-by: Nick Hudson <nhudson@akamai.com>
> ---
>  .../selftests/bpf/progs/test_tc_tunnel.c      | 58 +++++++++++++++++++
>  1 file changed, 58 insertions(+)
> 
> diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
> index 7376df405a6b..74dfb694a210 100644
> --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
> +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
> @@ -6,6 +6,7 @@
>  
>  #include <bpf/bpf_helpers.h>
>  #include <bpf/bpf_endian.h>
> +#include <bpf/bpf_core_read.h>
>  #include "bpf_tracing_net.h"
>  #include "bpf_compiler.h"
>  
> @@ -37,6 +38,23 @@ struct vxlanhdr___local {
>  
>  #define	EXTPROTO_VXLAN	0x1
>  
> +#define SKB_GSO_UDP_TUNNEL_MASK	(SKB_GSO_UDP_TUNNEL |			\
> +				 SKB_GSO_UDP_TUNNEL_CSUM |		\
> +				 SKB_GSO_TUNNEL_REMCSUM)

Leftover remcsum reference?

> +
> +#define SKB_GSO_TUNNEL_MASK		(SKB_GSO_UDP_TUNNEL_MASK |		\

Odd indentation?

> +				 SKB_GSO_GRE |				\
> +				 SKB_GSO_GRE_CSUM |			\
> +				 SKB_GSO_IPXIP4 |			\
> +				 SKB_GSO_IPXIP6 |			\
> +				 SKB_GSO_ESP)
> +
> +#define BPF_F_ADJ_ROOM_DECAP_L4_MASK	(BPF_F_ADJ_ROOM_DECAP_L4_UDP |	\
> +				 BPF_F_ADJ_ROOM_DECAP_L4_GRE)
> +
> +#define BPF_F_ADJ_ROOM_DECAP_IPXIP_MASK	(BPF_F_ADJ_ROOM_DECAP_IPXIP4 |	\
> +					 BPF_F_ADJ_ROOM_DECAP_IPXIP6)
> +
>  #define	VXLAN_FLAGS     bpf_htonl(1<<27)
>  #define	VNI_ID		1
>  #define	VXLAN_VNI	bpf_htonl(VNI_ID << 8)
> @@ -592,6 +610,8 @@ int __encap_ip6vxlan_eth(struct __sk_buff *skb)
>  static int decap_internal(struct __sk_buff *skb, int off, int len, char proto)
>  {
>  	__u64 flags = BPF_F_ADJ_ROOM_FIXED_GSO;
> +	struct sk_buff *kskb;
> +	struct skb_shared_info *shinfo;
>  	struct ipv6_opt_hdr ip6_opt_hdr;
>  	struct gre_hdr greh;
>  	struct udphdr udph;
> @@ -621,6 +641,11 @@ static int decap_internal(struct __sk_buff *skb, int off, int len, char proto)
>  		break;
>  	case IPPROTO_GRE:
>  		olen += sizeof(struct gre_hdr);
> +		if (!bpf_core_enum_value_exists(enum bpf_adj_room_flags,
> +						BPF_F_ADJ_ROOM_DECAP_L4_GRE))
> +			return TC_ACT_SHOT;
> +		flags |= BPF_F_ADJ_ROOM_DECAP_L4_GRE;
> +
>  		if (bpf_skb_load_bytes(skb, off + len, &greh, sizeof(greh)) < 0)
>  			return TC_ACT_OK;
>  		switch (bpf_ntohs(greh.protocol)) {
> @@ -634,6 +659,10 @@ static int decap_internal(struct __sk_buff *skb, int off, int len, char proto)
>  		break;
>  	case IPPROTO_UDP:
>  		olen += sizeof(struct udphdr);
> +		if (!bpf_core_enum_value_exists(enum bpf_adj_room_flags,
> +						BPF_F_ADJ_ROOM_DECAP_L4_UDP))
> +			return TC_ACT_SHOT;
> +		flags |= BPF_F_ADJ_ROOM_DECAP_L4_UDP;
>  		if (bpf_skb_load_bytes(skb, off + len, &udph, sizeof(udph)) < 0)
>  			return TC_ACT_OK;
>  		switch (bpf_ntohs(udph.dest)) {
> @@ -655,6 +684,35 @@ static int decap_internal(struct __sk_buff *skb, int off, int len, char proto)
>  	if (bpf_skb_adjust_room(skb, -olen, BPF_ADJ_ROOM_MAC, flags))
>  		return TC_ACT_SHOT;
>  
> +	kskb = bpf_cast_to_kern_ctx(skb);
> +	shinfo = bpf_core_cast(kskb->head + kskb->end, struct skb_shared_info);
> +	if (!shinfo->gso_size)
> +		return TC_ACT_OK;
> +
> +	if ((flags & BPF_F_ADJ_ROOM_DECAP_L4_UDP) &&
> +	    (shinfo->gso_type & SKB_GSO_UDP_TUNNEL_MASK))
> +		return TC_ACT_SHOT;
> +
> +	if ((flags & BPF_F_ADJ_ROOM_DECAP_L4_GRE) &&
> +	    (shinfo->gso_type & (SKB_GSO_GRE | SKB_GSO_GRE_CSUM)))
> +		return TC_ACT_SHOT;
> +
> +	if ((flags & BPF_F_ADJ_ROOM_DECAP_IPXIP4) &&
> +	    (shinfo->gso_type & SKB_GSO_IPXIP4))
> +		return TC_ACT_SHOT;
> +
> +	if ((flags & BPF_F_ADJ_ROOM_DECAP_IPXIP6) &&
> +	    (shinfo->gso_type & SKB_GSO_IPXIP6))
> +		return TC_ACT_SHOT;
> +
> +	if (flags & (BPF_F_ADJ_ROOM_DECAP_L4_MASK |
> +		     BPF_F_ADJ_ROOM_DECAP_IPXIP_MASK)) {
> +		if ((shinfo->gso_type & SKB_GSO_TUNNEL_MASK) && !kskb->encapsulation)
> +			return TC_ACT_SHOT;
> +		if (!(shinfo->gso_type & SKB_GSO_TUNNEL_MASK) && kskb->encapsulation)
> +			return TC_ACT_SHOT;
> +	}
> +
>  	return TC_ACT_OK;
>  }
>  
> -- 
> 2.34.1
> 



^ permalink raw reply

* Re: [PATCH net 00/14] Netfilter/IPVS fixes for net
From: Fernando Fernandez Mancera @ 2026-04-16 12:49 UTC (permalink / raw)
  To: Florian Westphal, Pablo Neira Ayuso
  Cc: netfilter-devel, davem, netdev, kuba, pabeni, edumazet, horms
In-Reply-To: <aeC8hyj6IFW7UvUG@strlen.de>

On 4/16/26 12:40 PM, Florian Westphal wrote:
> Pablo Neira Ayuso <pablo@netfilter.org> wrote:
>> I cannot send a batch before 16h my local time, I need a bit more
>> time.
>>
>> Sorry.
> 
> No problem.  Alternative is to drop patches, this is what I did in the
> past.  Some LLM comment indicates problem, remove patch from v2
> and defer to next week.
> 
> But that was before LLM reviews flagged 50% of patches.
> I'll pick up on anything left behind for next weeks batch(es).
> 

Hi,

I would like to propose adding the netfilter-devel mailing list to
sashiko.dev and also to Netdev CI. I think Jakub mentioned it was
possible on a previous occasion.

I think it isn't sustainable to review and address the AI/LLM comments
when sending the pull request for net/net-next.

If you agree, I could help move this forward.

Thanks,
Fernando.


^ permalink raw reply

* Re: [PATCH] mISDN: socket: drop device references acquired by get_mdevice()
From: Simon Horman @ 2026-04-16 12:50 UTC (permalink / raw)
  To: shuvampandey1; +Cc: 'Simon Horman', netdev, linux-kernel
In-Reply-To: <20260414071322.30851-1-shuvampandey1@gmail.com>

From: 'Simon Horman' <horms@kernel.org>

This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.
Full review at: https://sashiko.dev
---
mISDN: socket: drop device references acquired by get_mdevice()

This patch attempts to fix a memory leak where device references acquired
by get_mdevice() are not dropped. It adds put_device() calls in the ioctl
completion paths and in the socket release paths.

> diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c
> index 77b900db1cac2..9209ee68f9cda 100644
> --- a/drivers/isdn/mISDN/socket.c
> +++ b/drivers/isdn/mISDN/socket.c
> @@ -266,6 +266,11 @@ data_sock_release(struct socket *sock)
>  
>  	lock_sock(sk);
>  
> +	if (_pms(sk)->dev) {
> +		put_device(&_pms(sk)->dev->dev);
> +		_pms(sk)->dev = NULL;
> +	}
> +

Could this introduce a use-after-free regression if the underlying hardware
is removed before the socket is closed?

It appears the mISDN subsystem handles kobject lifetimes unconventionally.
Hardware drivers manually call kfree() on their private structures immediately
after calling mISDN_unregister_device(), without waiting for the kref to
reach zero.

If a socket holds a reference to a device when the module is unloaded, the
device memory is synchronously freed. When the socket is later closed, this
put_device() would attempt to decrement a kref inside an already freed driver
structure.

This isn't a regression, but is there a use-after-free in the stack teardown
when the hardware is removed?

When a socket is bound via data_sock_bind(), connect_layer1() stores a pointer
to the device's stack on the channel (ch->st = dev->D.st). If the hardware is
removed, mISDN_unregister_device() calls delete_stack(), which unconditionally
frees the stack pointer.

When the user eventually closes the socket, data_sock_release() executes:

    if (sk->sk_state == MISDN_BOUND)
        delete_channel(&_pms(sk)->ch);

And delete_channel() dereferences the dangling stack pointer:

    write_lock_bh(&ch->st->l1sock.lock);

This also isn't a regression, but can data_sock_release() allow an
unprivileged user to spam the kernel log?

If an unprivileged user creates a SOCK_DGRAM socket with protocol
ISDN_P_LAPD_TE and closes it without binding, data_sock_release() evaluates
the closure:

    case ISDN_P_LAPD_TE:
        delete_channel(&_pms(sk)->ch);

Since it was never bound, the channel's stack pointer remains NULL.
delete_channel() handles this by executing:

    if (!ch->st) {
        printk(KERN_WARNING "%s: no stack\n", __func__);
        return;
    }

Since there is no ratelimiting on this printk, could a user continuously create
and close these sockets to flood dmesg?

>  	sock_orphan(sk);
>  	skb_queue_purge(&sk->sk_receive_queue);
>  
> @@ -387,6 +392,7 @@ data_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
>  			strscpy(di.name, dev_name(&dev->dev), sizeof(di.name));

This isn't a regression, but might this read from freed memory if a device
rename happens concurrently?

In IMGETDEVINFO, the device name is read via dev_name() without holding locks.
Concurrently, a privileged user can issue the IMSETDEVNAME ioctl, which
executes device_rename().

Since device_rename() updates the device's kobject name, allocating a new
string and calling kfree() on the old name string, it seems strscpy() could
read from the freed name string if executed at the same time.

>  			if (copy_to_user((void __user *)arg, &di, sizeof(di)))
>  				err = -EFAULT;
> +			put_device(&dev->dev);
>  		} else
>  			err = -ENODEV;
>  		break;

[ ... ]

^ permalink raw reply

* [PATCH net] net/sched: act_ct: fix skb leak on fragment check failure
From: Dudu Lu @ 2026-04-16 13:01 UTC (permalink / raw)
  To: netdev; +Cc: jhs, jiri, horms, Dudu Lu

When tcf_ct_handle_fragments() returns an error other than -EINPROGRESS
(e.g. -EINVAL from malformed fragments), tcf_ct_act() jumps to out_frag
which unconditionally returns TC_ACT_CONSUMED. This tells the caller the
skb was consumed, but it was not freed, leaking one skb per malformed
fragment.

TC_ACT_CONSUMED is only correct for -EINPROGRESS, where defragmentation
is genuinely in progress and the skb has been queued. For all other
errors the skb is still owned by the caller and must be freed via
TC_ACT_SHOT.

Fixes: 3f14b377d01d ("net/sched: act_ct: fix skb leak and crash on ooo frags")
Signed-off-by: Dudu Lu <phx0fer@gmail.com>
---
 net/sched/act_ct.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 7d5e50c921a0..870655f682bd 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -1107,8 +1107,10 @@ TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
 	return retval;
 
 out_frag:
-	if (err != -EINPROGRESS)
+	if (err != -EINPROGRESS) {
 		tcf_action_inc_drop_qstats(&c->common);
+		return TC_ACT_SHOT;
+	}
 	return TC_ACT_CONSUMED;
 
 drop:
-- 
2.39.3 (Apple Git-145)


^ permalink raw reply related

* Re: [RFC PATCH 2/2] kernel/module: Decouple klp and ftrace from load_module
From: Petr Mladek @ 2026-04-16 13:09 UTC (permalink / raw)
  To: Song Chen
  Cc: Petr Pavlu, rafael, lenb, mturquette, sboyd, viresh.kumar, agk,
	snitzer, mpatocka, bmarzins, song, yukuai, linan122, jason.wessel,
	danielt, dianders, horms, davem, edumazet, kuba, pabeni, paulmck,
	frederic, mcgrof, da.gomez, samitolvanen, atomlin, jpoimboe,
	jikos, mbenes, joe.lawrence, rostedt, mhiramat, mark.rutland,
	mathieu.desnoyers, linux-modules, linux-kernel,
	linux-trace-kernel, linux-acpi, linux-clk, linux-pm,
	live-patching, dm-devel, linux-raid, kgdb-bugreport, netdev
In-Reply-To: <a35f5f94-7d5a-4347-974b-b270c89ef241@189.cn>

On Wed 2026-04-15 14:43:53, Song Chen wrote:
> Hi,
> 
> On 4/14/26 22:33, Petr Pavlu wrote:
> > On 4/13/26 10:07 AM, chensong_2000@189.cn wrote:
> > > From: Song Chen <chensong_2000@189.cn>
> > > 
> > > ftrace and livepatch currently have their module load/unload callbacks
> > > hard-coded in the module loader as direct function calls to
> > > ftrace_module_enable(), klp_module_coming(), klp_module_going()
> > > and ftrace_release_mod(). This tight coupling was originally introduced
> > > to enforce strict call ordering that could not be guaranteed by the
> > > module notifier chain, which only supported forward traversal. Their
> > > notifiers were moved in and out back and forth. see [1] and [2].
> > 
> > I'm unclear about what is meant by the notifiers being moved back and
> > forth. The links point to patches that converted ftrace+klp from using
> > module notifiers to explicit callbacks due to ordering issues, but this
> > switch occurred only once. Have there been other attempts to use
> > notifiers again?
> > 
> > > diff --git a/include/linux/module.h b/include/linux/module.h
> > > index 14f391b186c6..0bdd56f9defd 100644
> > > --- a/include/linux/module.h
> > > +++ b/include/linux/module.h
> > > @@ -308,6 +308,14 @@ enum module_state {
> > >   	MODULE_STATE_COMING,	/* Full formed, running module_init. */
> > >   	MODULE_STATE_GOING,	/* Going away. */
> > >   	MODULE_STATE_UNFORMED,	/* Still setting it up. */
> > > +	MODULE_STATE_FORMED,
> > 
> > I don't see a reason to add a new module state. Why is it necessary and
> > how does it fit with the existing states?
> > 
> because once notifier fails in state MODULE_STATE_UNFORMED (now only ftrace
> has someting to do in this state), notifier chain will roll back by calling
> blocking_notifier_call_chain_robust, i'm afraid MODULE_STATE_GOING is going
> to jeopardise the notifers which don't handle it appropriately, like:
> 
> case MODULE_STATE_COMING:
>      kmalloc();
> case MODULE_STATE_GOING:
>      kfree();
> 
> 
> > > +};
> > > +
> > > +enum module_notifier_prio {
> > > +	MODULE_NOTIFIER_PRIO_LOW = INT_MIN,	/* Low prioroty, coming last, going first */
> > > +	MODULE_NOTIFIER_PRIO_MID = 0,	/* Normal priority. */
> > > +	MODULE_NOTIFIER_PRIO_SECOND_HIGH = INT_MAX - 1,	/* Second high priorigy, coming second*/
> > > +	MODULE_NOTIFIER_PRIO_HIGH = INT_MAX,	/* High priorigy, coming first, going late. */
> > 
> > I suggest being explicit about how the notifiers are ordered. For
> > example:
> > 
> > enum module_notifier_prio {
> > 	MODULE_NOTIFIER_PRIO_NORMAL,	/* Normal priority, coming last, going first. */
> > 	MODULE_NOTIFIER_PRIO_LIVEPATCH,
> > 	MODULE_NOTIFIER_PRIO_FTRACE,	/* High priority, coming first, going late. */
> > };
> > 

I like the explicit PRIO_LIVEPATCH/FTRACE names.

But I would keep the INT_MAX - 1 and INT_MAX priorities. I believe
that ftrace/livepatching will always be the first/last to call.
And INT_MAX would help to preserve kABI when PRIO_NORMAL is not
enough for the rest of notifiers.

That said, I am not sure whether this is worth the effort.
This patch tries to move the explicit callbacks into a generic
notifier API. But it will still need to use some explicitly
defined (reserved) priorities. And it will
not prevent misuse. Also, it requires a doubly linked
list, which complicates the notifier code.


> > >   };
> > >   struct mod_tree_node {
> > > --- a/kernel/module/main.c
> > > +++ b/kernel/module/main.c
> > > @@ -3281,20 +3277,14 @@ static int complete_formation(struct module *mod, struct load_info *info)
> > >   	return err;
> > >   }
> > > -static int prepare_coming_module(struct module *mod)
> > > +static int prepare_module_state_transaction(struct module *mod,
> > > +			unsigned long val_up, unsigned long val_down)
> > >   {
> > >   	int err;
> > > -	ftrace_module_enable(mod);
> > > -	err = klp_module_coming(mod);
> > > -	if (err)
> > > -		return err;
> > > -
> > >   	err = blocking_notifier_call_chain_robust(&module_notify_list,
> > > -			MODULE_STATE_COMING, MODULE_STATE_GOING, mod);
> > > +			val_up, val_down, mod);
> > >   	err = notifier_to_errno(err);
> > > -	if (err)
> > > -		klp_module_going(mod);
> > >   	return err;
> > >   }

I personally find the name "prepare_module_state_transaction"
misleading. What is the "transaction" here? If this was a "preparation"
step then where is the transaction done/finished?

It might be better to just opencode the
blocking_notifier_call_chain_robust() instead.

> > > @@ -3468,14 +3458,21 @@ static int load_module(struct load_info *info, const char __user *uargs,
> > >   	init_build_id(mod, info);
> > >   	/* Ftrace init must be called in the MODULE_STATE_UNFORMED state */
> > > -	ftrace_module_init(mod);
> > > +	err = prepare_module_state_transaction(mod,
> > > +				MODULE_STATE_UNFORMED, MODULE_STATE_FORMED);
> > 
> > I believe val_down should be MODULE_STATE_GOING to reverse the
> > operation. Why is the new state MODULE_STATE_FORMED needed here?
> to avoid this:
> 
> case MODULE_STATE_COMING:
>      kmalloc();
> case MODULE_STATE_GOING:
>      kfree();

Hmm, the module is in "FORMED" state here.

> > > +	if (err)
> > > +		goto ddebug_cleanup;
> > >   	/* Finally it's fully formed, ready to start executing. */
> > >   	err = complete_formation(mod, info);

And then we call the "complete_formation()" function. This sounds like
it was not really "FORMED" before. => It is confusing and a no-no.

Please, try to avoid the new state if possible. My experience
with reading the module loader code is that any new state
brings a lot of complexity. You need to take it into account
when checking correctness of other changes, features, ...

Something tells me that if the state was not needed before
then we could avoid it.

> > > -	if (err)
> > > +	if (err) {
> > > +		blocking_notifier_call_chain_reverse(&module_notify_list,
> > > +				MODULE_STATE_FORMED, mod);
> > >   		goto ddebug_cleanup;
> > > +	}
> > > -	err = prepare_coming_module(mod);
> > > +	err = prepare_module_state_transaction(mod,
> > > +				MODULE_STATE_COMING, MODULE_STATE_GOING);
> > >   	if (err)
> > >   		goto bug_cleanup;
> > > --- a/kernel/trace/ftrace.c
> > > +++ b/kernel/trace/ftrace.c
> > > @@ -5241,6 +5241,44 @@ static int __init ftrace_mod_cmd_init(void)
> > >   }
> > >   core_initcall(ftrace_mod_cmd_init);
> > > +static int ftrace_module_callback(struct notifier_block *nb, unsigned long op,
> > > +			void *module)
> > > +{
> > > +	struct module *mod = module;
> > > +
> > > +	switch (op) {
> > > +	case MODULE_STATE_UNFORMED:
> > > +		ftrace_module_init(mod);
> > > +		break;
> > > +	case MODULE_STATE_COMING:
> > > +		ftrace_module_enable(mod);
> > > +		break;
> > > +	case MODULE_STATE_LIVE:
> > > +		ftrace_free_mem(mod, mod->mem[MOD_INIT_TEXT].base,
> > > +				mod->mem[MOD_INIT_TEXT].base + mod->mem[MOD_INIT_TEXT].size);
> > > +		break;
> > > +	case MODULE_STATE_GOING:
> > > +	case MODULE_STATE_FORMED:
> > > +		ftrace_release_mod(mod);

This calls "release" in a "FORMED" state. It does not make any
sense. Something looks fishy, either the code or the naming.

> > > +		break;
> > > +	default:
> > > +		break;
> > > +	}
> > 

I am sorry for being so picky about names. I believe that good names
help to prevent bugs and reduce headaches.

Best Regards,
Petr

^ permalink raw reply

* Re: [PATCH net 00/14] Netfilter/IPVS fixes for net
From: Florian Westphal @ 2026-04-16 13:14 UTC (permalink / raw)
  To: Fernando Fernandez Mancera
  Cc: Pablo Neira Ayuso, netfilter-devel, davem, netdev, kuba, pabeni,
	edumazet, horms
In-Reply-To: <36ccd420-25f2-43e9-89bf-088fcad40f81@suse.de>

Fernando Fernandez Mancera <fmancera@suse.de> wrote:
> I would like to propose to add netfilter-devel mailing list to 
> sashiko.dev and also to Netdev CI.. I think Jakub mentioned it was 
> possible on a previous situation.

I already run all my pull requests through most of NIPA's tests, with
additional netfilter-specific tests.

> I think it isn't sustainable to review and address the AI/LLM comments 
> when sending the pull request to for net/net-next.

The current bug report influx is already unsustainable for us.

> If you agree I could help moving this forward.

If you know who to contact to make sashiko also digest netfilter-devel
that would be good to have.

^ permalink raw reply

* [PATCH net,v2 00/11] Netfilter/IPVS fixes for net
From: Pablo Neira Ayuso @ 2026-04-16 13:14 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms

v2: Keep back patches that have lengthy feedback by AI, they might
    need more work.

-o-

Hi,

The following patchset contains Netfilter/IPVS fixes for net: Mostly
addressing very old bugs in the SIP conntrack helper string parser,
unsafe arp_tables match support with legacy IEEE1394, restrict xt_realm
to IPv4 and incorrect use of RCU lists in nat core and nftables. This
batch also includes one IPVS MTU fix.

1) Fix arp_tables match with IEEE1394 ARP payload, allowing to
   reach bytes off the skb boundary, note that matching on the
   target address is deliberately ignored, patch from Weiming Shi.

2) Reject unsafe nfnetlink_osf configurations from control plane,
   this is addressing a possible division by zero, from Xiang Mei.

3) nft_osf actually only supports IPv4, restrict it.

4) Possible null-ptr-deref in nfnetlink_osf, check if __in_dev_get_rcu()
   returns NULL, from Kito Xu.

5) Remove unsafe use of sprintf to fix possible buffer overflow
   in the SIP NAT helper, from Florian Westphal.

6) Restrict xt_mac, xt_owner and xt_physdev to inet families only;
   xt_realm is only for ipv4, otherwise null-pointer-deref is possible.

7) Use kfree_rcu() in nat core to release hooks, this can be an issue
   once nfnetlink_hook gets support to dump NAT hook information,
   not currently a real issue but better fix it now.

8) Fix MTU checks in IPVS, from Yingnan Zhang.

9) Use list_del_rcu() in chain and flowtable hook unregistration,
   concurrent RCU reader could be walking over the hook list,
   from Florian Westphal

10) Add list_splice_rcu(), this is required to fix unsafe
    splice to RCU protected hook list. Reviewed by Paul McKenney.

11) Use list_splice_rcu() to splice new chain and flowtable hooks.

Please, pull these changes from:

  git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf.git nf-26-04-16

Thanks.

----------------------------------------------------------------

The following changes since commit 2dddb34dd0d07b01fa770eca89480a4da4f13153:

  net: ethernet: mtk_eth_soc: initialize PPE per-tag-layer MTU registers (2026-04-12 15:22:58 -0700)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf.git tags/nf-26-04-16

for you to fetch changes up to 985f517db19a734d4267e003438b5d6995669aff:

  netfilter: nf_tables: join hook list via splice_list_rcu() in commit phase (2026-04-16 14:53:52 +0200)

----------------------------------------------------------------
netfilter pull request 26-04-16

----------------------------------------------------------------
Florian Westphal (2):
      netfilter: conntrack: remove sprintf usage
      netfilter: nf_tables: use list_del_rcu for netlink hooks

Kito Xu (veritas501) (1):
      netfilter: nfnetlink_osf: fix null-ptr-deref in nf_osf_ttl

Pablo Neira Ayuso (5):
      netfilter: nft_osf: restrict it to ipv4
      netfilter: xtables: restrict several matches to inet family
      netfilter: nat: use kfree_rcu to release ops
      rculist: add list_splice_rcu() for private lists
      netfilter: nf_tables: join hook list via splice_list_rcu() in commit phase

Weiming Shi (1):
      netfilter: arp_tables: fix IEEE1394 ARP payload parsing in arp_packet_match()

Xiang Mei (1):
      netfilter: nfnetlink_osf: fix divide-by-zero in OSF_WSS_MODULO

Yingnan Zhang (1):
      ipvs: fix MTU check for GSO packets in tunnel mode

 include/linux/rculist.h           | 29 ++++++++++++++++++++++
 net/ipv4/netfilter/arp_tables.c   | 14 ++++++++---
 net/ipv4/netfilter/iptable_nat.c  |  2 +-
 net/ipv6/netfilter/ip6table_nat.c |  2 +-
 net/netfilter/ipvs/ip_vs_xmit.c   | 19 +++++++++++---
 net/netfilter/nf_nat_amanda.c     |  2 +-
 net/netfilter/nf_nat_core.c       | 10 +++++---
 net/netfilter/nf_nat_sip.c        | 33 ++++++++++++++-----------
 net/netfilter/nf_tables_api.c     | 52 +++++++++++++++++----------------------
 net/netfilter/nfnetlink_osf.c     |  7 ++++++
 net/netfilter/nft_osf.c           |  6 ++++-
 net/netfilter/xt_mac.c            | 34 ++++++++++++++++---------
 net/netfilter/xt_owner.c          | 37 +++++++++++++++++++---------
 net/netfilter/xt_physdev.c        | 29 ++++++++++++++--------
 net/netfilter/xt_realm.c          |  2 +-
 15 files changed, 184 insertions(+), 94 deletions(-)

^ permalink raw reply

* [PATCH net 01/11] netfilter: arp_tables: fix IEEE1394 ARP payload parsing in arp_packet_match()
From: Pablo Neira Ayuso @ 2026-04-16 13:14 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
In-Reply-To: <20260416131453.308611-1-pablo@netfilter.org>

From: Weiming Shi <bestswngs@gmail.com>

arp_packet_match() unconditionally parses the ARP payload assuming two
hardware addresses are present (source and target). However,
IPv4-over-IEEE1394 ARP (RFC 2734) omits the target hardware address
field, and arp_hdr_len() already accounts for this by returning a
shorter length for ARPHRD_IEEE1394 devices.

As a result, on IEEE1394 interfaces arp_packet_match() advances past a
nonexistent target hardware address and reads the wrong bytes for both
the target device address comparison and the target IP address. This
causes arptables rules to match against garbage data, leading to
incorrect filtering decisions: packets that should be accepted may be
dropped and vice versa.

The ARP stack in net/ipv4/arp.c (arp_create and arp_process) already
handles this correctly by skipping the target hardware address for
ARPHRD_IEEE1394. Apply the same pattern to arp_packet_match().

[ Pablo has mangled this patch to include Simon Horman's suggestions ]

Fixes: 6752c8db8e0c ("firewire net, ipv4 arp: Extend hardware address and remove driver-level packet inspection.")
Reported-by: Xiang Mei <xmei5@asu.edu>
Signed-off-by: Weiming Shi <bestswngs@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/arp_tables.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 1cdd9c28ab2d..a7a56890b5b5 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -110,13 +110,21 @@ static inline int arp_packet_match(const struct arphdr *arphdr,
 	arpptr += dev->addr_len;
 	memcpy(&src_ipaddr, arpptr, sizeof(u32));
 	arpptr += sizeof(u32);
-	tgt_devaddr = arpptr;
-	arpptr += dev->addr_len;
+
+	if (IS_ENABLED(CONFIG_FIREWIRE_NET) && dev->type == ARPHRD_IEEE1394) {
+		tgt_devaddr = NULL;
+	} else {
+		tgt_devaddr = arpptr;
+		arpptr += dev->addr_len;
+	}
 	memcpy(&tgt_ipaddr, arpptr, sizeof(u32));
 
 	if (NF_INVF(arpinfo, ARPT_INV_SRCDEVADDR,
 		    arp_devaddr_compare(&arpinfo->src_devaddr, src_devaddr,
-					dev->addr_len)) ||
+					dev->addr_len)))
+		return 0;
+
+	if (tgt_devaddr &&
 	    NF_INVF(arpinfo, ARPT_INV_TGTDEVADDR,
 		    arp_devaddr_compare(&arpinfo->tgt_devaddr, tgt_devaddr,
 					dev->addr_len)))
-- 
2.47.3


^ permalink raw reply related

* [PATCH net 02/11] netfilter: nfnetlink_osf: fix divide-by-zero in OSF_WSS_MODULO
From: Pablo Neira Ayuso @ 2026-04-16 13:14 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
In-Reply-To: <20260416131453.308611-1-pablo@netfilter.org>

From: Xiang Mei <xmei5@asu.edu>

nf_osf_match_one() computes ctx->window % f->wss.val in the
OSF_WSS_MODULO branch with no guard for f->wss.val == 0. A
CAP_NET_ADMIN user can add such a fingerprint via nfnetlink; a
subsequent matching TCP SYN divides by zero and panics the kernel.

Reject the bogus fingerprint in nfnl_osf_add_callback() above the
per-option for-loop. f->wss is per-fingerprint, not per-option, so
the check must run regardless of f->opt_num (including 0). Also
reject wss.wc >= OSF_WSS_MAX; nf_osf_match_one() already treats that
as "should not happen".

Crash:
 Oops: divide error: 0000 [#1] SMP KASAN NOPTI
 RIP: 0010:nf_osf_match_one (net/netfilter/nfnetlink_osf.c:98)
 Call Trace:
 <IRQ>
  nf_osf_match (net/netfilter/nfnetlink_osf.c:220)
  xt_osf_match_packet (net/netfilter/xt_osf.c:32)
  ipt_do_table (net/ipv4/netfilter/ip_tables.c:348)
  nf_hook_slow (net/netfilter/core.c:622)
  ip_local_deliver (net/ipv4/ip_input.c:265)
  ip_rcv (include/linux/skbuff.h:1162)
  __netif_receive_skb_one_core (net/core/dev.c:6181)
  process_backlog (net/core/dev.c:6642)
  __napi_poll (net/core/dev.c:7710)
  net_rx_action (net/core/dev.c:7945)
  handle_softirqs (kernel/softirq.c:622)

Fixes: 11eeef41d5f6 ("netfilter: passive OS fingerprint xtables match")
Reported-by: Weiming Shi <bestswngs@gmail.com>
Suggested-by: Florian Westphal <fw@strlen.de>
Suggested-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Xiang Mei <xmei5@asu.edu>
Reviewed-by: Fernando Fernandez Mancera <fmancera@suse.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nfnetlink_osf.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c
index 45d9ad231a92..70172ca07858 100644
--- a/net/netfilter/nfnetlink_osf.c
+++ b/net/netfilter/nfnetlink_osf.c
@@ -320,6 +320,10 @@ static int nfnl_osf_add_callback(struct sk_buff *skb,
 	if (f->opt_num > ARRAY_SIZE(f->opt))
 		return -EINVAL;
 
+	if (f->wss.wc >= OSF_WSS_MAX ||
+	    (f->wss.wc == OSF_WSS_MODULO && f->wss.val == 0))
+		return -EINVAL;
+
 	for (i = 0; i < f->opt_num; i++) {
 		if (!f->opt[i].length || f->opt[i].length > MAX_IPOPTLEN)
 			return -EINVAL;
-- 
2.47.3


^ permalink raw reply related

* [PATCH net 03/11] netfilter: nft_osf: restrict it to ipv4
From: Pablo Neira Ayuso @ 2026-04-16 13:14 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
In-Reply-To: <20260416131453.308611-1-pablo@netfilter.org>

This expression only supports ipv4, restrict it.

Fixes: b96af92d6eaf ("netfilter: nf_tables: implement Passive OS fingerprint module in nft_osf")
Acked-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Fernando Fernandez Mancera <fmancera@suse.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_osf.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c
index 1c0b493ef0a9..bdc2f6c90e2f 100644
--- a/net/netfilter/nft_osf.c
+++ b/net/netfilter/nft_osf.c
@@ -28,6 +28,11 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs,
 	struct nf_osf_data data;
 	struct tcphdr _tcph;
 
+	if (nft_pf(pkt) != NFPROTO_IPV4) {
+		regs->verdict.code = NFT_BREAK;
+		return;
+	}
+
 	if (pkt->tprot != IPPROTO_TCP) {
 		regs->verdict.code = NFT_BREAK;
 		return;
@@ -114,7 +119,6 @@ static int nft_osf_validate(const struct nft_ctx *ctx,
 
 	switch (ctx->family) {
 	case NFPROTO_IPV4:
-	case NFPROTO_IPV6:
 	case NFPROTO_INET:
 		hooks = (1 << NF_INET_LOCAL_IN) |
 			(1 << NF_INET_PRE_ROUTING) |
-- 
2.47.3


^ permalink raw reply related

* [PATCH net 04/11] netfilter: nfnetlink_osf: fix null-ptr-deref in nf_osf_ttl
From: Pablo Neira Ayuso @ 2026-04-16 13:14 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
In-Reply-To: <20260416131453.308611-1-pablo@netfilter.org>

From: "Kito Xu (veritas501)" <hxzene@gmail.com>

nf_osf_ttl() calls __in_dev_get_rcu(skb->dev) and passes the result
to in_dev_for_each_ifa_rcu() without checking for NULL. When the
receiving device has no IPv4 configuration (ip_ptr is NULL),
__in_dev_get_rcu() returns NULL and in_dev_for_each_ifa_rcu()
dereferences it unconditionally, causing a kernel crash.

This can happen when a packet arrives on a device that has had its
IPv4 configuration removed (e.g., MTU set below IPV4_MIN_MTU causing
inetdev_destroy) or on a device that was never assigned an IPv4
address, while an xt_osf or nft_osf rule with TTL_LESS mode is
active and the packet TTL exceeds the fingerprint TTL.

Add a NULL check for in_dev before using it. When in_dev is NULL,
return 0 (no match) since source-address locality cannot be
determined without IPv4 addresses on the device.

KASAN: null-ptr-deref in range [0x0000000000000010-0x0000000000000017]
RIP: 0010:nf_osf_match_one+0x204/0xa70
Call Trace:
 <IRQ>
 nf_osf_match+0x2f8/0x780
 xt_osf_match_packet+0x11c/0x1f0
 ipt_do_table+0x7fe/0x12b0
 nf_hook_slow+0xac/0x1e0
 ip_rcv+0x123/0x370
 __netif_receive_skb_one_core+0x166/0x1b0
 process_backlog+0x197/0x590
 __napi_poll+0xa1/0x540
 net_rx_action+0x401/0xd80
 handle_softirqs+0x19f/0x610
 </IRQ>

Fixes: a218dc82f0b5 ("netfilter: nft_osf: Add ttl option support")
Suggested-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Kito Xu (veritas501) <hxzene@gmail.com>
Reviewed-by: Fernando Fernandez Mancera <fmancera@suse.de>
Reviewed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nfnetlink_osf.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c
index 70172ca07858..4bbe64288b90 100644
--- a/net/netfilter/nfnetlink_osf.c
+++ b/net/netfilter/nfnetlink_osf.c
@@ -36,6 +36,9 @@ static inline int nf_osf_ttl(const struct sk_buff *skb,
 	const struct in_ifaddr *ifa;
 	int ret = 0;
 
+	if (!in_dev)
+		return 0;
+
 	if (ttl_check == NF_OSF_TTL_TRUE)
 		return ip->ttl == f_ttl;
 	if (ttl_check == NF_OSF_TTL_NOCHECK)
-- 
2.47.3


^ permalink raw reply related

* [PATCH net 05/11] netfilter: conntrack: remove sprintf usage
From: Pablo Neira Ayuso @ 2026-04-16 13:14 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
In-Reply-To: <20260416131453.308611-1-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

Replace it with scnprintf, the buffer sizes are expected to be large enough
to hold the result, no need for snprintf+overflow check.

Increase buffer size in mangle_content_len() while at it.

BUG: KASAN: stack-out-of-bounds in vsnprintf+0xea5/0x1270
Write of size 1 at addr [..]
 vsnprintf+0xea5/0x1270
 sprintf+0xb1/0xe0
 mangle_content_len+0x1ac/0x280
 nf_nat_sdp_session+0x1cc/0x240
 process_sdp+0x8f8/0xb80
 process_invite_request+0x108/0x2b0
 process_sip_msg+0x5da/0xf50
 sip_help_tcp+0x45e/0x780
 nf_confirm+0x34d/0x990
 [..]

Fixes: 9fafcd7b2032 ("[NETFILTER]: nf_conntrack/nf_nat: add SIP helper port")
Reported-by: Yiming Qian <yimingqian591@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_nat_amanda.c |  2 +-
 net/netfilter/nf_nat_sip.c    | 33 ++++++++++++++++++---------------
 2 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/net/netfilter/nf_nat_amanda.c b/net/netfilter/nf_nat_amanda.c
index 98deef6cde69..8f1054920a85 100644
--- a/net/netfilter/nf_nat_amanda.c
+++ b/net/netfilter/nf_nat_amanda.c
@@ -50,7 +50,7 @@ static unsigned int help(struct sk_buff *skb,
 		return NF_DROP;
 	}
 
-	sprintf(buffer, "%u", port);
+	snprintf(buffer, sizeof(buffer), "%u", port);
 	if (!nf_nat_mangle_udp_packet(skb, exp->master, ctinfo,
 				      protoff, matchoff, matchlen,
 				      buffer, strlen(buffer))) {
diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c
index cf4aeb299bde..c845b6d1a2bd 100644
--- a/net/netfilter/nf_nat_sip.c
+++ b/net/netfilter/nf_nat_sip.c
@@ -68,25 +68,27 @@ static unsigned int mangle_packet(struct sk_buff *skb, unsigned int protoff,
 }
 
 static int sip_sprintf_addr(const struct nf_conn *ct, char *buffer,
+			    size_t size,
 			    const union nf_inet_addr *addr, bool delim)
 {
 	if (nf_ct_l3num(ct) == NFPROTO_IPV4)
-		return sprintf(buffer, "%pI4", &addr->ip);
+		return scnprintf(buffer, size, "%pI4", &addr->ip);
 	else {
 		if (delim)
-			return sprintf(buffer, "[%pI6c]", &addr->ip6);
+			return scnprintf(buffer, size, "[%pI6c]", &addr->ip6);
 		else
-			return sprintf(buffer, "%pI6c", &addr->ip6);
+			return scnprintf(buffer, size, "%pI6c", &addr->ip6);
 	}
 }
 
 static int sip_sprintf_addr_port(const struct nf_conn *ct, char *buffer,
+				 size_t size,
 				 const union nf_inet_addr *addr, u16 port)
 {
 	if (nf_ct_l3num(ct) == NFPROTO_IPV4)
-		return sprintf(buffer, "%pI4:%u", &addr->ip, port);
+		return scnprintf(buffer, size, "%pI4:%u", &addr->ip, port);
 	else
-		return sprintf(buffer, "[%pI6c]:%u", &addr->ip6, port);
+		return scnprintf(buffer, size, "[%pI6c]:%u", &addr->ip6, port);
 }
 
 static int map_addr(struct sk_buff *skb, unsigned int protoff,
@@ -119,7 +121,7 @@ static int map_addr(struct sk_buff *skb, unsigned int protoff,
 	if (nf_inet_addr_cmp(&newaddr, addr) && newport == port)
 		return 1;
 
-	buflen = sip_sprintf_addr_port(ct, buffer, &newaddr, ntohs(newport));
+	buflen = sip_sprintf_addr_port(ct, buffer, sizeof(buffer), &newaddr, ntohs(newport));
 	return mangle_packet(skb, protoff, dataoff, dptr, datalen,
 			     matchoff, matchlen, buffer, buflen);
 }
@@ -212,7 +214,7 @@ static unsigned int nf_nat_sip(struct sk_buff *skb, unsigned int protoff,
 					       &addr, true) > 0 &&
 		    nf_inet_addr_cmp(&addr, &ct->tuplehash[dir].tuple.src.u3) &&
 		    !nf_inet_addr_cmp(&addr, &ct->tuplehash[!dir].tuple.dst.u3)) {
-			buflen = sip_sprintf_addr(ct, buffer,
+			buflen = sip_sprintf_addr(ct, buffer, sizeof(buffer),
 					&ct->tuplehash[!dir].tuple.dst.u3,
 					true);
 			if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
@@ -229,7 +231,7 @@ static unsigned int nf_nat_sip(struct sk_buff *skb, unsigned int protoff,
 					       &addr, false) > 0 &&
 		    nf_inet_addr_cmp(&addr, &ct->tuplehash[dir].tuple.dst.u3) &&
 		    !nf_inet_addr_cmp(&addr, &ct->tuplehash[!dir].tuple.src.u3)) {
-			buflen = sip_sprintf_addr(ct, buffer,
+			buflen = sip_sprintf_addr(ct, buffer, sizeof(buffer),
 					&ct->tuplehash[!dir].tuple.src.u3,
 					false);
 			if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
@@ -247,7 +249,7 @@ static unsigned int nf_nat_sip(struct sk_buff *skb, unsigned int protoff,
 		    htons(n) == ct->tuplehash[dir].tuple.dst.u.udp.port &&
 		    htons(n) != ct->tuplehash[!dir].tuple.src.u.udp.port) {
 			__be16 p = ct->tuplehash[!dir].tuple.src.u.udp.port;
-			buflen = sprintf(buffer, "%u", ntohs(p));
+			buflen = scnprintf(buffer, sizeof(buffer), "%u", ntohs(p));
 			if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
 					   poff, plen, buffer, buflen)) {
 				nf_ct_helper_log(skb, ct, "cannot mangle rport");
@@ -418,7 +420,8 @@ static unsigned int nf_nat_sip_expect(struct sk_buff *skb, unsigned int protoff,
 
 	if (!nf_inet_addr_cmp(&exp->tuple.dst.u3, &exp->saved_addr) ||
 	    exp->tuple.dst.u.udp.port != exp->saved_proto.udp.port) {
-		buflen = sip_sprintf_addr_port(ct, buffer, &newaddr, port);
+		buflen = sip_sprintf_addr_port(ct, buffer, sizeof(buffer),
+					       &newaddr, port);
 		if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
 				   matchoff, matchlen, buffer, buflen)) {
 			nf_ct_helper_log(skb, ct, "cannot mangle packet");
@@ -438,8 +441,8 @@ static int mangle_content_len(struct sk_buff *skb, unsigned int protoff,
 {
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	char buffer[sizeof("4294967295")];
 	unsigned int matchoff, matchlen;
-	char buffer[sizeof("65536")];
 	int buflen, c_len;
 
 	/* Get actual SDP length */
@@ -454,7 +457,7 @@ static int mangle_content_len(struct sk_buff *skb, unsigned int protoff,
 			      &matchoff, &matchlen) <= 0)
 		return 0;
 
-	buflen = sprintf(buffer, "%u", c_len);
+	buflen = scnprintf(buffer, sizeof(buffer), "%u", c_len);
 	return mangle_packet(skb, protoff, dataoff, dptr, datalen,
 			     matchoff, matchlen, buffer, buflen);
 }
@@ -491,7 +494,7 @@ static unsigned int nf_nat_sdp_addr(struct sk_buff *skb, unsigned int protoff,
 	char buffer[INET6_ADDRSTRLEN];
 	unsigned int buflen;
 
-	buflen = sip_sprintf_addr(ct, buffer, addr, false);
+	buflen = sip_sprintf_addr(ct, buffer, sizeof(buffer), addr, false);
 	if (mangle_sdp_packet(skb, protoff, dataoff, dptr, datalen,
 			      sdpoff, type, term, buffer, buflen))
 		return 0;
@@ -509,7 +512,7 @@ static unsigned int nf_nat_sdp_port(struct sk_buff *skb, unsigned int protoff,
 	char buffer[sizeof("nnnnn")];
 	unsigned int buflen;
 
-	buflen = sprintf(buffer, "%u", port);
+	buflen = scnprintf(buffer, sizeof(buffer), "%u", port);
 	if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
 			   matchoff, matchlen, buffer, buflen))
 		return 0;
@@ -529,7 +532,7 @@ static unsigned int nf_nat_sdp_session(struct sk_buff *skb, unsigned int protoff
 	unsigned int buflen;
 
 	/* Mangle session description owner and contact addresses */
-	buflen = sip_sprintf_addr(ct, buffer, addr, false);
+	buflen = sip_sprintf_addr(ct, buffer, sizeof(buffer), addr, false);
 	if (mangle_sdp_packet(skb, protoff, dataoff, dptr, datalen, sdpoff,
 			      SDP_HDR_OWNER, SDP_HDR_MEDIA, buffer, buflen))
 		return 0;
-- 
2.47.3


^ permalink raw reply related

* [PATCH net 06/11] netfilter: xtables: restrict several matches to inet family
From: Pablo Neira Ayuso @ 2026-04-16 13:14 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
In-Reply-To: <20260416131453.308611-1-pablo@netfilter.org>

This is a partial revert of:

  commit ab4f21e6fb1c ("netfilter: xtables: use NFPROTO_UNSPEC in more extensions")

to restrict the following extensions to ipv4 and ipv6 only:

- xt_mac
- xt_owner
- xt_physdev

These extensions are not used by ebtables in userspace.

Moreover, xt_realm is only for ipv4, since dst->tclassid is
ipv4-specific.

Fixes: ab4f21e6fb1c ("netfilter: xtables: use NFPROTO_UNSPEC in more extensions")
Reported-by: "Kito Xu (veritas501)" <hxzene@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_mac.c     | 34 +++++++++++++++++++++++-----------
 net/netfilter/xt_owner.c   | 37 +++++++++++++++++++++++++------------
 net/netfilter/xt_physdev.c | 29 +++++++++++++++++++----------
 net/netfilter/xt_realm.c   |  2 +-
 4 files changed, 68 insertions(+), 34 deletions(-)

diff --git a/net/netfilter/xt_mac.c b/net/netfilter/xt_mac.c
index 81649da57ba5..bd2354760895 100644
--- a/net/netfilter/xt_mac.c
+++ b/net/netfilter/xt_mac.c
@@ -38,25 +38,37 @@ static bool mac_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	return ret;
 }
 
-static struct xt_match mac_mt_reg __read_mostly = {
-	.name      = "mac",
-	.revision  = 0,
-	.family    = NFPROTO_UNSPEC,
-	.match     = mac_mt,
-	.matchsize = sizeof(struct xt_mac_info),
-	.hooks     = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) |
-	             (1 << NF_INET_FORWARD),
-	.me        = THIS_MODULE,
+static struct xt_match mac_mt_reg[] __read_mostly = {
+	{
+		.name		= "mac",
+		.family		= NFPROTO_IPV4,
+		.match		= mac_mt,
+		.matchsize	= sizeof(struct xt_mac_info),
+		.hooks		= (1 << NF_INET_PRE_ROUTING) |
+				  (1 << NF_INET_LOCAL_IN) |
+				  (1 << NF_INET_FORWARD),
+		.me		= THIS_MODULE,
+	},
+	{
+		.name		= "mac",
+		.family		= NFPROTO_IPV6,
+		.match		= mac_mt,
+		.matchsize	= sizeof(struct xt_mac_info),
+		.hooks		= (1 << NF_INET_PRE_ROUTING) |
+				  (1 << NF_INET_LOCAL_IN) |
+				  (1 << NF_INET_FORWARD),
+		.me		= THIS_MODULE,
+	},
 };
 
 static int __init mac_mt_init(void)
 {
-	return xt_register_match(&mac_mt_reg);
+	return xt_register_matches(mac_mt_reg, ARRAY_SIZE(mac_mt_reg));
 }
 
 static void __exit mac_mt_exit(void)
 {
-	xt_unregister_match(&mac_mt_reg);
+	xt_unregister_matches(mac_mt_reg, ARRAY_SIZE(mac_mt_reg));
 }
 
 module_init(mac_mt_init);
diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c
index 50332888c8d2..7be2fe22b067 100644
--- a/net/netfilter/xt_owner.c
+++ b/net/netfilter/xt_owner.c
@@ -127,26 +127,39 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	return true;
 }
 
-static struct xt_match owner_mt_reg __read_mostly = {
-	.name       = "owner",
-	.revision   = 1,
-	.family     = NFPROTO_UNSPEC,
-	.checkentry = owner_check,
-	.match      = owner_mt,
-	.matchsize  = sizeof(struct xt_owner_match_info),
-	.hooks      = (1 << NF_INET_LOCAL_OUT) |
-	              (1 << NF_INET_POST_ROUTING),
-	.me         = THIS_MODULE,
+static struct xt_match owner_mt_reg[] __read_mostly = {
+	{
+		.name       = "owner",
+		.revision   = 1,
+		.family     = NFPROTO_IPV4,
+		.checkentry = owner_check,
+		.match      = owner_mt,
+		.matchsize  = sizeof(struct xt_owner_match_info),
+		.hooks      = (1 << NF_INET_LOCAL_OUT) |
+			      (1 << NF_INET_POST_ROUTING),
+		.me         = THIS_MODULE,
+	},
+	{
+		.name       = "owner",
+		.revision   = 1,
+		.family     = NFPROTO_IPV6,
+		.checkentry = owner_check,
+		.match      = owner_mt,
+		.matchsize  = sizeof(struct xt_owner_match_info),
+		.hooks      = (1 << NF_INET_LOCAL_OUT) |
+			      (1 << NF_INET_POST_ROUTING),
+		.me         = THIS_MODULE,
+	}
 };
 
 static int __init owner_mt_init(void)
 {
-	return xt_register_match(&owner_mt_reg);
+	return xt_register_matches(owner_mt_reg, ARRAY_SIZE(owner_mt_reg));
 }
 
 static void __exit owner_mt_exit(void)
 {
-	xt_unregister_match(&owner_mt_reg);
+	xt_unregister_matches(owner_mt_reg, ARRAY_SIZE(owner_mt_reg));
 }
 
 module_init(owner_mt_init);
diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c
index 343e65f377d4..130842c35c6f 100644
--- a/net/netfilter/xt_physdev.c
+++ b/net/netfilter/xt_physdev.c
@@ -115,24 +115,33 @@ static int physdev_mt_check(const struct xt_mtchk_param *par)
 	return 0;
 }
 
-static struct xt_match physdev_mt_reg __read_mostly = {
-	.name       = "physdev",
-	.revision   = 0,
-	.family     = NFPROTO_UNSPEC,
-	.checkentry = physdev_mt_check,
-	.match      = physdev_mt,
-	.matchsize  = sizeof(struct xt_physdev_info),
-	.me         = THIS_MODULE,
+static struct xt_match physdev_mt_reg[] __read_mostly = {
+	{
+		.name		= "physdev",
+		.family		= NFPROTO_IPV4,
+		.checkentry	= physdev_mt_check,
+		.match		= physdev_mt,
+		.matchsize	= sizeof(struct xt_physdev_info),
+		.me		= THIS_MODULE,
+	},
+	{
+		.name		= "physdev",
+		.family		= NFPROTO_IPV6,
+		.checkentry	= physdev_mt_check,
+		.match		= physdev_mt,
+		.matchsize	= sizeof(struct xt_physdev_info),
+		.me		= THIS_MODULE,
+	},
 };
 
 static int __init physdev_mt_init(void)
 {
-	return xt_register_match(&physdev_mt_reg);
+	return xt_register_matches(physdev_mt_reg, ARRAY_SIZE(physdev_mt_reg));
 }
 
 static void __exit physdev_mt_exit(void)
 {
-	xt_unregister_match(&physdev_mt_reg);
+	xt_unregister_matches(physdev_mt_reg, ARRAY_SIZE(physdev_mt_reg));
 }
 
 module_init(physdev_mt_init);
diff --git a/net/netfilter/xt_realm.c b/net/netfilter/xt_realm.c
index 6df485f4403d..61b2f1e58d15 100644
--- a/net/netfilter/xt_realm.c
+++ b/net/netfilter/xt_realm.c
@@ -33,7 +33,7 @@ static struct xt_match realm_mt_reg __read_mostly = {
 	.matchsize	= sizeof(struct xt_realm_info),
 	.hooks		= (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_FORWARD) |
 			  (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_LOCAL_IN),
-	.family		= NFPROTO_UNSPEC,
+	.family		= NFPROTO_IPV4,
 	.me		= THIS_MODULE
 };
 
-- 
2.47.3


^ permalink raw reply related

* [PATCH net 07/11] netfilter: nat: use kfree_rcu to release ops
From: Pablo Neira Ayuso @ 2026-04-16 13:14 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
In-Reply-To: <20260416131453.308611-1-pablo@netfilter.org>

Florian Westphal says:

"Historically this is not an issue, even for normal base hooks: the data
path doesn't use the original nf_hook_ops that are used to register the
callbacks.

However, in v5.14 I added the ability to dump the active netfilter
hooks from userspace.

This code will peek back into the nf_hook_ops that are available
at the tail of the pointer-array blob used by the datapath.

The nat hooks are special, because they are called indirectly from
the central nat dispatcher hook. They are currently invisible to
the nfnl hook dump subsystem though.

But once that changes the nat ops structures have to be deferred too."

Update nf_nat_register_fn() to deal with partial exposure of the hooks
in the error path, which can also be an issue for nfnetlink_hook.

Fixes: e2cf17d3774c ("netfilter: add new hook nfnl subsystem")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/iptable_nat.c  |  2 +-
 net/ipv6/netfilter/ip6table_nat.c |  2 +-
 net/netfilter/nf_nat_core.c       | 10 ++++++----
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index a5db7c67d61b..3b1de7f82bf8 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -100,7 +100,7 @@ static void ipt_nat_unregister_lookups(struct net *net)
 	for (i = 0; i < ARRAY_SIZE(nf_nat_ipv4_ops); i++)
 		nf_nat_ipv4_unregister_fn(net, &ops[i]);
 
-	kfree(ops);
+	kfree_rcu(ops, rcu);
 }
 
 static int iptable_nat_table_init(struct net *net)
diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c
index e119d4f090cc..9adfbfeaab0c 100644
--- a/net/ipv6/netfilter/ip6table_nat.c
+++ b/net/ipv6/netfilter/ip6table_nat.c
@@ -102,7 +102,7 @@ static void ip6t_nat_unregister_lookups(struct net *net)
 	for (i = 0; i < ARRAY_SIZE(nf_nat_ipv6_ops); i++)
 		nf_nat_ipv6_unregister_fn(net, &ops[i]);
 
-	kfree(ops);
+	kfree_rcu(ops, rcu);
 }
 
 static int ip6table_nat_table_init(struct net *net)
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 3b5434e4ec9c..b30ca94c2bb7 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -1228,9 +1228,11 @@ int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops,
 		ret = nf_register_net_hooks(net, nat_ops, ops_count);
 		if (ret < 0) {
 			mutex_unlock(&nf_nat_proto_mutex);
-			for (i = 0; i < ops_count; i++)
-				kfree(nat_ops[i].priv);
-			kfree(nat_ops);
+			for (i = 0; i < ops_count; i++) {
+				priv = nat_ops[i].priv;
+				kfree_rcu(priv, rcu_head);
+			}
+			kfree_rcu(nat_ops, rcu);
 			return ret;
 		}
 
@@ -1294,7 +1296,7 @@ void nf_nat_unregister_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops,
 		}
 
 		nat_proto_net->nat_hook_ops = NULL;
-		kfree(nat_ops);
+		kfree_rcu(nat_ops, rcu);
 	}
 unlock:
 	mutex_unlock(&nf_nat_proto_mutex);
-- 
2.47.3


^ permalink raw reply related

* [PATCH net 08/11] ipvs: fix MTU check for GSO packets in tunnel mode
From: Pablo Neira Ayuso @ 2026-04-16 13:14 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
In-Reply-To: <20260416131453.308611-1-pablo@netfilter.org>

From: Yingnan Zhang <342144303@qq.com>

Currently, IPVS skips MTU checks for GSO packets by excluding them with
the !skb_is_gso(skb) condition. This creates problems when IPVS tunnel
mode encapsulates GSO packets with IPIP headers.

The issue manifests in two ways:

1. MTU violation after encapsulation:
   When a GSO packet passes through IPVS tunnel mode, the original MTU
   check is bypassed. After adding the IPIP tunnel header, the packet
   size may exceed the outgoing interface MTU, leading to unexpected
   fragmentation at the IP layer.

2. Fragmentation with problematic IP IDs:
   When net.ipv4.vs.pmtu_disc=1 and a GSO packet with multiple segments
   is fragmented after encapsulation, each segment gets a sequentially
   incremented IP ID (0, 1, 2, ...). This happens because:

   a) The GSO packet bypasses MTU check and gets encapsulated
   b) At __ip_finish_output, the oversized GSO packet is split into
      separate SKBs (one per segment), with IP IDs incrementing
   c) Each SKB is then fragmented again based on the actual MTU

   This sequential IP ID allocation differs from the expected behavior
   and can cause issues with fragment reassembly and packet tracking.

Fix this by properly validating GSO packets using
skb_gso_validate_network_len(). This function correctly validates
whether the GSO segments will fit within the MTU after segmentation. If
validation fails, send an ICMP Fragmentation Needed message to enable
proper PMTU discovery.

Fixes: 4cdd34084d53 ("netfilter: nf_conntrack_ipv6: improve fragmentation handling")
Signed-off-by: Yingnan Zhang <342144303@qq.com>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipvs/ip_vs_xmit.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 3601eb86d025..7c570f48ade2 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -102,6 +102,18 @@ __ip_vs_dst_check(struct ip_vs_dest *dest)
 	return dest_dst;
 }
 
+/* Based on ip_exceeds_mtu(). */
+static bool ip_vs_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
+{
+	if (skb->len <= mtu)
+		return false;
+
+	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
+		return false;
+
+	return true;
+}
+
 static inline bool
 __mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu)
 {
@@ -111,10 +123,9 @@ __mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu)
 		 */
 		if (IP6CB(skb)->frag_max_size > mtu)
 			return true; /* largest fragment violate MTU */
-	}
-	else if (skb->len > mtu && !skb_is_gso(skb)) {
+	} else if (ip_vs_exceeds_mtu(skb, mtu))
 		return true; /* Packet size violate MTU size */
-	}
+
 	return false;
 }
 
@@ -232,7 +243,7 @@ static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af,
 			return true;
 
 		if (unlikely(ip_hdr(skb)->frag_off & htons(IP_DF) &&
-			     skb->len > mtu && !skb_is_gso(skb) &&
+			     ip_vs_exceeds_mtu(skb, mtu) &&
 			     !ip_vs_iph_icmp(ipvsh))) {
 			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
 				  htonl(mtu));
-- 
2.47.3


^ permalink raw reply related

* [PATCH net 09/11] netfilter: nf_tables: use list_del_rcu for netlink hooks
From: Pablo Neira Ayuso @ 2026-04-16 13:14 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
In-Reply-To: <20260416131453.308611-1-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

nft_netdev_unregister_hooks() and __nft_unregister_flowtable_net_hooks()
need to use list_del_rcu() because this list can be walked by concurrent
dumpers.

Add a new helper and use it consistently.

Fixes: f9a43007d3f7 ("netfilter: nf_tables: double hook unregistration in netns path")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 44 ++++++++++++++---------------------
 1 file changed, 18 insertions(+), 26 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 8c42247a176c..090d4d688a33 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -374,6 +374,12 @@ static void nft_netdev_hook_free_rcu(struct nft_hook *hook)
 	call_rcu(&hook->rcu, __nft_netdev_hook_free_rcu);
 }
 
+static void nft_netdev_hook_unlink_free_rcu(struct nft_hook *hook)
+{
+	list_del_rcu(&hook->list);
+	nft_netdev_hook_free_rcu(hook);
+}
+
 static void nft_netdev_unregister_hooks(struct net *net,
 					struct list_head *hook_list,
 					bool release_netdev)
@@ -384,10 +390,8 @@ static void nft_netdev_unregister_hooks(struct net *net,
 	list_for_each_entry_safe(hook, next, hook_list, list) {
 		list_for_each_entry(ops, &hook->ops_list, list)
 			nf_unregister_net_hook(net, ops);
-		if (release_netdev) {
-			list_del(&hook->list);
-			nft_netdev_hook_free_rcu(hook);
-		}
+		if (release_netdev)
+			nft_netdev_hook_unlink_free_rcu(hook);
 	}
 }
 
@@ -2323,10 +2327,8 @@ void nf_tables_chain_destroy(struct nft_chain *chain)
 
 		if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) {
 			list_for_each_entry_safe(hook, next,
-						 &basechain->hook_list, list) {
-				list_del_rcu(&hook->list);
-				nft_netdev_hook_free_rcu(hook);
-			}
+						 &basechain->hook_list, list)
+				nft_netdev_hook_unlink_free_rcu(hook);
 		}
 		module_put(basechain->type->owner);
 		if (rcu_access_pointer(basechain->stats)) {
@@ -3026,6 +3028,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
 				list_for_each_entry(ops, &h->ops_list, list)
 					nf_unregister_net_hook(ctx->net, ops);
 			}
+			/* hook.list is on stack, no need for list_del_rcu() */
 			list_del(&h->list);
 			nft_netdev_hook_free_rcu(h);
 		}
@@ -8903,10 +8906,8 @@ static void __nft_unregister_flowtable_net_hooks(struct net *net,
 	list_for_each_entry_safe(hook, next, hook_list, list) {
 		list_for_each_entry(ops, &hook->ops_list, list)
 			nft_unregister_flowtable_ops(net, flowtable, ops);
-		if (release_netdev) {
-			list_del(&hook->list);
-			nft_netdev_hook_free_rcu(hook);
-		}
+		if (release_netdev)
+			nft_netdev_hook_unlink_free_rcu(hook);
 	}
 }
 
@@ -8977,8 +8978,7 @@ static int nft_register_flowtable_net_hooks(struct net *net,
 
 			nft_unregister_flowtable_ops(net, flowtable, ops);
 		}
-		list_del_rcu(&hook->list);
-		nft_netdev_hook_free_rcu(hook);
+		nft_netdev_hook_unlink_free_rcu(hook);
 	}
 
 	return err;
@@ -8988,10 +8988,8 @@ static void nft_hooks_destroy(struct list_head *hook_list)
 {
 	struct nft_hook *hook, *next;
 
-	list_for_each_entry_safe(hook, next, hook_list, list) {
-		list_del_rcu(&hook->list);
-		nft_netdev_hook_free_rcu(hook);
-	}
+	list_for_each_entry_safe(hook, next, hook_list, list)
+		nft_netdev_hook_unlink_free_rcu(hook);
 }
 
 static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh,
@@ -9079,8 +9077,7 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh,
 				nft_unregister_flowtable_ops(ctx->net,
 							     flowtable, ops);
 		}
-		list_del_rcu(&hook->list);
-		nft_netdev_hook_free_rcu(hook);
+		nft_netdev_hook_unlink_free_rcu(hook);
 	}
 
 	return err;
@@ -9586,13 +9583,8 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx,
 
 static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable)
 {
-	struct nft_hook *hook, *next;
-
 	flowtable->data.type->free(&flowtable->data);
-	list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) {
-		list_del_rcu(&hook->list);
-		nft_netdev_hook_free_rcu(hook);
-	}
+	nft_hooks_destroy(&flowtable->hook_list);
 	kfree(flowtable->name);
 	module_put(flowtable->data.type->owner);
 	kfree(flowtable);
-- 
2.47.3


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox