Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [PATCH net-next v3 12/12] selftests: net: Add a test for BIG TCP in UDP tunnels
From: Willem de Bruijn @ 2026-04-16 12:06 UTC (permalink / raw)
  To: Alice Mikityanska, Daniel Borkmann, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Xin Long, Willem de Bruijn,
	David Ahern, Nikolay Aleksandrov
  Cc: Shuah Khan, Stanislav Fomichev, Andrew Lunn, Simon Horman,
	Florian Westphal, netdev, Alice Mikityanska
In-Reply-To: <20260410150943.993350-13-alice.kernel@fastmail.im>

Alice Mikityanska wrote:
> From: Alice Mikityanska <alice@isovalent.com>
> 
> The test sets up VXLAN and GENEVE tunnels over IPv4 and IPv6 and runs
> IPv4 and IPv6 traffic through them with BIG TCP enabled. It checks that
> a non-negligible amount of big aggregated packets are seen in tcpdump.
> 
> Signed-off-by: Alice Mikityanska <alice@isovalent.com>
> ---
>  tools/testing/selftests/net/Makefile          |   1 +
>  .../testing/selftests/net/big_tcp_tunnels.sh  | 145 ++++++++++++++++++
>  2 files changed, 146 insertions(+)
>  create mode 100755 tools/testing/selftests/net/big_tcp_tunnels.sh
> 
> diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
> index cab74ebdaced..c8ea9d4bb94f 100644
> --- a/tools/testing/selftests/net/Makefile
> +++ b/tools/testing/selftests/net/Makefile
> @@ -13,6 +13,7 @@ TEST_PROGS := \
>  	arp_ndisc_untracked_subnets.sh \
>  	bareudp.sh \
>  	big_tcp.sh \
> +	big_tcp_tunnels.sh \
>  	bind_bhash.sh \
>  	bpf_offload.py \
>  	bridge_vlan_dump.sh \
> diff --git a/tools/testing/selftests/net/big_tcp_tunnels.sh b/tools/testing/selftests/net/big_tcp_tunnels.sh
> new file mode 100755
> index 000000000000..b819911519ac
> --- /dev/null
> +++ b/tools/testing/selftests/net/big_tcp_tunnels.sh
> @@ -0,0 +1,145 @@
> +#!/usr/bin/env bash
> +# SPDX-License-Identifier: GPL-2.0
> +#
> +# Testing for IPv4 and IPv6 BIG TCP over VXLAN and GENEVE tunnels.
> +
> +SERVER_NS=$(mktemp -u server-XXXXXXXX)
> +SERVER_IP4="192.168.1.1"
> +SERVER_IP6="2001:db8::1:1"
> +SERVER_IP4_TUN="192.168.2.1"
> +SERVER_IP6_TUN="2001:db8::2:1"
> +
> +CLIENT_NS=$(mktemp -u client-XXXXXXXX)
> +CLIENT_IP4="192.168.1.2"
> +CLIENT_IP6="2001:db8::1:2"
> +CLIENT_IP4_TUN="192.168.2.2"
> +CLIENT_IP6_TUN="2001:db8::2:2"
> +
> +PACKETS_THRESHOLD=10000
> +
> +# Kselftest framework requirement - SKIP code is 4.
> +ksft_skip=4
> +
> +setup() {
> +	ip netns add "$SERVER_NS"
> +	ip netns add "$CLIENT_NS"
> +	ip -netns "$SERVER_NS" link add link1 type veth peer name link0 netns "$CLIENT_NS"
> +
> +	ip -netns "$CLIENT_NS" link set link0 up
> +	ip -netns "$CLIENT_NS" addr replace "$CLIENT_IP4/24" dev link0
> +	ip -netns "$CLIENT_NS" addr replace "$CLIENT_IP6/112" dev link0 nodad
> +	ip -netns "$CLIENT_NS" link set link0 \
> +		gso_max_size 196608 gso_ipv4_max_size 196608 \
> +		gro_max_size 196608 gro_ipv4_max_size 196608
> +	ip -netns "$SERVER_NS" link set link1 up
> +	ip -netns "$SERVER_NS" addr replace "$SERVER_IP4/24" dev link1
> +	ip -netns "$SERVER_NS" addr replace "$SERVER_IP6/112" dev link1 nodad
> +	ip -netns "$SERVER_NS" link set link1 \
> +		gso_max_size 196608 gso_ipv4_max_size 196608 \
> +		gro_max_size 196608 gro_ipv4_max_size 196608
> +
> +	ip netns exec "$SERVER_NS" netserver >/dev/null
> +}
> +
> +setup_tunnel() {
> +	if [ "$2" = 4 ]; then
> +		SERVER_IP="$SERVER_IP4"
> +		CLIENT_IP="$CLIENT_IP4"
> +		echo "Setting up ${1^^} over IPv4"
> +	else
> +		SERVER_IP="$SERVER_IP6"
> +		CLIENT_IP="$CLIENT_IP6"
> +		echo "Setting up ${1^^} over IPv6"
> +	fi
> +
> +	if [ "$1" = vxlan ]; then
> +		ip -netns "$CLIENT_NS" link add tun0 type vxlan \
> +			id 5001 remote "$SERVER_IP" local "$CLIENT_IP" dev link0 dstport 4789
> +	else
> +		ip -netns "$CLIENT_NS" link add tun0 type geneve \
> +			id 5001 remote "$SERVER_IP"
> +	fi
> +	ip -netns "$CLIENT_NS" link set tun0 up
> +	ip -netns "$CLIENT_NS" addr replace "$CLIENT_IP4_TUN/24" dev tun0
> +	ip -netns "$CLIENT_NS" addr replace "$CLIENT_IP6_TUN/112" dev tun0 nodad
> +	ip -netns "$CLIENT_NS" link set tun0 \
> +		gso_max_size 196608 gso_ipv4_max_size 196608 \
> +		gro_max_size 196608 gro_ipv4_max_size 196608
> +	if [ "$1" = vxlan ]; then
> +		ip -netns "$SERVER_NS" link add tun1 type vxlan \
> +			id 5001 remote "$CLIENT_IP" local "$SERVER_IP" dev link1 dstport 4789
> +	else
> +		ip -netns "$SERVER_NS" link add tun1 type geneve \
> +			id 5001 remote "$CLIENT_IP"
> +	fi
> +	ip -netns "$SERVER_NS" link set tun1 up
> +	ip -netns "$SERVER_NS" addr replace "$SERVER_IP4_TUN/24" dev tun1
> +	ip -netns "$SERVER_NS" addr replace "$SERVER_IP6_TUN/112" dev tun1 nodad
> +	ip -netns "$SERVER_NS" link set tun1 \
> +		gso_max_size 196608 gso_ipv4_max_size 196608 \
> +		gro_max_size 196608 gro_ipv4_max_size 196608
> +}
> +
> +cleanup_tunnel() {
> +	ip -netns "$CLIENT_NS" link del tun0
> +	ip -netns "$SERVER_NS" link del tun1
> +}
> +
> +cleanup() {
> +	ip netns exec "$SERVER_NS" killall netserver
> +	ip netns del "$SERVER_NS"
> +	ip netns del "$CLIENT_NS"
> +}
> +
> +do_test() {
> +	exec 3< <(ip netns exec "$SERVER_NS" tcpdump -nn -i link1 greater 65536 2> /dev/null)
> +	TCPDUMP_SERVER_PID="$!"
> +	exec 4< <(wc -l <&3)
> +	exec 5< <(ip netns exec "$CLIENT_NS" tcpdump -nn -i link0 greater 65536 2> /dev/null)
> +	TCPDUMP_CLIENT_PID="$!"
> +	exec 6< <(wc -l <&5)
> +
> +	if [ "$1" = 4 ]; then
> +		SERVER_IP="$SERVER_IP4_TUN"
> +		echo "Running IPv4 traffic in the tunnel"
> +	else
> +		SERVER_IP="$SERVER_IP6_TUN"
> +		echo "Running IPv6 traffic in the tunnel"
> +	fi
> +
> +	ip netns exec "$CLIENT_NS" netperf -t TCP_STREAM -l 5 -H "$SERVER_IP" -- \
> +		-r 80000:80000 > /dev/null

Is -r valid for TCP_STREAM?

> +	kill "$TCPDUMP_SERVER_PID" "$TCPDUMP_CLIENT_PID"
> +	wait "$TCPDUMP_SERVER_PID" "$TCPDUMP_CLIENT_PID"
> +	PACKETS_SERVER=$(cat <&4)
> +	PACKETS_CLIENT=$(cat <&6)
> +	exec 3>&- 4>&- 5>&- 6>&-
> +
> +	# One line is empty, each packet is two lines (inner and outer).
> +	echo "Captured BIG TCP GRO packets: $(((PACKETS_SERVER - 1) / 2))"
> +	echo "Captured BIG TCP GSO packets: $(((PACKETS_CLIENT - 1) / 2))"
> +	[ "$PACKETS_SERVER" -gt "$(( PACKETS_THRESHOLD * 2 + 1))" ] || return 1
> +	[ "$PACKETS_CLIENT" -gt "$(( PACKETS_THRESHOLD * 2 + 1))" ] || return 1
> +}
> +
> +if ! netperf -V &> /dev/null; then
> +	echo "SKIP: Could not run test without netperf tool"
> +	exit "$ksft_skip"
> +fi
> +
> +if ! ip link help 2>&1 | grep gso_ipv4_max_size &> /dev/null; then
> +	echo "SKIP: Could not run test without gso/gro_ipv4_max_size supported in ip-link"
> +	exit "$ksft_skip"
> +fi
> +
> +trap cleanup EXIT
> +setup
> +for tunnel in vxlan geneve; do
> +	for tun_family in 4 6; do
> +		for traffic_family in 4 6; do
> +			setup_tunnel "$tunnel" "$tun_family" || exit "$?"
> +			do_test "$traffic_family" || exit "$?"
> +			cleanup_tunnel
> +		done
> +	done
> +done
> -- 
> 2.53.0
> 



^ permalink raw reply

* Re: [PATCH net-next 5/6] net: stmmac: move PHY handling out of __stmmac_open()/release()
From: Alexander Stein @ 2026-04-16 12:02 UTC (permalink / raw)
  To: Russell King (Oracle)
  Cc: Andrew Lunn, Heiner Kallweit, Alexandre Torgue, Andrew Lunn,
	David S. Miller, Eric Dumazet, Jakub Kicinski, linux-arm-kernel,
	linux-stm32, Maxime Coquelin, netdev, Paolo Abeni
In-Reply-To: <aeC-tc2CooYDoBok@shell.armlinux.org.uk>

Hi Russell,

Am Donnerstag, 16. April 2026, 12:49:25 CEST schrieb Russell King (Oracle):
> On Thu, Apr 16, 2026 at 08:20:13AM +0200, Alexander Stein wrote:
> > Am Mittwoch, 15. April 2026, 14:59:32 CEST schrieb Russell King (Oracle):
> > > On Wed, Apr 15, 2026 at 08:08:40AM +0200, Alexander Stein wrote:
> > > > Hi,
> > > > 
> > > > Am Dienstag, 23. September 2025, 13:26:19 CEST schrieb Russell King (Oracle):
> > > > > Move the PHY attachment/detachment from the network driver out of
> > > > > __stmmac_open() and __stmmac_release() into stmmac_open() and
> > > > > stmmac_release() where these actions will only happen when the
> > > > > interface is administratively brought up or down. It does not make
> > > > > sense to detach and re-attach the PHY during a change of MTU.
> > > > 
> > > > Sorry for coming up now. But I recently noticed this commit breaks changing
> > > > the MTU on i.MX8MP. Once I simply change the MTU I run into some DMA error:
> > > > $ ip link set dev end1 mtu 1400
> > > > imx-dwmac 30bf0000.ethernet end1: Register MEM_TYPE_PAGE_POOL RxQ-0
> > > > imx-dwmac 30bf0000.ethernet end1: Register MEM_TYPE_PAGE_POOL RxQ-1
> > > > imx-dwmac 30bf0000.ethernet end1: Register MEM_TYPE_PAGE_POOL RxQ-2
> > > > imx-dwmac 30bf0000.ethernet end1: Register MEM_TYPE_PAGE_POOL RxQ-3
> > > > imx-dwmac 30bf0000.ethernet end1: Register MEM_TYPE_PAGE_POOL RxQ-4
> > > > imx-dwmac 30bf0000.ethernet end1: Link is Down
> > > > imx-dwmac 30bf0000.ethernet end1: Failed to reset the dma
> > > > imx-dwmac 30bf0000.ethernet end1: stmmac_hw_setup: DMA engine initialization failed
> > > 
> > > This basically means that a clock is missing. Please provide more
> > > information:
> > > 
> > > - what kernel version are you using?
> > 
> > Currently I am using v6.18.22.
> > $ ethtool -i end1
> > driver: st_gmac
> > version: 6.18.22
> > firmware-version: 
> > expansion-rom-version: 
> > bus-info: 30bf0000.ethernet
> > supports-statistics: yes
> > supports-test: no
> > supports-eeprom-access: no
> > supports-register-dump: yes
> > supports-priv-flags: no
> > 
> > > - has EEE been negotiated?
> > 
> > No. It is marked as not supported
> > 
> > $ ethtool --show-eee end1
> > EEE settings for end1:
> >         EEE status: not supported
> > 
> > > - does the problem persist when EEE is disabled?
> > 
> > As EEE is not supported the problem occurs even with EEE disabled.
> > 
> > > - which PHY is attached to stmmac?
> > 
> > It is a TI DP83867.
> > 
> > imx-dwmac 30bf0000.ethernet eth1: PHY [stmmac-1:03] driver [TI DP83867] (irq=136)
> > 
> > > - which PHY interface mode is being used to connect the PHY to stmmac?
> > 
> > For this interface
> > > phy-mode = "rgmii-id";
> > is set.
> > 
> > In case it is helpful. My platform is arch/arm64/boot/dts/freescale/imx8mp-tqma8mpql-mba8mpxl.dts
> > Thanks for assisting. If there are further questions, don't hesitate to ask.
> 
> Thanks.
> 
> So, as best I can determine at the moment, we end up with the following
> sequence:
> 
> stmmac_change_mtu()
>  __stmmac_release()
>   phylink_stop()
>    phy_stop()
>     phy->state = PHY_HALTED
>     _phy_state_machine() returns PHY_STATE_WORK_SUSPEND
>     _phy_state_machine_post_work()
>      phy_suspend()
>       genphy_suspend()
>        phy_set_bits(phydev, MII_BMCR, BMCR_PDOWN)
> 
> With the DP83867, this causes most of the PHY to be powered down, thus
> stopping the clocks, and this causes the stmmac reset to time out.
> 
> Prior to this commit, we would have called phylink_disconnect_phy()
> immediately after phylink_stop(), but I can see nothing that would
> be affected by this change there (since that also calls
> phy_suspend(), but as the PHY is already suspended, this becomes a
> no-op.)
> 
> However, __stmmac_open() would have called stmmac_init_phy(), which
> would reattach the PHY. This would have called phy_init_hw(), 
> resetting the PHY, and phy_resume() which would ensure that the
> PDOWN bit is clear - thus clocks would be running.
> 
> As a hack, please can you try calling phylink_prepare_resume()
> between the __stmmac_release() and __stmmac_open() in
> stmmac_change_mtu(). This should resume the PHY, thus restoring the
> clocks necessary for stmmac to reset.

I tried the following patch. This works as you suspected.

Thanks and best regards
Alexander

---8<---
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -5875,6 +5875,8 @@ static int stmmac_change_mtu(struct net_device *dev, int new_mtu)
 
                __stmmac_release(dev);
 
+               phylink_prepare_resume(priv->phylink);
+
                ret = __stmmac_open(dev, dma_conf);
                if (ret) {
                        free_dma_desc_resources(priv, dma_conf);
---8<---

-- 
TQ-Systems GmbH | Mühlstraße 2, Gut Delling | 82229 Seefeld, Germany
Amtsgericht München, HRB 105018
Geschäftsführer: Detlef Schneider, Rüdiger Stahl, Stefan Schneider
http://www.tq-group.com/



^ permalink raw reply

* Re: [PATCH net-next v3 2/9] dt-bindings: net: lan9645x: add LAN9645X switch bindings
From: Rob Herring (Arm) @ 2026-04-16 12:00 UTC (permalink / raw)
  To: Jens Emil Schulz Østergaard
  Cc: Steen Hegelund, Krzysztof Kozlowski, devicetree, Jakub Kicinski,
	linux-kernel, Vladimir Oltean, Woojung Huh, Conor Dooley,
	Russell King, David S. Miller, netdev, UNGLinuxDriver,
	Simon Horman, Daniel Machon, Paolo Abeni, Eric Dumazet,
	Andrew Lunn
In-Reply-To: <20260410-dsa_lan9645x_switch_driver_base-v3-2-aadc8595306d@microchip.com>


On Fri, 10 Apr 2026 13:48:38 +0200, Jens Emil Schulz Østergaard wrote:
> Add bindings for LAN9645X switch. We use a fallback compatible for the
> smallest SKU microchip,lan96455s-switch.
> 
> Reviewed-by: Steen Hegelund <Steen.Hegelund@microchip.com>
> Signed-off-by: Jens Emil Schulz Østergaard <jensemil.schulzostergaard@microchip.com>
> ---
> Changes in v3:
> - remove additionalProperties: true
> - remove unnecessary | from description
> - change top level $ref to dsa.yaml#/$defs/ethernet-ports
> - use ethernet-ports and ethernet-port
> - move ethernet-ports under properties instead of patternProperties
> - move unevaluatedProperties: false after $ref
> - update example to use ethernet-ports and ethernet-port
> 
> Changes in v2:
> - rename file to microchip,lan96455s-switch.yaml
> - remove led vendor property
> - add {rx,tx}-internal-delay-ps for rgmii delay
> - remove labels from example
> - remove container node from example
> ---
>  .../net/dsa/microchip,lan96455s-switch.yaml        | 111 +++++++++++++++++++++
>  MAINTAINERS                                        |   1 +
>  2 files changed, 112 insertions(+)
> 

Reviewed-by: Rob Herring (Arm) <robh@kernel.org>


^ permalink raw reply

* Re: [PATCH net v1 1/2] nexthop: fix IPv6 route referencing IPv4 nexthop
From: patchwork-bot+netdevbpf @ 2026-04-16 12:00 UTC (permalink / raw)
  To: Jiayuan Chen
  Cc: netdev, dsahern, davem, edumazet, kuba, pabeni, horms, shuah,
	linux-kernel, linux-kselftest
In-Reply-To: <20260413114522.147784-1-jiayuan.chen@linux.dev>

Hello:

This series was applied to netdev/net.git (main)
by Paolo Abeni <pabeni@redhat.com>:

On Mon, 13 Apr 2026 19:45:19 +0800 you wrote:
> syzbot reported a panic [1] [2].
> 
> When an IPv6 nexthop is replaced with an IPv4 nexthop, the has_v4 flag
> of all groups containing this nexthop is not updated. This is because
> nh_group_v4_update is only called when replacing AF_INET to AF_INET6,
> but the reverse direction (AF_INET6 to AF_INET) is missed.
> 
> [...]

Here is the summary with links:
  - [net,v1,1/2] nexthop: fix IPv6 route referencing IPv4 nexthop
    https://git.kernel.org/netdev/net/c/29c95185ba32
  - [net,v1,2/2] selftests: fib_nexthops: test stale has_v4 on nexthop replace
    https://git.kernel.org/netdev/net/c/104f082f5ed6

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply

* Re: NULL pointer dereference in map_kptr_match_type when storing scalar values into kptr slots
From: Mykyta Yatsenko @ 2026-04-16 11:58 UTC (permalink / raw)
  To: Hiker Cl, bpf; +Cc: linux-kernel, netdev
In-Reply-To: <CAGM=xGABGeeGVU7hy_mRr_rp377dBzVVAOpkLnuYKb8XyEs7Hg@mail.gmail.com>

Hiker Cl <clhiker365@gmail.com> writes:

> Hi BPF maintainers,
>
> I'm reporting a bug I encountered in the BPF subsystem on Linux kernel
> version 7.0.0-g1f5ffc672165.
>
> ### Summary
> A NULL pointer dereference vulnerability was discovered in the eBPF
> verifier. A local user can trigger this by loading a BPF program that
> attempts to store a scalar value (non-pointer) into a map slot
> designated as a kptr (kernel pointer). This leads to an immediate
> kernel crash (DoS).
> ### Environment
> - Kernel version: 7.0.0-rc6 (Commit: 71b500afd2f7 from bpf-next tree),
> 7.0.0-g1f5ffc672165 (Commit: 1f5ffc672165 from linux tree)
> - Architecture: x86_64
> - Config: BPF_SYSCALL=y, DEBUG_INFO_BTF=y
>
> ### Steps to Reproduce (poc.c)
> #include "vmlinux.h"
> #include <bpf/bpf_helpers.h>
> /* BTF type tags for kptrs */
> #ifndef __kptr_untrusted
> #define __kptr_untrusted __attribute__((btf_type_tag("kptr_untrusted")))
> #endif
> struct map_value {
> struct task_struct __kptr_untrusted *ptr;
> };
> struct {
> __uint(type, BPF_MAP_TYPE_LRU_HASH);
> __uint(max_entries, 1);
> __type(key, int);
> __type(value, struct map_value);
> } crashing_map SEC(".maps");
> SEC("kprobe/htab_map_get_next_key")
> int trigger_crash(struct pt_regs *ctx)
> {
> int key = 0;
> u64 *val = bpf_map_lookup_elem(&crashing_map, &key);
> if (val) {
> /*
> * Trigger: Store a scalar (non-pointer) into a slot
> * designated as a kptr. The verifier's map_kptr_match_type
> * fails to handle the NULL reg->btf for scalars.
> */
> *val = 0xdeadbeef;
> }
> return 0;
> }
> char LICENSE[] SEC("license") = "GPL";
>
> ### Kernel Log Extract
> [   91.277247][ T7627] Oops: general protection fault, probably for
> non-canonical address 0xdffffc0000I
> [   91.279715][ T7627] KASAN: null-ptr-deref in range
> [0x00000000000000e8-0x00000000000000ef]
> [   91.280906][ T7627] CPU: 0 UID: 0 PID: 7627 Comm: bpftool Not
> tainted 7.0.0-g1f5ffc672165 #5 PREEMPT(full)
> [   91.282421][ T7627] Hardware name: QEMU Standard PC (i440FX + PIIX,
> 1996), BIOS 1.15.0-1 04/01/2014
> [   91.283556][ T7627] RIP: 0010:btf_is_kernel+0x2a/0x50
> ...
>
> ### Actual Results
> The kernel crashes during the verification phase. The verifier calls
> `map_kptr_match_type`, which subsequently calls
> `btf_is_kernel(reg->btf)`. Since the source register is a scalar,
> `reg->btf` is NULL, leading to a NULL pointer dereference.
>
> Detailed info including reproducible BPF program and kernel logs have
> been filed on Bugzilla:
>
>   https://bugzilla.kernel.org/show_bug.cgi?id=221372
>
> Please let me know if you need more information or if I can help test
> a patch.

Thanks for reporting the issue, I can reproduce it.
Looks like a simple fix resolves it:

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9882475ee9da..91aa51a19c91 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4544,6 +4544,9 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
        int perm_flags;
        const char *reg_name = "";
 
+       if (base_type(reg->type) != PTR_TO_BTF_ID)
+               goto bad_type;
+
        if (btf_is_kernel(reg->btf)) {
                perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU;
 
@@ -4556,7 +4559,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
                        perm_flags |= MEM_PERCPU;
        }
 
-       if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags))
+       if (type_flag(reg->type) & ~perm_flags)
                goto bad_type;

^ permalink raw reply related

* Re: [PATCH v3 1/3] net: dsa: microchip: implement KSZ87xx Module 3 low-loss cable errata
From: Fidelio LAWSON @ 2026-04-16 11:53 UTC (permalink / raw)
  To: Marek Vasut, Andrew Lunn
  Cc: Woojung Huh, UNGLinuxDriver, Vladimir Oltean, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Maxime Chevallier,
	Simon Horman, Heiner Kallweit, Russell King, netdev, linux-kernel,
	Fidelio Lawson
In-Reply-To: <712cc46a-5ceb-4f0f-88bb-fa0a47002258@nabladev.com>

On 4/14/26 17:49, Marek Vasut wrote:
> On 4/14/26 2:40 PM, Andrew Lunn wrote:
>> On Tue, Apr 14, 2026 at 01:05:49PM +0200, Marek Vasut wrote:
>>> On 4/14/26 11:12 AM, Fidelio Lawson wrote:
>>>> Implement the "Module 3: Equalizer fix for short cables" erratum from
>>>> Microchip document DS80000687C for KSZ87xx switches.
>>>>
>>>> The issue affects short or low-loss cable links (e.g. CAT5e/CAT6),
>>>> where the PHY receiver equalizer may amplify high-amplitude signals
>>>> excessively, resulting in internal distortion and link establishment
>>>> failures.
>>>>
>>>> KSZ87xx devices require a workaround for the Module 3 low-loss cable
>>>> condition, controlled through the switch TABLE_LINK_MD_V indirect
>>>> registers.
>>>>
>>>> The affected registers are part of the switch address space and are not
>>>> directly accessible from the PHY driver. To keep the PHY-facing API
>>>> clean and avoid leaking switch-specific details, model this errata
>>>> control as vendor-specific Clause 22 PHY registers.
>>>>
>>>> A vendor-specific Clause 22 PHY register is introduced as a mode
>>>> selector in PHY_REG_LOW_LOSS_CTRL, and ksz8_r_phy() / ksz8_w_phy()
>>>> translate accesses to these bits into the appropriate indirect
>>>> TABLE_LINK_MD_V accesses.
>>>>
>>>> The control register defines the following modes:
>>>> 0: disabled (default behavior)
>>>> 1: EQ training workaround
>>>> 2: LPF 90 MHz
>>>> 3: LPF 62 MHz
>>>> 4: LPF 55 MHz
>>>> 5: LPF 44 MHz
>>> I may not fully understand this, but aren't the EQ and LPF settings
>>> orthogonal ?
>>
>> What is the real life experience using this feature? Is it needed for
>> 1cm cables, but most > 1m cables are O.K with the defaults? Do we need
>> all these configuration options? How is a user supposed to discover
>> the different options? Can we simplify it down to a Boolean?
> 
> The report I got was, that if the device is cooled down AND the user 
> used special short low-loss CAT6 cable, then there was packet loss until 
> the communication completely broke down.
> 
> With the LPF set to 62 MHz and DSP EQ initial value set to 0, that 
> situation improved and there was still up to 0.14% packet loss, but it 
> is better than total breakdown of communication. We couldn't get the 
> packet loss down to 0% no matter which tuning we applied.
> 
>> Ethernet is just supposed to work with any valid length of cable,
>> KISS. So maybe we should try to keep this feature KISS. Just tell the
>> driver it is a short cable, pick different defaults which should work
>> with any short cable?
> 
> I think the user should be able to configure the LPF bandwidth and DSP 
> EQ initial value as needed. While the short cable improvement settings 
> are "LPF set to 62 MHz bandwidth and DSP EQ initial value to 0", there 
> might be future configurations which require different settings.
> 
> I think the ideal setup would be if those two settings were configurable 
> separately, with a bit of documentation explaining the two currently 
> known good settings:
> - Default (LPF 90 MHz BW, DSP EQ initial value as needed)
> - Short cable (LPF 62 MHz BW, DSP EQ initial value 0)
> But if the user needs to reduce the BW further e.g. to improve noise 
> resistance further, they shouldn't be prevented from doing so.
> 
>> A boolean should also help with making this tunable reusable with
>> other devices. It is unlikely any other devices have these same
>> configuration options, unless it is from the same vendor.
> Could the LPF PHY tunable simply take integer as a parameter ? Then it 
> would be portable across other PHYs I think ?
> 
> The DSP EQ initial value can also be an integer tunable.

Yes, I think a reasonable compromise could be to expose three tunables:

- a boolean "short-cable" tunable, which applies the known good settings
   (LPF 62 MHz BW, DSP EQ initial value 0).

- an integer LPF bandwidth tunable, for advanced use cases where further
   tuning is needed;

- an integer DSP EQ initial value tunable, for the same advanced cases.

The boolean tunable would follow the KISS principle and cover the common
scenario, while the more granular controls would remain optional.

What do you think?



^ permalink raw reply

* Re: [PATCH net v6 2/2] ice: fix missing dpll notifications for SW pins
From: Jiri Pirko @ 2026-04-16 11:46 UTC (permalink / raw)
  To: Petr Oros
  Cc: netdev, Vadim Fedorenko, Arkadiusz Kubalewski, Tony Nguyen,
	Przemek Kitszel, Andrew Lunn, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman, linux-kernel,
	intel-wired-lan, Aleksandr Loktionov, Ivan Vecera, Rinitha S,
	Michal Schmidt, Jacob Keller
In-Reply-To: <20260416113952.389405-3-poros@redhat.com>

Thu, Apr 16, 2026 at 01:39:52PM +0200, poros@redhat.com wrote:
>The SMA/U.FL pin redesign (commit 2dd5d03c77e2 ("ice: redesign dpll
>sma/u.fl pins control")) introduced software-controlled pins that wrap
>backing CGU input/output pins, but never updated the notification and
>data paths to propagate pin events to these SW wrappers.
>
>The periodic work sends dpll_pin_change_ntf() only for direct CGU input
>pins.  SW pins that wrap these inputs never receive change or phase
>offset notifications, so userspace consumers such as synce4l monitoring
>SMA pins via dpll netlink never learn about state transitions or phase
>offset updates.  Similarly, ice_dpll_phase_offset_get() reads the SW
>pin's own phase_offset field which is never updated; the PPS monitor
>writes to the backing CGU input's field instead.
>
>On top of that, when SMA or U.FL pin state changes via PCA9575 GPIO
>write, the paired pin's state also changes because they share physical
>signal paths, but no notification is sent for the peer pin.
>
>Fix by introducing ice_dpll_pin_ntf(), a wrapper around
>dpll_pin_change_ntf() that also notifies any registered SMA/U.FL pin
>whose backing CGU input matches.  Replace all direct
>dpll_pin_change_ntf() calls in the periodic notification paths with
>this wrapper.  Fix ice_dpll_phase_offset_get() to return the backing
>CGU input's phase_offset for input-direction SW pins.  Add
>ice_dpll_sw_pin_notify_peer() to send a notification for the paired
>SW pin after PCA9575 writes.  The peer notification is called from
>the dpll_pin_ops callback wrappers after pf->dplls.lock is released,
>because dpll_pin_change_ntf() sends a netlink message that invokes
>driver callbacks which acquire the same lock.
>
>Fixes: 2dd5d03c77e2 ("ice: redesign dpll sma/u.fl pins control")
>Signed-off-by: Petr Oros <poros@redhat.com>
>---
>v6:
> - fix deadlock reported by Michal Schmidt: dpll_pin_change_ntf() in
>   peer notification was called with dpll_lock held, causing deadlock.
>   Move the peer notification calls out of ice_dpll_sma_direction_set()
>   and ice_dpll_ufl_pin_state_set() into their dpll_pin_ops callback
>   wrappers, after pf->dplls.lock is released, and use
>   __dpll_pin_change_ntf() because dpll_lock is still held by the dpll
>   netlink layer (dpll_pin_pre_doit).
>v5: https://lore.kernel.org/all/20260409102501.1447628-1-poros@redhat.com/
> - add ice_dpll_sw_pin_notify_peer() for SMA/U.FL peer notification
>   when PCA9575 routing changes affect the paired pin (reported by
>   Intel test: SMA state change did not log U.FL status change in
>   subscribe monitor)
>v4: https://lore.kernel.org/all/20260319205256.998876-1-poros@redhat.com/
>v3: https://lore.kernel.org/all/20260220140700.2910174-1-poros@redhat.com/
>v2: https://lore.kernel.org/all/20260219131500.2271897-1-poros@redhat.com/
>v1: https://lore.kernel.org/all/20260218211414.1411163-1-poros@redhat.com/
>---
> drivers/net/ethernet/intel/ice/ice_dpll.c | 80 +++++++++++++++++++----
> 1 file changed, 68 insertions(+), 12 deletions(-)

Can this patch be split to 2? Looks like 2 fixes to me.

^ permalink raw reply

* Re: [PATCH net v6 2/2] ice: fix missing dpll notifications for SW pins
From: Jiri Pirko @ 2026-04-16 11:43 UTC (permalink / raw)
  To: Petr Oros
  Cc: netdev, Vadim Fedorenko, Arkadiusz Kubalewski, Tony Nguyen,
	Przemek Kitszel, Andrew Lunn, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman, linux-kernel,
	intel-wired-lan, Aleksandr Loktionov, Ivan Vecera, Rinitha S,
	Michal Schmidt, Jacob Keller
In-Reply-To: <20260416113952.389405-3-poros@redhat.com>

Thu, Apr 16, 2026 at 01:39:52PM +0200, poros@redhat.com wrote:

[..]

>@@ -1233,7 +1259,6 @@ static int ice_dpll_sma_direction_set(struct ice_dpll_pin *p,
> 			ret = ice_dpll_pin_state_update(p->pf, target,
> 							type, extack);
> 	}
>-

?

> 	return ret;
> }
> 

[..]

^ permalink raw reply

* Re: [PATCH net-next v2] net/smc: cap allocation order for SMC-R physically contiguous buffers
From: Sidraya Jayagond @ 2026-04-16 11:41 UTC (permalink / raw)
  To: D. Wythe, David S. Miller, Dust Li, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Wenjia Zhang
  Cc: Mahanta Jambigi, Simon Horman, Tony Lu, Wen Gu, linux-kernel,
	linux-rdma, linux-s390, netdev, oliver.yang, pasic
In-Reply-To: <20260407124337.88128-1-alibuda@linux.alibaba.com>



On 07/04/26 6:13 pm, D. Wythe wrote:
> The alloc_pages() cannot satisfy requests exceeding MAX_PAGE_ORDER,
> and attempting such allocations will lead to guaranteed failures
> and potential kernel warnings.
> 
> For SMCR_PHYS_CONT_BUFS, cap the allocation order to MAX_PAGE_ORDER.
> This ensures the attempts to allocate the largest possible physically
> contiguous chunk succeed, instead of failing with an invalid order.
> This also avoids redundant "try-fail-degrade" cycles in
> __smc_buf_create().
> 
> For SMCR_MIXED_BUFS, no cap is needed: if the order exceeds
> MAX_PAGE_ORDER, alloc_pages() will silently fail (__GFP_NOWARN)
> and automatically fall back to virtual memory.
> 
> Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
> Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
> ---
> Changes v1 -> v2:
> https://lore.kernel.org/netdev/20260312082154.36971-1-alibuda@linux.alibaba.com/
> 
> - Move the bufsize cap from smcr_new_buf_create() up to
>   __smc_buf_create(), which is simpler and avoids touching
>   the allocation logic itself.
> ---
>  net/smc/smc_core.c | 4 ++++
>  1 file changed, 4 insertions(+)
> 
> diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
> index e2d083daeb7e..cdd881746e21 100644
> --- a/net/smc/smc_core.c
> +++ b/net/smc/smc_core.c
> @@ -2440,6 +2440,10 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
>  		/* use socket send buffer size (w/o overhead) as start value */
>  		bufsize = smc->sk.sk_sndbuf / 2;
>  
> +	/* limit bufsize for physically contiguous buffers */
> +	if (!is_smcd && lgr->buf_type == SMCR_PHYS_CONT_BUFS)
> +		bufsize = min_t(int, bufsize, (PAGE_SIZE << MAX_PAGE_ORDER));
> +
>  	for (bufsize_comp = smc_compress_bufsize(bufsize, is_smcd, is_rmb);
>  	     bufsize_comp >= 0; bufsize_comp--) {
>  		if (is_rmb) {

Code changes look good to me.
Thanks
Reviewed-by: Sidraya Jayagond <sidraya@linux.ibm.com>

^ permalink raw reply

* [PATCH v2 iwl-net] i40e: keep q_vectors array in sync with channel count changes
From: Maciej Fijalkowski @ 2026-04-16 11:40 UTC (permalink / raw)
  To: intel-wired-lan
  Cc: netdev, magnus.karlsson, kuba, pabeni, horms, przemyslaw.kitszel,
	jacob.e.keller, Maciej Fijalkowski

For the main VSI, i40e_set_num_rings_in_vsi() always derives
num_q_vectors from pf->num_lan_msix. At the same time, ethtool -L stores
the user requested channel count in vsi->req_queue_pairs and the queue
setup path uses that value for the effective number of queue pairs.

This leaves queue and vector counts out of sync after shrinking channel
count via ethtool -L. The active queue configuration is reduced, but the
VSI still keeps the full PF-sized q_vector topology.

That mismatch breaks reconfiguration flows which rely on vector/NAPI
state matching the effective channel configuration. In particular,
toggling /sys/class/net/<dev>/threaded after reducing the channel count
can hang, and later channel-count changes can fail because VSI reinit
does not rebuild q_vectors to match the new vector count.

Fix this by making the main VSI num_q_vectors follow the effective
requested channel count, capped by the available MSI-X vectors. Update
i40e_vsi_reinit_setup() to rebuild q_vectors during VSI reinit so the
vector topology is refreshed together with the ring arrays when channel
count changes.

Keep alloc_queue_pairs unchanged and based on pf->num_lan_qps so the VSI
retains its full queue capacity.

Selftest napi_threaded.py was originally used when Jakub reported hang
on /sys/class/net/<dev>/threaded toggle. In order to make it pass on
i40e, use persistent NAPI configuration for q_vector NAPIs so NAPI
identity and threaded settings survive q_vector reallocation across
channel-count changes. This is achieved by using netif_napi_add_config()
when configuring q_vectors.

$ export NETIF=ens259f1np1
$ sudo -E env PATH="$PATH" ./tools/testing/selftests/drivers/net/napi_threaded.py
TAP version 13
1..3
ok 1 napi_threaded.napi_init
ok 2 napi_threaded.change_num_queues
ok 3 napi_threaded.enable_dev_threaded_disable_napi_threaded
Totals: pass:3 fail:0 xfail:0 xpass:0 skip:0 error:0

Reported-by: Jakub Kicinski <kuba@kernel.org>
Closes: https://lore.kernel.org/intel-wired-lan/20260316133100.6054a11f@kernel.org/
Fixes: d2a69fefd756 ("i40e: Fix changing previously set num_queue_pairs for PFs")
Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
---
v2:
- NULL vsi->tx_rings in i40e_vsi_alloc_arrays() (Sashiko)
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 35 +++++++++++++++++----
 1 file changed, 29 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 926d001b2150..1d2a4181966f 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -11403,10 +11403,14 @@ static void i40e_service_timer(struct timer_list *t)
 static int i40e_set_num_rings_in_vsi(struct i40e_vsi *vsi)
 {
 	struct i40e_pf *pf = vsi->back;
+	u16 qps;
 
 	switch (vsi->type) {
 	case I40E_VSI_MAIN:
 		vsi->alloc_queue_pairs = pf->num_lan_qps;
+		qps = vsi->req_queue_pairs ?
+		      min_t(u16, vsi->req_queue_pairs, pf->num_lan_qps) :
+		      pf->num_lan_qps;
 		if (!vsi->num_tx_desc)
 			vsi->num_tx_desc = ALIGN(I40E_DEFAULT_NUM_DESCRIPTORS,
 						 I40E_REQ_DESCRIPTOR_MULTIPLE);
@@ -11414,7 +11418,8 @@ static int i40e_set_num_rings_in_vsi(struct i40e_vsi *vsi)
 			vsi->num_rx_desc = ALIGN(I40E_DEFAULT_NUM_DESCRIPTORS,
 						 I40E_REQ_DESCRIPTOR_MULTIPLE);
 		if (test_bit(I40E_FLAG_MSIX_ENA, pf->flags))
-			vsi->num_q_vectors = pf->num_lan_msix;
+			vsi->num_q_vectors = max_t(int, 1,
+						   min_t(int, qps, pf->num_lan_msix));
 		else
 			vsi->num_q_vectors = 1;
 
@@ -11503,6 +11508,7 @@ static int i40e_vsi_alloc_arrays(struct i40e_vsi *vsi, bool alloc_qvectors)
 
 err_vectors:
 	kfree(vsi->tx_rings);
+	vsi->tx_rings = NULL;
 	return ret;
 }
 
@@ -12043,7 +12049,8 @@ static int i40e_vsi_alloc_q_vector(struct i40e_vsi *vsi, int v_idx)
 	cpumask_copy(&q_vector->affinity_mask, cpu_possible_mask);
 
 	if (vsi->netdev)
-		netif_napi_add(vsi->netdev, &q_vector->napi, i40e_napi_poll);
+		netif_napi_add_config(vsi->netdev, &q_vector->napi,
+				      i40e_napi_poll, v_idx);
 
 	/* tie q_vector and vsi together */
 	vsi->q_vectors[v_idx] = q_vector;
@@ -14265,12 +14272,27 @@ static struct i40e_vsi *i40e_vsi_reinit_setup(struct i40e_vsi *vsi)
 
 	pf = vsi->back;
 
+	if (test_bit(I40E_FLAG_MSIX_ENA, pf->flags)) {
+		i40e_put_lump(pf->irq_pile, vsi->base_vector, vsi->idx);
+		vsi->base_vector = 0;
+	}
+
 	i40e_put_lump(pf->qp_pile, vsi->base_queue, vsi->idx);
 	i40e_vsi_clear_rings(vsi);
 
-	i40e_vsi_free_arrays(vsi, false);
+	i40e_vsi_free_q_vectors(vsi);
+	i40e_vsi_free_arrays(vsi, true);
 	i40e_set_num_rings_in_vsi(vsi);
-	ret = i40e_vsi_alloc_arrays(vsi, false);
+
+	ret = i40e_vsi_alloc_arrays(vsi, true);
+	if (ret)
+		goto err_vsi;
+
+	/* Rebuild q_vectors during VSI reinit because the effective channel
+	 * count may change num_q_vectors. Keep vector topology aligned with the
+	 * queue configuration after ethtool's .set_channels() callback.
+	 */
+	ret = i40e_vsi_setup_vectors(vsi);
 	if (ret)
 		goto err_vsi;
 
@@ -14282,7 +14304,7 @@ static struct i40e_vsi *i40e_vsi_reinit_setup(struct i40e_vsi *vsi)
 		dev_info(&pf->pdev->dev,
 			 "failed to get tracking for %d queues for VSI %d err %d\n",
 			 alloc_queue_pairs, vsi->seid, ret);
-		goto err_vsi;
+		goto err_lump;
 	}
 	vsi->base_queue = ret;
 
@@ -14306,7 +14328,6 @@ static struct i40e_vsi *i40e_vsi_reinit_setup(struct i40e_vsi *vsi)
 	return vsi;
 
 err_rings:
-	i40e_vsi_free_q_vectors(vsi);
 	if (vsi->netdev_registered) {
 		vsi->netdev_registered = false;
 		unregister_netdev(vsi->netdev);
@@ -14316,6 +14337,8 @@ static struct i40e_vsi *i40e_vsi_reinit_setup(struct i40e_vsi *vsi)
 	if (vsi->type == I40E_VSI_MAIN)
 		i40e_devlink_destroy_port(pf);
 	i40e_aq_delete_element(&pf->hw, vsi->seid, NULL);
+err_lump:
+	i40e_vsi_free_q_vectors(vsi);
 err_vsi:
 	i40e_vsi_clear(vsi);
 	return NULL;
-- 
2.43.0


^ permalink raw reply related

* [PATCH net v6 2/2] ice: fix missing dpll notifications for SW pins
From: Petr Oros @ 2026-04-16 11:39 UTC (permalink / raw)
  To: netdev
  Cc: Petr Oros, Vadim Fedorenko, Arkadiusz Kubalewski, Jiri Pirko,
	Tony Nguyen, Przemek Kitszel, Andrew Lunn, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	linux-kernel, intel-wired-lan, Aleksandr Loktionov, Ivan Vecera,
	Rinitha S, Michal Schmidt, Jacob Keller
In-Reply-To: <20260416113952.389405-1-poros@redhat.com>

The SMA/U.FL pin redesign (commit 2dd5d03c77e2 ("ice: redesign dpll
sma/u.fl pins control")) introduced software-controlled pins that wrap
backing CGU input/output pins, but never updated the notification and
data paths to propagate pin events to these SW wrappers.

The periodic work sends dpll_pin_change_ntf() only for direct CGU input
pins.  SW pins that wrap these inputs never receive change or phase
offset notifications, so userspace consumers such as synce4l monitoring
SMA pins via dpll netlink never learn about state transitions or phase
offset updates.  Similarly, ice_dpll_phase_offset_get() reads the SW
pin's own phase_offset field which is never updated; the PPS monitor
writes to the backing CGU input's field instead.

On top of that, when SMA or U.FL pin state changes via PCA9575 GPIO
write, the paired pin's state also changes because they share physical
signal paths, but no notification is sent for the peer pin.

Fix by introducing ice_dpll_pin_ntf(), a wrapper around
dpll_pin_change_ntf() that also notifies any registered SMA/U.FL pin
whose backing CGU input matches.  Replace all direct
dpll_pin_change_ntf() calls in the periodic notification paths with
this wrapper.  Fix ice_dpll_phase_offset_get() to return the backing
CGU input's phase_offset for input-direction SW pins.  Add
ice_dpll_sw_pin_notify_peer() to send a notification for the paired
SW pin after PCA9575 writes.  The peer notification is called from
the dpll_pin_ops callback wrappers after pf->dplls.lock is released,
because dpll_pin_change_ntf() sends a netlink message that invokes
driver callbacks which acquire the same lock.

Fixes: 2dd5d03c77e2 ("ice: redesign dpll sma/u.fl pins control")
Signed-off-by: Petr Oros <poros@redhat.com>
---
v6:
 - fix deadlock reported by Michal Schmidt: dpll_pin_change_ntf() in
   peer notification was called with dpll_lock held, causing deadlock.
   Move the peer notification calls out of ice_dpll_sma_direction_set()
   and ice_dpll_ufl_pin_state_set() into their dpll_pin_ops callback
   wrappers, after pf->dplls.lock is released, and use
   __dpll_pin_change_ntf() because dpll_lock is still held by the dpll
   netlink layer (dpll_pin_pre_doit).
v5: https://lore.kernel.org/all/20260409102501.1447628-1-poros@redhat.com/
 - add ice_dpll_sw_pin_notify_peer() for SMA/U.FL peer notification
   when PCA9575 routing changes affect the paired pin (reported by
   Intel test: SMA state change did not log U.FL status change in
   subscribe monitor)
v4: https://lore.kernel.org/all/20260319205256.998876-1-poros@redhat.com/
v3: https://lore.kernel.org/all/20260220140700.2910174-1-poros@redhat.com/
v2: https://lore.kernel.org/all/20260219131500.2271897-1-poros@redhat.com/
v1: https://lore.kernel.org/all/20260218211414.1411163-1-poros@redhat.com/
---
 drivers/net/ethernet/intel/ice/ice_dpll.c | 80 +++++++++++++++++++----
 1 file changed, 68 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.c b/drivers/net/ethernet/intel/ice/ice_dpll.c
index 3a90a2940fdc6e..117c6a8590a279 100644
--- a/drivers/net/ethernet/intel/ice/ice_dpll.c
+++ b/drivers/net/ethernet/intel/ice/ice_dpll.c
@@ -1154,6 +1154,32 @@ ice_dpll_input_state_get(const struct dpll_pin *pin, void *pin_priv,
 				      extack, ICE_DPLL_PIN_TYPE_INPUT);
 }
 
+/**
+ * ice_dpll_sw_pin_notify_peer - notify the paired SW pin after a state change
+ * @d: pointer to dplls struct
+ * @changed: the SW pin that was explicitly changed (already notified by dpll core)
+ *
+ * SMA and U.FL pins share physical signal paths in pairs (SMA1/U.FL1 and
+ * SMA2/U.FL2).  When one pin's routing changes via the PCA9575 GPIO
+ * expander, the paired pin's state may also change.  Send a change
+ * notification for the peer pin so userspace consumers monitoring the
+ * peer via dpll netlink learn about the update.
+ *
+ * Context: Called from dpll_pin_ops callbacks after pf->dplls.lock is
+ *          released.  Uses __dpll_pin_change_ntf() because dpll_lock is
+ *          still held by the dpll netlink layer.
+ */
+static void ice_dpll_sw_pin_notify_peer(struct ice_dplls *d,
+					struct ice_dpll_pin *changed)
+{
+	struct ice_dpll_pin *peer;
+
+	peer = (changed >= d->sma && changed < d->sma + ICE_DPLL_PIN_SW_NUM) ?
+		&d->ufl[changed->idx] : &d->sma[changed->idx];
+	if (peer->pin)
+		__dpll_pin_change_ntf(peer->pin);
+}
+
 /**
  * ice_dpll_sma_direction_set - set direction of SMA pin
  * @p: pointer to a pin
@@ -1233,7 +1259,6 @@ static int ice_dpll_sma_direction_set(struct ice_dpll_pin *p,
 			ret = ice_dpll_pin_state_update(p->pf, target,
 							type, extack);
 	}
-
 	return ret;
 }
 
@@ -1344,6 +1369,8 @@ ice_dpll_ufl_pin_state_set(const struct dpll_pin *pin, void *pin_priv,
 
 unlock:
 	mutex_unlock(&pf->dplls.lock);
+	if (!ret)
+		ice_dpll_sw_pin_notify_peer(&pf->dplls, p);
 
 	return ret;
 }
@@ -1462,6 +1489,8 @@ ice_dpll_sma_pin_state_set(const struct dpll_pin *pin, void *pin_priv,
 
 unlock:
 	mutex_unlock(&pf->dplls.lock);
+	if (!ret)
+		ice_dpll_sw_pin_notify_peer(&pf->dplls, sma);
 
 	return ret;
 }
@@ -1657,6 +1686,8 @@ ice_dpll_pin_sma_direction_set(const struct dpll_pin *pin, void *pin_priv,
 	mutex_lock(&pf->dplls.lock);
 	ret = ice_dpll_sma_direction_set(p, direction, extack);
 	mutex_unlock(&pf->dplls.lock);
+	if (!ret)
+		ice_dpll_sw_pin_notify_peer(&pf->dplls, p);
 
 	return ret;
 }
@@ -1963,7 +1994,10 @@ ice_dpll_phase_offset_get(const struct dpll_pin *pin, void *pin_priv,
 				       d->active_input == p->input->pin))
 		*phase_offset = d->phase_offset * ICE_DPLL_PHASE_OFFSET_FACTOR;
 	else if (d->phase_offset_monitor_period)
-		*phase_offset = p->phase_offset * ICE_DPLL_PHASE_OFFSET_FACTOR;
+		*phase_offset = (p->input &&
+				 p->direction == DPLL_PIN_DIRECTION_INPUT ?
+				 p->input->phase_offset :
+				 p->phase_offset) * ICE_DPLL_PHASE_OFFSET_FACTOR;
 	else
 		*phase_offset = 0;
 	mutex_unlock(&pf->dplls.lock);
@@ -2659,6 +2693,27 @@ static u64 ice_generate_clock_id(struct ice_pf *pf)
 	return pci_get_dsn(pf->pdev);
 }
 
+/**
+ * ice_dpll_pin_ntf - notify pin change including any SW pin wrappers
+ * @dplls: pointer to dplls struct
+ * @pin: the dpll_pin that changed
+ *
+ * Send a change notification for @pin and for any registered SMA/U.FL pin
+ * whose backing CGU input matches @pin.
+ */
+static void ice_dpll_pin_ntf(struct ice_dplls *dplls, struct dpll_pin *pin)
+{
+	dpll_pin_change_ntf(pin);
+	for (int i = 0; i < ICE_DPLL_PIN_SW_NUM; i++) {
+		if (dplls->sma[i].pin && dplls->sma[i].input &&
+		    dplls->sma[i].input->pin == pin)
+			dpll_pin_change_ntf(dplls->sma[i].pin);
+		if (dplls->ufl[i].pin && dplls->ufl[i].input &&
+		    dplls->ufl[i].input->pin == pin)
+			dpll_pin_change_ntf(dplls->ufl[i].pin);
+	}
+}
+
 /**
  * ice_dpll_notify_changes - notify dpll subsystem about changes
  * @d: pointer do dpll
@@ -2667,6 +2722,7 @@ static u64 ice_generate_clock_id(struct ice_pf *pf)
  */
 static void ice_dpll_notify_changes(struct ice_dpll *d)
 {
+	struct ice_dplls *dplls = &d->pf->dplls;
 	bool pin_notified = false;
 
 	if (d->prev_dpll_state != d->dpll_state) {
@@ -2675,17 +2731,17 @@ static void ice_dpll_notify_changes(struct ice_dpll *d)
 	}
 	if (d->prev_input != d->active_input) {
 		if (d->prev_input)
-			dpll_pin_change_ntf(d->prev_input);
+			ice_dpll_pin_ntf(dplls, d->prev_input);
 		d->prev_input = d->active_input;
 		if (d->active_input) {
-			dpll_pin_change_ntf(d->active_input);
+			ice_dpll_pin_ntf(dplls, d->active_input);
 			pin_notified = true;
 		}
 	}
 	if (d->prev_phase_offset != d->phase_offset) {
 		d->prev_phase_offset = d->phase_offset;
 		if (!pin_notified && d->active_input)
-			dpll_pin_change_ntf(d->active_input);
+			ice_dpll_pin_ntf(dplls, d->active_input);
 	}
 }
 
@@ -2714,6 +2770,7 @@ static bool ice_dpll_is_pps_phase_monitor(struct ice_pf *pf)
 
 /**
  * ice_dpll_pins_notify_mask - notify dpll subsystem about bulk pin changes
+ * @dplls: pointer to dplls struct
  * @pins: array of ice_dpll_pin pointers registered within dpll subsystem
  * @pin_num: number of pins
  * @phase_offset_ntf_mask: bitmask of pin indexes to notify
@@ -2723,15 +2780,14 @@ static bool ice_dpll_is_pps_phase_monitor(struct ice_pf *pf)
  *
  * Context: Must be called while pf->dplls.lock is released.
  */
-static void ice_dpll_pins_notify_mask(struct ice_dpll_pin *pins,
+static void ice_dpll_pins_notify_mask(struct ice_dplls *dplls,
+				      struct ice_dpll_pin *pins,
 				      u8 pin_num,
 				      u32 phase_offset_ntf_mask)
 {
-	int i = 0;
-
-	for (i = 0; i < pin_num; i++)
-		if (phase_offset_ntf_mask & (1 << i))
-			dpll_pin_change_ntf(pins[i].pin);
+	for (int i = 0; i < pin_num; i++)
+		if (phase_offset_ntf_mask & BIT(i))
+			ice_dpll_pin_ntf(dplls, pins[i].pin);
 }
 
 /**
@@ -2907,7 +2963,7 @@ static void ice_dpll_periodic_work(struct kthread_work *work)
 	ice_dpll_notify_changes(de);
 	ice_dpll_notify_changes(dp);
 	if (phase_offset_ntf)
-		ice_dpll_pins_notify_mask(d->inputs, d->num_inputs,
+		ice_dpll_pins_notify_mask(d, d->inputs, d->num_inputs,
 					  phase_offset_ntf);
 
 resched:
-- 
2.52.0


^ permalink raw reply related

* [PATCH net v6 1/2] dpll: export __dpll_pin_change_ntf() for use under dpll_lock
From: Petr Oros @ 2026-04-16 11:39 UTC (permalink / raw)
  To: netdev
  Cc: Ivan Vecera, Petr Oros, Vadim Fedorenko, Arkadiusz Kubalewski,
	Jiri Pirko, Tony Nguyen, Przemek Kitszel, Andrew Lunn,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, linux-kernel, intel-wired-lan, Aleksandr Loktionov,
	Rinitha S, Michal Schmidt, Jacob Keller
In-Reply-To: <20260416113952.389405-1-poros@redhat.com>

From: Ivan Vecera <ivecera@redhat.com>

Export __dpll_pin_change_ntf() so that drivers can send pin change
notifications from within pin callbacks, which are already called
under dpll_lock. Using dpll_pin_change_ntf() in that context would
deadlock.

Add lockdep_assert_held() to catch misuse without the lock held.

Signed-off-by: Ivan Vecera <ivecera@redhat.com>
Signed-off-by: Petr Oros <poros@redhat.com>
---
 drivers/dpll/dpll_netlink.c | 10 ++++++++++
 drivers/dpll/dpll_netlink.h |  2 --
 include/linux/dpll.h        |  1 +
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c
index 83cbd64abf5a47..95ae786e98aab3 100644
--- a/drivers/dpll/dpll_netlink.c
+++ b/drivers/dpll/dpll_netlink.c
@@ -842,11 +842,21 @@ int dpll_pin_delete_ntf(struct dpll_pin *pin)
 	return dpll_pin_event_send(DPLL_CMD_PIN_DELETE_NTF, pin);
 }
 
+/**
+ * __dpll_pin_change_ntf - notify that the pin has been changed
+ * @pin: registered pin pointer
+ *
+ * Context: caller must hold dpll_lock. Suitable for use inside pin
+ *          callbacks which are already invoked under dpll_lock.
+ * Return: 0 if succeeds, error code otherwise.
+ */
 int __dpll_pin_change_ntf(struct dpll_pin *pin)
 {
+	lockdep_assert_held(&dpll_lock);
 	dpll_pin_notify(pin, DPLL_PIN_CHANGED);
 	return dpll_pin_event_send(DPLL_CMD_PIN_CHANGE_NTF, pin);
 }
+EXPORT_SYMBOL_GPL(__dpll_pin_change_ntf);
 
 /**
  * dpll_pin_change_ntf - notify that the pin has been changed
diff --git a/drivers/dpll/dpll_netlink.h b/drivers/dpll/dpll_netlink.h
index dd28b56d27c56d..a9cfd55f57fc42 100644
--- a/drivers/dpll/dpll_netlink.h
+++ b/drivers/dpll/dpll_netlink.h
@@ -11,5 +11,3 @@ int dpll_device_delete_ntf(struct dpll_device *dpll);
 int dpll_pin_create_ntf(struct dpll_pin *pin);
 
 int dpll_pin_delete_ntf(struct dpll_pin *pin);
-
-int __dpll_pin_change_ntf(struct dpll_pin *pin);
diff --git a/include/linux/dpll.h b/include/linux/dpll.h
index 2ce295b46b8cdc..8f97120ee7b37d 100644
--- a/include/linux/dpll.h
+++ b/include/linux/dpll.h
@@ -276,6 +276,7 @@ int dpll_pin_ref_sync_pair_add(struct dpll_pin *pin,
 
 int dpll_device_change_ntf(struct dpll_device *dpll);
 
+int __dpll_pin_change_ntf(struct dpll_pin *pin);
 int dpll_pin_change_ntf(struct dpll_pin *pin);
 
 int register_dpll_notifier(struct notifier_block *nb);
-- 
2.52.0


^ permalink raw reply related

* [PATCH net v6 0/2] ice: fix missing dpll notifications for SW pins
From: Petr Oros @ 2026-04-16 11:39 UTC (permalink / raw)
  To: netdev
  Cc: Petr Oros, Vadim Fedorenko, Arkadiusz Kubalewski, Jiri Pirko,
	Tony Nguyen, Przemek Kitszel, Andrew Lunn, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	linux-kernel, intel-wired-lan, Aleksandr Loktionov, Ivan Vecera,
	Rinitha S, Michal Schmidt, Jacob Keller

The SMA/U.FL pin redesign never propagated dpll notifications to the
software-controlled pin wrappers.  This series fixes that by adding
peer notification for paired SMA/U.FL pins and by wrapping
dpll_pin_change_ntf() calls in the periodic work to also cover SW pins.

Patch 1 exports __dpll_pin_change_ntf() so ice can send peer
notifications from callback context where dpll_lock is already held.

Patch 2 fixes the notification gaps: periodic work HW-to-SW
propagation, SW-to-SW peer notification on PCA9575 routing changes,
and SW pin phase offset reporting.

Ivan Vecera (1):
  dpll: export __dpll_pin_change_ntf() for use under dpll_lock

Petr Oros (1):
  ice: fix missing dpll notifications for SW pins

 drivers/dpll/dpll_netlink.c               | 10 +++
 drivers/dpll/dpll_netlink.h               |  2 -
 drivers/net/ethernet/intel/ice/ice_dpll.c | 80 +++++++++++++++++++----
 include/linux/dpll.h                      |  1 +
 4 files changed, 79 insertions(+), 14 deletions(-)

---
v6:
 - fix deadlock reported by Michal Schmidt: dpll_pin_change_ntf() in
   peer notification was called with dpll_lock held, causing deadlock.
   Move the peer notification calls out of ice_dpll_sma_direction_set()
   and ice_dpll_ufl_pin_state_set() into their dpll_pin_ops callback
   wrappers, after pf->dplls.lock is released, and use
   __dpll_pin_change_ntf() because dpll_lock is still held by the dpll
   netlink layer (dpll_pin_pre_doit).
v5: https://lore.kernel.org/all/20260409102501.1447628-1-poros@redhat.com/
v4: https://lore.kernel.org/all/20260319205256.998876-1-poros@redhat.com/
v3: https://lore.kernel.org/all/20260220140700.2910174-1-poros@redhat.com/
v2: https://lore.kernel.org/all/20260219131500.2271897-1-poros@redhat.com/
v1: https://lore.kernel.org/all/20260218211414.1411163-1-poros@redhat.com/


^ permalink raw reply

* Re: [PATCH net 14/14] netfilter: nf_tables: add hook transactions for device deletions
From: Paolo Abeni @ 2026-04-16 11:36 UTC (permalink / raw)
  To: Pablo Neira Ayuso, netfilter-devel
  Cc: davem, netdev, kuba, edumazet, fw, horms
In-Reply-To: <20260416013101.221555-15-pablo@netfilter.org>

On 4/16/26 3:31 AM, Pablo Neira Ayuso wrote:
> @@ -10920,9 +11007,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
>  				nf_tables_chain_notify(&ctx, NFT_MSG_DELCHAIN,
>  						       &nft_trans_chain_hooks(trans));

AI notes that nf_tables_chain_notify() can now receive struct
nft_trans_hook arguments and it ends up calling nft_dump_basechain_hook
which expects nft_hook, possibly causing out-of-bounds slab read when
accessing hook->ifname.

It looks real to me. Possibly worth stripping this patch from the PR?

/P

^ permalink raw reply

* [PATCH v4] net: wwan: t7xx: validate port_count against message length in t7xx_port_enum_msg_handler
From: Pavitra Jha @ 2026-04-16 11:32 UTC (permalink / raw)
  To: w; +Cc: pabeni, chandrashekar.devegowda, linux-wwan, netdev, stable,
	Pavitra Jha
In-Reply-To: <ad5p7XlSOKoaQC5D@1wt.eu>

t7xx_port_enum_msg_handler() uses the modem-supplied port_count field as
a loop bound over port_msg->data[] without checking that the message buffer
contains sufficient data. A modem sending port_count=65535 in a 12-byte
buffer triggers a slab-out-of-bounds read of up to 262140 bytes.

Add a struct_size() check after extracting port_count and before the loop.
Pass msg_len to t7xx_port_enum_msg_handler() and use it to validate
the message size before accessing port_msg->data[].
Pass msg_len from both call sites: skb->len at the DPMAIF path after
skb_pull(), and the captured rt_feature->data_len at the handshake path.

Fixes: 39d439047f1d ("net: wwan: t7xx: Add control DMA interface")
Cc: stable@vger.kernel.org
Signed-off-by: Pavitra Jha <jhapavitra98@gmail.com>
---
 drivers/net/wwan/t7xx/t7xx_modem_ops.c     | 14 +++++++-------
 drivers/net/wwan/t7xx/t7xx_port_ctrl_msg.c | 12 +++++++++---
 drivers/net/wwan/t7xx/t7xx_port_proxy.h    |  2 +-
 3 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/drivers/net/wwan/t7xx/t7xx_modem_ops.c b/drivers/net/wwan/t7xx/t7xx_modem_ops.c
index 7968e208d..d0559fe16 100644
--- a/drivers/net/wwan/t7xx/t7xx_modem_ops.c
+++ b/drivers/net/wwan/t7xx/t7xx_modem_ops.c
@@ -453,25 +453,25 @@ static int t7xx_parse_host_rt_data(struct t7xx_fsm_ctl *ctl, struct t7xx_sys_inf
 {
 	enum mtk_feature_support_type ft_spt_st, ft_spt_cfg;
 	struct mtk_runtime_feature *rt_feature;
+	size_t feat_data_len;
 	int i, offset;
 
 	offset = sizeof(struct feature_query);
 	for (i = 0; i < FEATURE_COUNT && offset < data_length; i++) {
 		rt_feature = data + offset;
-		offset += sizeof(*rt_feature) + le32_to_cpu(rt_feature->data_len);
-
+		feat_data_len = le32_to_cpu(rt_feature->data_len);
+		offset += sizeof(*rt_feature) + feat_data_len;
 		ft_spt_cfg = FIELD_GET(FEATURE_MSK, core->feature_set[i]);
 		if (ft_spt_cfg != MTK_FEATURE_MUST_BE_SUPPORTED)
 			continue;
 
 		ft_spt_st = FIELD_GET(FEATURE_MSK, rt_feature->support_info);
 		if (ft_spt_st != MTK_FEATURE_MUST_BE_SUPPORTED)
 			return -EINVAL;
 
-		if (i == RT_ID_MD_PORT_ENUM || i == RT_ID_AP_PORT_ENUM)
-			t7xx_port_enum_msg_handler(ctl->md, rt_feature->data);
+		if (i == RT_ID_MD_PORT_ENUM || i == RT_ID_AP_PORT_ENUM) {
+			t7xx_port_enum_msg_handler(ctl->md, rt_feature->data,
+						   feat_data_len);
+		}
 	}
 
 	return 0;
 }
 
diff --git a/drivers/net/wwan/t7xx/t7xx_port_ctrl_msg.c b/drivers/net/wwan/t7xx/t7xx_port_ctrl_msg.c
index ae632ef96..d984a688d 100644
--- a/drivers/net/wwan/t7xx/t7xx_port_ctrl_msg.c
+++ b/drivers/net/wwan/t7xx/t7xx_port_ctrl_msg.c
@@ -124,8 +124,9 @@ static int fsm_ee_message_handler(struct t7xx_port *port, struct t7xx_fsm_ctl *c
  * * 0		- Success.
  * * -EFAULT	- Message check failure.
+ * @msg_len: Length of @msg in bytes.
  */
-int t7xx_port_enum_msg_handler(struct t7xx_modem *md, void *msg)
+int t7xx_port_enum_msg_handler(struct t7xx_modem *md, void *msg, size_t msg_len)
 {
 	struct device *dev = &md->t7xx_dev->pdev->dev;
 	unsigned int version, port_count, i;
@@ -141,6 +141,13 @@ int t7xx_port_enum_msg_handler(struct t7xx_modem *md, void *msg)
 	}
 
 	port_count = FIELD_GET(PORT_MSG_PRT_CNT, le32_to_cpu(port_msg->info));
+
+	if (msg_len < struct_size(port_msg, data, port_count)) {
+		dev_err(dev, "Port enum msg too short: need %zu, have %zu\n",
+			struct_size(port_msg, data, port_count), msg_len);
+		return -EINVAL;
+	}
+
 	for (i = 0; i < port_count; i++) {
 		u32 port_info = le32_to_cpu(port_msg->data[i]);
 		unsigned int ch_id;
@@ -154,7 +161,6 @@ int t7xx_port_enum_msg_handler(struct t7xx_modem *md, void *msg)
 
 	return 0;
 }
 
 static int control_msg_handler(struct t7xx_port *port, struct sk_buff *skb)
 {
 	const struct t7xx_port_conf *port_conf = port->port_conf;
@@ -191,7 +197,7 @@ static int control_msg_handler(struct t7xx_port *port, struct sk_buff *skb)
 
 	case CTL_ID_PORT_ENUM:
 		skb_pull(skb, sizeof(*ctrl_msg_h));
-		ret = t7xx_port_enum_msg_handler(ctl->md, (struct port_msg *)skb->data);
+		ret = t7xx_port_enum_msg_handler(ctl->md, (struct port_msg *)skb->data, skb->len);
 		if (!ret)
 			ret = port_ctl_send_msg_to_md(port, CTL_ID_PORT_ENUM, 0);
 		else
diff --git a/drivers/net/wwan/t7xx/t7xx_port_proxy.h b/drivers/net/wwan/t7xx/t7xx_port_proxy.h
index f0918b36e..7c3190bf0 100644
--- a/drivers/net/wwan/t7xx/t7xx_port_proxy.h
+++ b/drivers/net/wwan/t7xx/t7xx_port_proxy.h
@@ -103,7 +103,7 @@ void t7xx_port_proxy_reset(struct port_proxy *port_prox);
 void t7xx_port_proxy_uninit(struct port_proxy *port_prox);
 int t7xx_port_proxy_init(struct t7xx_modem *md);
 void t7xx_port_proxy_md_status_notify(struct port_proxy *port_prox, unsigned int state);
-int t7xx_port_enum_msg_handler(struct t7xx_modem *md, void *msg);
+int t7xx_port_enum_msg_handler(struct t7xx_modem *md, void *msg, size_t msg_len);
 int t7xx_port_proxy_chl_enable_disable(struct port_proxy *port_prox, unsigned int ch_id,
 				       bool en_flag);
 void t7xx_port_proxy_set_cfg(struct t7xx_modem *md, enum port_cfg_id cfg_id);
-- 
2.53.0


^ permalink raw reply related

* Re: [patch 27/38] m68k: Select ARCH_HAS_RANDOM_ENTROPY
From: Geert Uytterhoeven @ 2026-04-16 11:22 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, linux-m68k, Arnd Bergmann, x86, Lu Baolu, iommu,
	Michael Grzeschik, netdev, linux-wireless, Herbert Xu,
	linux-crypto, Vlastimil Babka, linux-mm, David Woodhouse,
	Bernie Thompson, linux-fbdev, Theodore Tso, linux-ext4,
	Andrew Morton, Uladzislau Rezki, Marco Elver, Dmitry Vyukov,
	kasan-dev, Andrey Ryabinin, Thomas Sailer, linux-hams,
	Jason A. Donenfeld, Richard Henderson, linux-alpha, Russell King,
	linux-arm-kernel, Catalin Marinas, Huacai Chen, loongarch,
	Dinh Nguyen, Jonas Bonn, linux-openrisc, Helge Deller,
	linux-parisc, Michael Ellerman, linuxppc-dev, Paul Walmsley,
	linux-riscv, Heiko Carstens, linux-s390, David S. Miller,
	sparclinux
In-Reply-To: <20260410120319.397219631@kernel.org>

On Fri, 10 Apr 2026 at 14:20, Thomas Gleixner <tglx@kernel.org> wrote:
> The only remaining usage of get_cycles() is to provide
> random_get_entropy().
>
> Switch m68k over to the new scheme of selecting ARCH_HAS_RANDOM_ENTROPY and
> providing random_get_entropy() in asm/random.h.
>
> Remove asm/timex.h as it has no functionality anymore.
>
> Signed-off-by: Thomas Gleixner <tglx@kernel.org>

Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>

Gr{oetje,eeting}s,

                        Geert


--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* Re: [patch 07/38] treewide: Consolidate cycles_t
From: Geert Uytterhoeven @ 2026-04-16 11:22 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, Arnd Bergmann, x86, Lu Baolu, iommu, Michael Grzeschik,
	netdev, linux-wireless, Herbert Xu, linux-crypto, Vlastimil Babka,
	linux-mm, David Woodhouse, Bernie Thompson, linux-fbdev,
	Theodore Tso, linux-ext4, Andrew Morton, Uladzislau Rezki,
	Marco Elver, Dmitry Vyukov, kasan-dev, Andrey Ryabinin,
	Thomas Sailer, linux-hams, Jason A. Donenfeld, Richard Henderson,
	linux-alpha, Russell King, linux-arm-kernel, Catalin Marinas,
	Huacai Chen, loongarch, Dinh Nguyen, Jonas Bonn, linux-openrisc,
	Helge Deller, linux-parisc, Michael Ellerman, linuxppc-dev,
	Paul Walmsley, linux-riscv, Heiko Carstens, linux-s390,
	David S. Miller, sparclinux
In-Reply-To: <20260410120318.045532623@kernel.org>

On Fri, 10 Apr 2026 at 14:19, Thomas Gleixner <tglx@kernel.org> wrote:
> Most architectures define cycles_t as unsigned long except:
>
>  - x86 requires it to be 64-bit independent of the 32-bit/64-bit build.
>
>  - parisc and mips define it as unsigned int
>
>    parisc has no real reason to do so as there are only a few usage sites
>    which either expand it to a 64-bit value or utilize only the lower
>    32bits.
>
>    mips has no real requirement either.
>
> Move the typedef to types.h and provide a config switch to enforce the
> 64-bit type for x86.
>
> Signed-off-by: Thomas Gleixner <tglx@kernel.org>

>  arch/m68k/include/asm/timex.h      |    2 --

Acked-by: Geert Uytterhoeven <geert@linux-m68k.org> # m68k

Gr{oetje,eeting}s,

                        Geert


--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* Re: [PATCH net] ipv6: fix possible UAF in icmpv6_rcv()
From: Fernando Fernandez Mancera @ 2026-04-16 11:25 UTC (permalink / raw)
  To: Eric Dumazet, David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, David Ahern, Ido Schimmel, netdev, eric.dumazet
In-Reply-To: <20260416103505.2380753-1-edumazet@google.com>

On 4/16/26 12:35 PM, Eric Dumazet wrote:
> Caching saddr and daddr before pskb_pull() is problematic
> since skb->head can change.
> 
> Remove these temporary variables:
> 
> - We only access &ipv6_hdr(skb)->saddr and &ipv6_hdr(skb)->daddr
>    when net_dbg_ratelimited() is called in the slow path.
> 
> - Avoid potential future misuse after pskb_pull() call.
> 
> Fixes: 4b3418fba0fe ("ipv6: icmp: include addresses in debug messages")
> Signed-off-by: Eric Dumazet <edumazet@google.com>

Reviewed-by: Fernando Fernandez Mancera <fmancera@suse.de>

Thanks!

^ permalink raw reply

* [PATCH bpf v2 2/2] selftests/bpf: Test TCP_NODELAY in TCP hdr opt callbacks
From: KaFai Wan @ 2026-04-16 11:23 UTC (permalink / raw)
  To: martin.lau, daniel, john.fastabend, sdf, ast, andrii, eddyz87,
	memxor, song, yonghong.song, jolsa, davem, edumazet, kuba, pabeni,
	horms, shuah, jiayuan.chen, kafai.wan, bpf, netdev, linux-kernel,
	linux-kselftest
In-Reply-To: <20260416112308.1820332-1-kafai.wan@linux.dev>

Add a sockops selftest for the TCP_NODELAY restriction in
BPF_SOCK_OPS_HDR_OPT_LEN_CB and BPF_SOCK_OPS_WRITE_HDR_OPT_CB.

The test program calls bpf_setsockopt(TCP_NODELAY) from
ACTIVE_ESTABLISHED_CB and PASSIVE_ESTABLISHED_CB to verify that it is
still allowed outside the TCP header option callbacks.

It then enables BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG, sends data to
exercise the TCP header option path, and checks that
bpf_setsockopt(TCP_NODELAY) returns -EOPNOTSUPP from both
BPF_SOCK_OPS_HDR_OPT_LEN_CB and BPF_SOCK_OPS_WRITE_HDR_OPT_CB while the
connection continues to make forward progress.

Signed-off-by: KaFai Wan <kafai.wan@linux.dev>
---
 .../bpf/prog_tests/tcp_hdr_options.c          | 54 +++++++++++++++++++
 .../bpf/progs/test_misc_tcp_hdr_options.c     | 40 ++++++++++++++
 2 files changed, 94 insertions(+)

diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c b/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c
index 56685fc03c7e..2d738c0c4259 100644
--- a/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c
+++ b/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c
@@ -513,6 +513,59 @@ static void misc(void)
 	bpf_link__destroy(link);
 }
 
+static void hdr_sockopt(void)
+{
+	const char send_msg[] = "MISC!!!";
+	char recv_msg[sizeof(send_msg)];
+	const unsigned int nr_data = 2;
+	struct bpf_link *link;
+	struct sk_fds sk_fds;
+	int i, ret, true_val = 1;
+
+	lport_linum_map_fd = bpf_map__fd(misc_skel->maps.lport_linum_map);
+
+	link = bpf_program__attach_cgroup(misc_skel->progs.misc_hdr_sockopt, cg_fd);
+	if (!ASSERT_OK_PTR(link, "attach_cgroup(misc_hdr_sockopt)"))
+		return;
+
+	if (sk_fds_connect(&sk_fds, false)) {
+		bpf_link__destroy(link);
+		return;
+	}
+
+	ret = setsockopt(sk_fds.active_fd, SOL_TCP, TCP_NODELAY, &true_val, sizeof(true_val));
+	if (!ASSERT_OK(ret, "setsockopt(TCP_NODELAY) active"))
+		goto check_linum;
+
+	ret = setsockopt(sk_fds.passive_fd, SOL_TCP, TCP_NODELAY, &true_val, sizeof(true_val));
+	if (!ASSERT_OK(ret, "setsockopt(TCP_NODELAY) passive"))
+		goto check_linum;
+
+	for (i = 0; i < nr_data; i++) {
+		ret = send(sk_fds.active_fd, send_msg, sizeof(send_msg), 0);
+		if (!ASSERT_EQ(ret, sizeof(send_msg), "send(msg)"))
+			goto check_linum;
+
+		ret = read(sk_fds.passive_fd, recv_msg, sizeof(recv_msg));
+		if (!ASSERT_EQ(ret, sizeof(send_msg), "read(msg)"))
+			goto check_linum;
+	}
+
+	ASSERT_NEQ(misc_skel->bss->nr_hdr_sockopt_estab, 0, "nr_hdr_sockopt_estab");
+	ASSERT_EQ(misc_skel->bss->nr_hdr_sockopt_estab_err, 0, "nr_hdr_sockopt_estab_err");
+
+	ASSERT_NEQ(misc_skel->bss->nr_hdr_sockopt_len, 0, "nr_hdr_sockopt_len");
+	ASSERT_EQ(misc_skel->bss->nr_hdr_sockopt_len_err, 0, "nr_hdr_sockopt_len_err");
+
+	ASSERT_NEQ(misc_skel->bss->nr_hdr_sockopt_write, 0, "nr_hdr_sockopt_write");
+	ASSERT_EQ(misc_skel->bss->nr_hdr_sockopt_write_err, 0, "nr_hdr_sockopt_write_err");
+
+check_linum:
+	ASSERT_FALSE(check_error_linum(&sk_fds), "check_error_linum");
+	sk_fds_close(&sk_fds);
+	bpf_link__destroy(link);
+}
+
 struct test {
 	const char *desc;
 	void (*run)(void);
@@ -526,6 +579,7 @@ static struct test tests[] = {
 	DEF_TEST(fastopen_estab),
 	DEF_TEST(fin),
 	DEF_TEST(misc),
+	DEF_TEST(hdr_sockopt),
 };
 
 void test_tcp_hdr_options(void)
diff --git a/tools/testing/selftests/bpf/progs/test_misc_tcp_hdr_options.c b/tools/testing/selftests/bpf/progs/test_misc_tcp_hdr_options.c
index d487153a839d..a8cf7c4e7ed2 100644
--- a/tools/testing/selftests/bpf/progs/test_misc_tcp_hdr_options.c
+++ b/tools/testing/selftests/bpf/progs/test_misc_tcp_hdr_options.c
@@ -28,6 +28,12 @@ unsigned int nr_data = 0;
 unsigned int nr_syn = 0;
 unsigned int nr_fin = 0;
 unsigned int nr_hwtstamp = 0;
+unsigned int nr_hdr_sockopt_estab = 0;
+unsigned int nr_hdr_sockopt_estab_err = 0;
+unsigned int nr_hdr_sockopt_len = 0;
+unsigned int nr_hdr_sockopt_len_err = 0;
+unsigned int nr_hdr_sockopt_write = 0;
+unsigned int nr_hdr_sockopt_write_err = 0;
 
 /* Check the header received from the active side */
 static int __check_active_hdr_in(struct bpf_sock_ops *skops, bool check_syn)
@@ -326,4 +332,38 @@ int misc_estab(struct bpf_sock_ops *skops)
 	return CG_OK;
 }
 
+SEC("sockops")
+int misc_hdr_sockopt(struct bpf_sock_ops *skops)
+{
+	int true_val = 1;
+	int ret;
+
+	switch (skops->op) {
+	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+		nr_hdr_sockopt_estab++;
+		set_hdr_cb_flags(skops, 0);
+		ret = bpf_setsockopt(skops, SOL_TCP, TCP_NODELAY, &true_val, sizeof(true_val));
+		if (ret)
+			nr_hdr_sockopt_estab_err++;
+		break;
+	case BPF_SOCK_OPS_HDR_OPT_LEN_CB:
+		nr_hdr_sockopt_len++;
+		ret = bpf_setsockopt(skops, SOL_TCP, TCP_NODELAY, &true_val, sizeof(true_val));
+		if (ret != -EOPNOTSUPP)
+			nr_hdr_sockopt_len_err++;
+		/* just trigger BPF_SOCK_OPS_WRITE_HDR_OPT_CB */
+		bpf_reserve_hdr_opt(skops, 12, 0);
+		break;
+	case BPF_SOCK_OPS_WRITE_HDR_OPT_CB:
+		nr_hdr_sockopt_write++;
+		ret = bpf_setsockopt(skops, SOL_TCP, TCP_NODELAY, &true_val, sizeof(true_val));
+		if (ret != -EOPNOTSUPP)
+			nr_hdr_sockopt_write_err++;
+		break;
+	}
+
+	return CG_OK;
+}
+
 char _license[] SEC("license") = "GPL";
-- 
2.43.0


^ permalink raw reply related

* [PATCH bpf v2 1/2] bpf: Reject TCP_NODELAY in TCP header option callbacks
From: KaFai Wan @ 2026-04-16 11:23 UTC (permalink / raw)
  To: martin.lau, daniel, john.fastabend, sdf, ast, andrii, eddyz87,
	memxor, song, yonghong.song, jolsa, davem, edumazet, kuba, pabeni,
	horms, shuah, jiayuan.chen, kafai.wan, bpf, netdev, linux-kernel,
	linux-kselftest
  Cc: Quan Sun, Yinhao Hu, Kaiyan Mei
In-Reply-To: <20260416112308.1820332-1-kafai.wan@linux.dev>

A BPF_SOCK_OPS program can enable
BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG and then call
bpf_setsockopt(TCP_NODELAY) from BPF_SOCK_OPS_HDR_OPT_LEN_CB or
BPF_SOCK_OPS_WRITE_HDR_OPT_CB.

In these callbacks, bpf_setsockopt(TCP_NODELAY) can reach
__tcp_sock_set_nodelay(), which can call tcp_push_pending_frames().

From BPF_SOCK_OPS_HDR_OPT_LEN_CB, tcp_push_pending_frames() can call
tcp_current_mss(), which calls tcp_established_options() and re-enters
bpf_skops_hdr_opt_len().

BPF_SOCK_OPS_HDR_OPT_LEN_CB
  -> bpf_setsockopt(TCP_NODELAY)
    -> tcp_push_pending_frames()
      -> tcp_current_mss()
        -> tcp_established_options()
          -> bpf_skops_hdr_opt_len()
            -> BPF_SOCK_OPS_HDR_OPT_LEN_CB

From BPF_SOCK_OPS_WRITE_HDR_OPT_CB, tcp_push_pending_frames() can call
tcp_write_xmit(), which calls tcp_transmit_skb().  That path recomputes
header option length through tcp_established_options() and
bpf_skops_hdr_opt_len() before re-entering bpf_skops_write_hdr_opt().

BPF_SOCK_OPS_WRITE_HDR_OPT_CB
  -> bpf_setsockopt(TCP_NODELAY)
    -> tcp_push_pending_frames()
      -> tcp_write_xmit()
        -> tcp_transmit_skb()
          -> tcp_established_options()
            -> bpf_skops_hdr_opt_len()
          -> bpf_skops_write_hdr_opt()
            -> BPF_SOCK_OPS_WRITE_HDR_OPT_CB

This leads to unbounded recursion and can overflow the kernel stack.

Reject TCP_NODELAY with -EOPNOTSUPP in bpf_sock_ops_setsockopt()
when bpf_setsockopt() is called from
BPF_SOCK_OPS_HDR_OPT_LEN_CB or BPF_SOCK_OPS_WRITE_HDR_OPT_CB.

Reported-by: Quan Sun <2022090917019@std.uestc.edu.cn>
Reported-by: Yinhao Hu <dddddd@hust.edu.cn>
Reported-by: Kaiyan Mei <M202472210@hust.edu.cn>
Closes: https://lore.kernel.org/bpf/d1d523c9-6901-4454-a183-94462b8f3e4e@std.uestc.edu.cn/
Fixes: 7e41df5dbba2 ("bpf: Add a few optnames to bpf_setsockopt")
Signed-off-by: KaFai Wan <kafai.wan@linux.dev>
---
 net/core/filter.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/net/core/filter.c b/net/core/filter.c
index fcfcb72663ca..911ff04bca5a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5833,6 +5833,11 @@ BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
 	if (!is_locked_tcp_sock_ops(bpf_sock))
 		return -EOPNOTSUPP;

+	if ((bpf_sock->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB ||
+	     bpf_sock->op == BPF_SOCK_OPS_WRITE_HDR_OPT_CB) &&
+	    IS_ENABLED(CONFIG_INET) && level == SOL_TCP && optname == TCP_NODELAY)
+		return -EOPNOTSUPP;
+
 	return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen);
 }

-- 
2.43.0

^ permalink raw reply related

* [PATCH bpf v2 0/2] bpf: Reject TCP_NODELAY in TCP header option callbacks
From: KaFai Wan @ 2026-04-16 11:23 UTC (permalink / raw)
  To: martin.lau, daniel, john.fastabend, sdf, ast, andrii, eddyz87,
	memxor, song, yonghong.song, jolsa, davem, edumazet, kuba, pabeni,
	horms, shuah, jiayuan.chen, kafai.wan, bpf, netdev, linux-kernel,
	linux-kselftest

This small patchset avoids infinite recursion in TCP header option
callbacks via TCP_NODELAY setsockopt.

v2:
 - Reject TCP_NODELAY in bpf_sock_ops_setsockopt() (AI and Martin)

v1:
 https://lore.kernel.org/bpf/20260414112310.1285783-1-kafai.wan@linux.dev/

---
KaFai Wan (2):
  bpf: Reject TCP_NODELAY in TCP header option callbacks
  selftests/bpf: Test TCP_NODELAY in TCP hdr opt callbacks

 net/core/filter.c                             |  5 ++
 .../bpf/prog_tests/tcp_hdr_options.c          | 54 +++++++++++++++++++
 .../bpf/progs/test_misc_tcp_hdr_options.c     | 40 ++++++++++++++
 3 files changed, 99 insertions(+)

-- 
2.43.0


^ permalink raw reply

* Re: [patch 05/38] treewide: Remove CLOCK_TICK_RATE
From: Geert Uytterhoeven @ 2026-04-16 11:22 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, Arnd Bergmann, x86, Lu Baolu, iommu, Michael Grzeschik,
	netdev, linux-wireless, Herbert Xu, linux-crypto, Vlastimil Babka,
	linux-mm, David Woodhouse, Bernie Thompson, linux-fbdev,
	Theodore Tso, linux-ext4, Andrew Morton, Uladzislau Rezki,
	Marco Elver, Dmitry Vyukov, kasan-dev, Andrey Ryabinin,
	Thomas Sailer, linux-hams, Jason A. Donenfeld, Richard Henderson,
	linux-alpha, Russell King, linux-arm-kernel, Catalin Marinas,
	Huacai Chen, loongarch, linux-m68k, Dinh Nguyen, Jonas Bonn,
	linux-openrisc, Helge Deller, linux-parisc, Michael Ellerman,
	linuxppc-dev, Paul Walmsley, linux-riscv, Heiko Carstens,
	linux-s390, David S. Miller, sparclinux
In-Reply-To: <20260410120317.910770161@kernel.org>

On Fri, 10 Apr 2026 at 14:18, Thomas Gleixner <tglx@kernel.org> wrote:
> This has been scheduled for removal more than a decade ago and the comments
> related to it have been dutifully ignored. The last dependencies are gone.
>
> Remove it along with various now empty asm/timex.h files.
>
> Signed-off-by: Thomas Gleixner <tglx@kernel.org>

>  arch/m68k/include/asm/timex.h       |   15 ---------------

Acked-by: Geert Uytterhoeven <geert@linux-m68k.org> # m68k

Gr{oetje,eeting}s,

                        Geert


--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* Re: [PATCH v2] net/sched: sch_cake: fix NAT destination port not being updated in cake_update_flowkeys
From: patchwork-bot+netdevbpf @ 2026-04-16 11:20 UTC (permalink / raw)
  To: Dudu Lu; +Cc: netdev, toke, jhs, jiri
In-Reply-To: <20260413110041.44704-1-phx0fer@gmail.com>

Hello:

This patch was applied to netdev/net.git (main)
by Paolo Abeni <pabeni@redhat.com>:

On Mon, 13 Apr 2026 19:00:41 +0800 you wrote:
> cake_update_flowkeys() is supposed to update the flow dissector keys
> with the NAT-translated addresses and ports from conntrack, so that
> CAKE's per-flow fairness correctly identifies post-NAT flows as
> belonging to the same connection.
> 
> For the source port, this works correctly:
>     keys->ports.src = port;
> 
> [...]

Here is the summary with links:
  - [v2] net/sched: sch_cake: fix NAT destination port not being updated in cake_update_flowkeys
    https://git.kernel.org/netdev/net/c/f9e406647069

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply

* Re: [RFC PATCH 2/2] kernel/module: Decouple klp and ftrace from load_module
From: Petr Pavlu @ 2026-04-16 11:18 UTC (permalink / raw)
  To: Song Chen
  Cc: rafael, lenb, mturquette, sboyd, viresh.kumar, agk, snitzer,
	mpatocka, bmarzins, song, yukuai, linan122, jason.wessel, danielt,
	dianders, horms, davem, edumazet, kuba, pabeni, paulmck, frederic,
	mcgrof, da.gomez, samitolvanen, atomlin, jpoimboe, jikos, mbenes,
	pmladek, joe.lawrence, rostedt, mhiramat, mark.rutland,
	mathieu.desnoyers, linux-modules, linux-kernel,
	linux-trace-kernel, linux-acpi, linux-clk, linux-pm,
	live-patching, dm-devel, linux-raid, kgdb-bugreport, netdev
In-Reply-To: <a35f5f94-7d5a-4347-974b-b270c89ef241@189.cn>

On 4/15/26 8:43 AM, Song Chen wrote:
> On 4/14/26 22:33, Petr Pavlu wrote:
>> On 4/13/26 10:07 AM, chensong_2000@189.cn wrote:
>>> diff --git a/include/linux/module.h b/include/linux/module.h
>>> index 14f391b186c6..0bdd56f9defd 100644
>>> --- a/include/linux/module.h
>>> +++ b/include/linux/module.h
>>> @@ -308,6 +308,14 @@ enum module_state {
>>>       MODULE_STATE_COMING,    /* Full formed, running module_init. */
>>>       MODULE_STATE_GOING,    /* Going away. */
>>>       MODULE_STATE_UNFORMED,    /* Still setting it up. */
>>> +    MODULE_STATE_FORMED,
>>
>> I don't see a reason to add a new module state. Why is it necessary and
>> how does it fit with the existing states?
>>
> because once a notifier fails in state MODULE_STATE_UNFORMED (now only ftrace has something to do in this state), the notifier chain will roll back by calling blocking_notifier_call_chain_robust. I'm afraid MODULE_STATE_GOING is going to jeopardise the notifiers which don't handle it appropriately, like:
> 
> case MODULE_STATE_COMING:
>      kmalloc();
> case MODULE_STATE_GOING:
>      kfree();

My understanding is that the current module "state machine" operates as
follows. Transitions marked with an asterisk (*) are announced via the
module notifier.

---> UNFORMED --*> COMING --*> LIVE --*> GOING -.
        ^            |                     ^    |
        |            '---------------------*    |
        '---------------------------------------'

The new code aims to replace the current ftrace_module_init() call in
load_module(). To achieve this, it adds a notification for the UNFORMED
state (only when loading a module) and introduces a new FORMED state for
rollback. FORMED is purely a fake state because it never appears in
module::state. The new structure is as follows:

        ,--*> (FORMED)
        |
--*> UNFORMED --*> COMING --*> LIVE --*> GOING -.
        ^            |                     ^    |
        |            '---------------------*    |
        '---------------------------------------'

I'm afraid this is quite complex and inconsistent. Unless it can be kept
simple, we would be just replacing one special handling with a different
complexity, which is not worth it.

>>
>>> +    if (err)
>>> +        goto ddebug_cleanup;
>>>         /* Finally it's fully formed, ready to start executing. */
>>>       err = complete_formation(mod, info);
>>> -    if (err)
>>> +    if (err) {
>>> +        blocking_notifier_call_chain_reverse(&module_notify_list,
>>> +                MODULE_STATE_FORMED, mod);
>>>           goto ddebug_cleanup;
>>> +    }
>>>   -    err = prepare_coming_module(mod);
>>> +    err = prepare_module_state_transaction(mod,
>>> +                MODULE_STATE_COMING, MODULE_STATE_GOING);
>>>       if (err)
>>>           goto bug_cleanup;
>>>   @@ -3522,7 +3519,6 @@ static int load_module(struct load_info *info, const char __user *uargs,
>>>       destroy_params(mod->kp, mod->num_kp);
>>>       blocking_notifier_call_chain(&module_notify_list,
>>>                        MODULE_STATE_GOING, mod);
>>
>> My understanding is that all notifier chains for MODULE_STATE_GOING
>> should be reversed.
> yes, all, from lowest priority notifier to highest.
> I will resend patch 1, which failed due to my proxy setting.

What I meant here is that the call:

blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod);

should be replaced with:

blocking_notifier_call_chain_reverse(&module_notify_list, MODULE_STATE_GOING, mod);

> 
>>
>>> -    klp_module_going(mod);
>>>    bug_cleanup:
>>>       mod->state = MODULE_STATE_GOING;
>>>       /* module_bug_cleanup needs module_mutex protection */
>>
>> The patch removes the klp_module_going() cleanup call in load_module().
>> Similarly, the ftrace_release_mod() call under the ddebug_cleanup label
>> should be removed and appropriately replaced with a cleanup via
>> a notifier.
>>
>     err = prepare_module_state_transaction(mod,
>                 MODULE_STATE_UNFORMED, MODULE_STATE_FORMED);
>     if (err)
>         goto ddebug_cleanup;
> 
> ftrace will be cleaned up by the blocking_notifier_call_chain_robust rollback.
> 
>     err = prepare_module_state_transaction(mod,
>                 MODULE_STATE_COMING, MODULE_STATE_GOING);
> 
> each notifier, including ftrace and klp, will be cleaned up by the blocking_notifier_call_chain_robust rollback.
> 
> if all notifiers are successful in MODULE_STATE_COMING, they will all be cleaned up in
>  coming_cleanup:
>     mod->state = MODULE_STATE_GOING;
>     destroy_params(mod->kp, mod->num_kp);
>     blocking_notifier_call_chain(&module_notify_list,
>                      MODULE_STATE_GOING, mod);
> 
> if something goes wrong underneath.

My point is that the patch leaves a call to ftrace_release_mod() in
load_module(), which I expected to be handled via a notifier.

-- 
Thanks,
Petr

^ permalink raw reply

* Re: [PATCH iwl-net] i40e: keep q_vectors array in sync with channel count changes
From: Maciej Fijalkowski @ 2026-04-16 11:12 UTC (permalink / raw)
  To: intel-wired-lan
  Cc: netdev, magnus.karlsson, kuba, pabeni, horms, przemyslaw.kitszel,
	jacob.e.keller
In-Reply-To: <20260414121405.631092-1-maciej.fijalkowski@intel.com>

On Tue, Apr 14, 2026 at 02:14:05PM +0200, Maciej Fijalkowski wrote:
> For the main VSI, i40e_set_num_rings_in_vsi() always derives
> num_q_vectors from pf->num_lan_msix. At the same time, ethtool -L stores
> the user requested channel count in vsi->req_queue_pairs and the queue
> setup path uses that value for the effective number of queue pairs.
> 
> This leaves queue and vector counts out of sync after shrinking channel
> count via ethtool -L. The active queue configuration is reduced, but the
> VSI still keeps the full PF-sized q_vector topology.
> 
> That mismatch breaks reconfiguration flows which rely on vector/NAPI
> state matching the effective channel configuration. In particular,
> toggling /sys/class/net/<dev>/threaded after reducing the channel count
> can hang, and later channel-count changes can fail because VSI reinit
> does not rebuild q_vectors to match the new vector count.
> 
> Fix this by making the main VSI num_q_vectors follow the effective
> requested channel count, capped by the available MSI-X vectors. Update
> i40e_vsi_reinit_setup() to rebuild q_vectors during VSI reinit so the
> vector topology is refreshed together with the ring arrays when channel
> count changes.
> 
> Keep alloc_queue_pairs unchanged and based on pf->num_lan_qps so the VSI
> retains its full queue capacity.
> 
> Selftest napi_threaded.py was originally used when Jakub reported a hang
> on /sys/class/net/<dev>/threaded toggle. In order to make it pass on
> i40e, use persistent NAPI configuration for q_vector NAPIs so NAPI
> identity and threaded settings survive q_vector reallocation across
> channel-count changes. This is achieved by using netif_napi_add_config()
> when configuring q_vectors.
> 
> $ export NETIF=ens259f1np1
> $ sudo -E env PATH="$PATH" ./tools/testing/selftests/drivers/net/napi_threaded.py
> TAP version 13
> 1..3
> ok 1 napi_threaded.napi_init
> ok 2 napi_threaded.change_num_queues
> ok 3 napi_threaded.enable_dev_threaded_disable_napi_threaded
> Totals: pass:3 fail:0 xfail:0 xpass:0 skip:0 error:0
> 
> Reported-by: Jakub Kicinski <kuba@kernel.org>
> Closes: https://lore.kernel.org/intel-wired-lan/20260316133100.6054a11f@kernel.org/
> Fixes: d2a69fefd756 ("i40e: Fix changing previously set num_queue_pairs for PFs")
> Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
> ---
>  drivers/net/ethernet/intel/i40e/i40e_main.c | 34 +++++++++++++++++----
>  1 file changed, 28 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
> index 926d001b2150..5636ad71f940 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_main.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
> @@ -11403,10 +11403,14 @@ static void i40e_service_timer(struct timer_list *t)
>  static int i40e_set_num_rings_in_vsi(struct i40e_vsi *vsi)
>  {
>  	struct i40e_pf *pf = vsi->back;
> +	u16 qps;
>  
>  	switch (vsi->type) {
>  	case I40E_VSI_MAIN:
>  		vsi->alloc_queue_pairs = pf->num_lan_qps;
> +		qps = vsi->req_queue_pairs ?
> +		      min_t(u16, vsi->req_queue_pairs, pf->num_lan_qps) :
> +		      pf->num_lan_qps;
>  		if (!vsi->num_tx_desc)
>  			vsi->num_tx_desc = ALIGN(I40E_DEFAULT_NUM_DESCRIPTORS,
>  						 I40E_REQ_DESCRIPTOR_MULTIPLE);
> @@ -11414,7 +11418,8 @@ static int i40e_set_num_rings_in_vsi(struct i40e_vsi *vsi)
>  			vsi->num_rx_desc = ALIGN(I40E_DEFAULT_NUM_DESCRIPTORS,
>  						 I40E_REQ_DESCRIPTOR_MULTIPLE);
>  		if (test_bit(I40E_FLAG_MSIX_ENA, pf->flags))
> -			vsi->num_q_vectors = pf->num_lan_msix;
> +			vsi->num_q_vectors = max_t(int, 1,
> +						   min_t(int, qps, pf->num_lan_msix));
>  		else
>  			vsi->num_q_vectors = 1;
>  
> @@ -12043,7 +12048,8 @@ static int i40e_vsi_alloc_q_vector(struct i40e_vsi *vsi, int v_idx)
>  	cpumask_copy(&q_vector->affinity_mask, cpu_possible_mask);
>  
>  	if (vsi->netdev)
> -		netif_napi_add(vsi->netdev, &q_vector->napi, i40e_napi_poll);
> +		netif_napi_add_config(vsi->netdev, &q_vector->napi,
> +				      i40e_napi_poll, v_idx);
>  
>  	/* tie q_vector and vsi together */
>  	vsi->q_vectors[v_idx] = q_vector;
> @@ -14265,12 +14271,27 @@ static struct i40e_vsi *i40e_vsi_reinit_setup(struct i40e_vsi *vsi)
>  
>  	pf = vsi->back;
>  
> +	if (test_bit(I40E_FLAG_MSIX_ENA, pf->flags)) {
> +		i40e_put_lump(pf->irq_pile, vsi->base_vector, vsi->idx);
> +		vsi->base_vector = 0;
> +	}
> +
>  	i40e_put_lump(pf->qp_pile, vsi->base_queue, vsi->idx);
>  	i40e_vsi_clear_rings(vsi);
>  
> -	i40e_vsi_free_arrays(vsi, false);
> +	i40e_vsi_free_q_vectors(vsi);
> +	i40e_vsi_free_arrays(vsi, true);
>  	i40e_set_num_rings_in_vsi(vsi);
> -	ret = i40e_vsi_alloc_arrays(vsi, false);
> +
> +	ret = i40e_vsi_alloc_arrays(vsi, true);
> +	if (ret)
> +		goto err_vsi;

Sashiko warns about potential double-free on vsi->tx_rings. I will send a
v2 where I include NULLing this ptr in i40e_vsi_alloc_arrays().

Thanks,
Maciej

> +
> +	/* Rebuild q_vectors during VSI reinit because the effective channel
> +	 * count may change num_q_vectors. Keep vector topology aligned with the
> +	 * queue configuration after ethtool's .set_channels() callback.
> +	 */
> +	ret = i40e_vsi_setup_vectors(vsi);
>  	if (ret)
>  		goto err_vsi;
>  
> @@ -14282,7 +14303,7 @@ static struct i40e_vsi *i40e_vsi_reinit_setup(struct i40e_vsi *vsi)
>  		dev_info(&pf->pdev->dev,
>  			 "failed to get tracking for %d queues for VSI %d err %d\n",
>  			 alloc_queue_pairs, vsi->seid, ret);
> -		goto err_vsi;
> +		goto err_lump;
>  	}
>  	vsi->base_queue = ret;
>  
> @@ -14306,7 +14327,6 @@ static struct i40e_vsi *i40e_vsi_reinit_setup(struct i40e_vsi *vsi)
>  	return vsi;
>  
>  err_rings:
> -	i40e_vsi_free_q_vectors(vsi);
>  	if (vsi->netdev_registered) {
>  		vsi->netdev_registered = false;
>  		unregister_netdev(vsi->netdev);
> @@ -14316,6 +14336,8 @@ static struct i40e_vsi *i40e_vsi_reinit_setup(struct i40e_vsi *vsi)
>  	if (vsi->type == I40E_VSI_MAIN)
>  		i40e_devlink_destroy_port(pf);
>  	i40e_aq_delete_element(&pf->hw, vsi->seid, NULL);
> +err_lump:
> +	i40e_vsi_free_q_vectors(vsi);
>  err_vsi:
>  	i40e_vsi_clear(vsi);
>  	return NULL;
> -- 
> 2.43.0
> 

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox