Netdev List

Netdev List
 help / color / mirror / Atom feed

* [RFC net-next 13/17] tls: disable device offload for mptcp sockets
From: Geliang Tang @ 2026-06-22 10:43 UTC (permalink / raw)
  To: Matthieu Baerts, Mat Martineau, Geliang Tang, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Neal Cardwell, Kuniyuki Iwashima, John Fastabend, Sabrina Dubroca,
	Hannes Reinecke
  Cc: Geliang Tang, netdev, mptcp, Gang Yan, Zqiang
In-Reply-To: <cover.1782123118.git.tanggeliang@kylinos.cn>

From: Geliang Tang <tanggeliang@kylinos.cn>

MPTCP TLS hardware offload is not yet implemented. Return -EOPNOTSUPP
when attempting to enable device offload on MPTCP sockets.

Co-developed-by: Gang Yan <yangang@kylinos.cn>
Signed-off-by: Gang Yan <yangang@kylinos.cn>
Co-developed-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
---
 net/tls/tls_device.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index c44a59d9d715..e535edc23d0d 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -1074,6 +1074,9 @@ int tls_set_device_offload(struct sock *sk)
 	ctx = tls_get_ctx(sk);
 	prot = &ctx->prot_info;
 
+	if (sk->sk_protocol == IPPROTO_MPTCP)
+		return -EOPNOTSUPP;
+
 	if (ctx->priv_ctx_tx)
 		return -EEXIST;
 
@@ -1196,6 +1199,9 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx)
 	struct net_device *netdev;
 	int rc = 0;
 
+	if (sk->sk_protocol == IPPROTO_MPTCP)
+		return -EOPNOTSUPP;
+
 	if (ctx->crypto_recv.info.version != TLS_1_2_VERSION)
 		return -EOPNOTSUPP;
 
-- 
2.53.0


^ permalink raw reply related

* [RFC net-next 14/17] mptcp: update mptcp_check_readable helper
From: Geliang Tang @ 2026-06-22 10:43 UTC (permalink / raw)
  To: Matthieu Baerts, Mat Martineau, Geliang Tang, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Neal Cardwell, Kuniyuki Iwashima, John Fastabend, Sabrina Dubroca,
	Hannes Reinecke
  Cc: Gang Yan, netdev, mptcp
In-Reply-To: <cover.1782123118.git.tanggeliang@kylinos.cn>

From: Gang Yan <yangang@kylinos.cn>

Align mptcp_check_readable() with TCP and rename it to
mptcp_stream_is_readable(). The helper now returns a bool and falls back
to sk_is_readable(). This is needed for KTLS: because 'prot' will be
modified, tls_sw_sock_is_readable() is expected to be called from
prot->sock_is_readable().

Co-developed-by: Geliang Tang <geliang@kernel.org>
Signed-off-by: Geliang Tang <geliang@kernel.org>
Signed-off-by: Gang Yan <yangang@kylinos.cn>
---
 net/mptcp/protocol.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 169bd468f212..4951b1dd013b 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -3410,9 +3410,11 @@ void __mptcp_unaccepted_force_close(struct sock *sk)
 	__mptcp_destroy_sock(sk);
 }
 
-static __poll_t mptcp_check_readable(struct sock *sk)
+static bool mptcp_stream_is_readable(struct sock *sk)
 {
-	return mptcp_epollin_ready(sk) ? EPOLLIN | EPOLLRDNORM : 0;
+	if (mptcp_epollin_ready(sk))
+		return true;
+	return sk_is_readable(sk);
 }
 
 static void mptcp_check_listen_stop(struct sock *sk)
@@ -4476,7 +4478,8 @@ __poll_t mptcp_poll(struct file *file, struct socket *sock,
 		mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
 
 	if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
-		mask |= mptcp_check_readable(sk);
+		if (mptcp_stream_is_readable(sk))
+			mask |= EPOLLIN | EPOLLRDNORM;
 		if (shutdown & SEND_SHUTDOWN)
 			mask |= EPOLLOUT | EPOLLWRNORM;
 		else
-- 
2.53.0


^ permalink raw reply related

* [RFC net-next 15/17] mptcp: implement ulp getsockopt for tls support
From: Geliang Tang @ 2026-06-22 10:43 UTC (permalink / raw)
  To: Matthieu Baerts, Mat Martineau, Geliang Tang, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Neal Cardwell, Kuniyuki Iwashima, John Fastabend, Sabrina Dubroca,
	Hannes Reinecke
  Cc: Geliang Tang, netdev, mptcp, Gang Yan, Zqiang
In-Reply-To: <cover.1782123118.git.tanggeliang@kylinos.cn>

From: Geliang Tang <tanggeliang@kylinos.cn>

Add mptcp_getsockopt_tcp_ulp() to handle TCP_ULP getsockopt on MPTCP
sockets. The helper reads the user length once, rejects a negative
value, takes the socket lock, caps the length to TCP_ULP_NAME_MAX, and
copies the ULP name to userspace (or sets the length to zero if no ULP
is attached). The lock ensures safe access to icsk->icsk_ulp_ops.

Co-developed-by: Gang Yan <yangang@kylinos.cn>
Signed-off-by: Gang Yan <yangang@kylinos.cn>
Co-developed-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
---
 net/mptcp/sockopt.c | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index fcf6feb2a9eb..cc45491cd3b2 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -1408,6 +1408,39 @@ static int mptcp_put_int_option(struct mptcp_sock *msk, char __user *optval,
 	return 0;
 }
 
+static int mptcp_getsockopt_tcp_ulp(struct sock *sk,
+				    char __user *optval,
+				    int __user *optlen)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	int ret = 0, len;
+
+	if (copy_from_sockptr(&len, USER_SOCKPTR(optlen), sizeof(int)))
+		return -EFAULT;
+
+	if (len < 0)
+		return -EINVAL;
+
+	lock_sock(sk);
+	len = min_t(unsigned int, len, TCP_ULP_NAME_MAX);
+	if (!icsk->icsk_ulp_ops) {
+		len = 0;
+		if (copy_to_sockptr(USER_SOCKPTR(optlen), &len, sizeof(int)))
+			ret = -EFAULT;
+		goto out;
+	}
+	if (copy_to_sockptr(USER_SOCKPTR(optlen), &len, sizeof(int))) {
+		ret = -EFAULT;
+		goto out;
+	}
+	if (copy_to_sockptr(USER_SOCKPTR(optval), icsk->icsk_ulp_ops->name,
+			    len))
+		ret = -EFAULT;
+out:
+	release_sock(sk);
+	return ret;
+}
+
 static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
 				    char __user *optval, int __user *optlen)
 {
@@ -1415,6 +1448,7 @@ static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
 
 	switch (optname) {
 	case TCP_ULP:
+		return mptcp_getsockopt_tcp_ulp(sk, optval, optlen);
 	case TCP_CONGESTION:
 	case TCP_INFO:
 	case TCP_CC_INFO:
-- 
2.53.0


^ permalink raw reply related

* [RFC net-next 16/17] mptcp: implement ulp setsockopt for tls support
From: Geliang Tang @ 2026-06-22 10:43 UTC (permalink / raw)
  To: Matthieu Baerts, Mat Martineau, Geliang Tang, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Neal Cardwell, Kuniyuki Iwashima, John Fastabend, Sabrina Dubroca,
	Hannes Reinecke
  Cc: Geliang Tang, netdev, mptcp, Gang Yan, Zqiang
In-Reply-To: <cover.1782123118.git.tanggeliang@kylinos.cn>

From: Geliang Tang <tanggeliang@kylinos.cn>

Allow MPTCP sockets to set the TCP_ULP socket option to enable TLS.
Add mptcp_setsockopt_tcp_ulp() which validates the socket state (must
not be CLOSE or LISTEN), only accepts "tls" as the ULP name, and then
calls tcp_set_ulp().

Include TCP_ULP in the list of supported options in
mptcp_supported_sockopt(), and handle it in mptcp_setsockopt_sol_tcp()
instead of returning -EOPNOTSUPP.

Call tcp_cleanup_ulp() in mptcp_destroy(), right after
mptcp_destroy_common(), to release the ULP module's reference count.

Co-developed-by: Gang Yan <yangang@kylinos.cn>
Signed-off-by: Gang Yan <yangang@kylinos.cn>
Co-developed-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
---
 net/mptcp/protocol.c |  1 +
 net/mptcp/sockopt.c  | 34 +++++++++++++++++++++++++++++++++-
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 4951b1dd013b..a13acee67688 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -3765,6 +3765,7 @@ static void mptcp_destroy(struct sock *sk)
 	/* allow the following to close even the initial subflow */
 	msk->free_first = 1;
 	mptcp_destroy_common(msk);
+	tcp_cleanup_ulp(sk);
 	sk_sockets_allocated_dec(sk);
 }
 
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index cc45491cd3b2..eeb348336195 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -577,6 +577,7 @@ static bool mptcp_supported_sockopt(int level, int optname)
 		case TCP_FASTOPEN_CONNECT:
 		case TCP_FASTOPEN_KEY:
 		case TCP_FASTOPEN_NO_COOKIE:
+		case TCP_ULP:
 			return true;
 		}
 
@@ -830,6 +831,37 @@ static int mptcp_setsockopt_all_sf(struct mptcp_sock *msk, int level,
 	return ret;
 }
 
+static int mptcp_setsockopt_tcp_ulp(struct sock *sk, sockptr_t optval,
+				    unsigned int optlen)
+{
+	char name[TCP_ULP_NAME_MAX];
+	int err = 0;
+	size_t len;
+	int val;
+
+	if (optlen < 1)
+		return -EINVAL;
+
+	len = min_t(long, TCP_ULP_NAME_MAX - 1, optlen);
+	val = strncpy_from_sockptr(name, optval, len);
+	if (val < 0)
+		return -EFAULT;
+	name[val] = 0;
+
+	if (strcmp(name, "tls"))
+		return -EOPNOTSUPP;
+
+	sockopt_lock_sock(sk);
+	if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) {
+		err = -ENOTCONN;
+		goto out;
+	}
+	err = tcp_set_ulp(sk, name);
+out:
+	sockopt_release_sock(sk);
+	return err;
+}
+
 static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
 				    sockptr_t optval, unsigned int optlen)
 {
@@ -838,7 +870,7 @@ static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
 
 	switch (optname) {
 	case TCP_ULP:
-		return -EOPNOTSUPP;
+		return mptcp_setsockopt_tcp_ulp(sk, optval, optlen);
 	case TCP_CONGESTION:
 		return mptcp_setsockopt_sol_tcp_congestion(msk, optval, optlen);
 	case TCP_DEFER_ACCEPT:
-- 
2.53.0


^ permalink raw reply related

* [RFC net-next 17/17] selftests: mptcp: connect: use espintcp for ulp test
From: Geliang Tang @ 2026-06-22 10:43 UTC (permalink / raw)
  To: Matthieu Baerts, Mat Martineau, Geliang Tang, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Neal Cardwell, Kuniyuki Iwashima, John Fastabend, Sabrina Dubroca,
	Hannes Reinecke
  Cc: Geliang Tang, netdev, mptcp, Gang Yan, Zqiang
In-Reply-To: <cover.1782123118.git.tanggeliang@kylinos.cn>

From: Geliang Tang <tanggeliang@kylinos.cn>

With KTLS now implemented, "tls" should no longer be used in
sock_test_tcpulp(), because it breaks the mptcp_connect.sh tests. Use
another ULP name, "espintcp", instead.

Co-developed-by: Gang Yan <yangang@kylinos.cn>
Signed-off-by: Gang Yan <yangang@kylinos.cn>
Co-developed-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
---
 tools/testing/selftests/net/mptcp/config          | 4 ++++
 tools/testing/selftests/net/mptcp/mptcp_connect.c | 4 ++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/net/mptcp/config b/tools/testing/selftests/net/mptcp/config
index 59051ee2a986..f48bd5183fb3 100644
--- a/tools/testing/selftests/net/mptcp/config
+++ b/tools/testing/selftests/net/mptcp/config
@@ -34,3 +34,7 @@ CONFIG_NFT_SOCKET=m
 CONFIG_NFT_TPROXY=m
 CONFIG_SYN_COOKIES=y
 CONFIG_VETH=y
+CONFIG_INET_ESP=y
+CONFIG_INET_ESPINTCP=y
+CONFIG_INET6_ESP=y
+CONFIG_INET6_ESPINTCP=y
diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c
index cbe573c4ab3a..299a7a02d6f5 100644
--- a/tools/testing/selftests/net/mptcp/mptcp_connect.c
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c
@@ -285,11 +285,11 @@ static void sock_test_tcpulp(int sock, int proto, unsigned int line)
 	if (buflen > 0) {
 		if (strcmp(buf, "mptcp") != 0)
 			xerror("unexpected ULP '%s' for proto %d at line %u", buf, proto, line);
-		ret = do_ulp_so(sock, "tls");
+		ret = do_ulp_so(sock, "espintcp");
 		if (ret == 0)
 			X("setsockopt");
 	} else if (proto == IPPROTO_MPTCP) {
-		ret = do_ulp_so(sock, "tls");
+		ret = do_ulp_so(sock, "espintcp");
 		if (ret != -1)
 			X("setsockopt");
 	}
-- 
2.53.0


^ permalink raw reply related

* Re: [PATCH v3 0/7] Prepare mutable list iterators to cache cursor state
From: Andy Shevchenko @ 2026-06-22 10:46 UTC (permalink / raw)
  To: Kaitao Cheng
  Cc: Alexei Starovoitov, Andrew Morton, David Hildenbrand, Jens Axboe,
	Tejun Heo, Alexander Viro, Christian Brauner, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Johannes Weiner, Peter Zijlstra,
	Ingo Molnar, Arnaldo Carvalho de Melo, Namhyung Kim,
	Thomas Gleixner, Juri Lelli, Vincent Guittot, Paul Moore,
	Paul E. McKenney, Shakeel Butt, Christian König,
	David Howells, Simona Vetter, Randy Dunlap, Luca Ceresoli,
	Philipp Stanner, linux-block, LKML,
	open list:CONTROL GROUP (CGROUP), linux-ntfs-dev, Linux-Fsdevel,
	io-uring, audit, bpf, Network Development, dri-devel,
	linux-perf-use., linux-trace-kernel, kexec, live-patching,
	linux-modules, Linux Crypto Mailing List, Linux Power Management,
	rcu, sched-ext, linux-mm, virtualization, damon,
	clang-built-linux, chengkaitao, Muchun Song
In-Reply-To: <8c8f1849-86d3-4c69-be27-30bbdffdf616@linux.dev>

On Mon, Jun 22, 2026 at 02:15:01PM +0800, Kaitao Cheng wrote:
> 在 2026/6/22 13:28, Alexei Starovoitov 写道:
> > On Sun, Jun 21, 2026 at 9:06 PM Kaitao Cheng <kaitao.cheng@linux.dev> wrote:

...

> >>  block/bfq-iosched.c                 |  17 +-
> >>  block/blk-cgroup.c                  |  12 +-
> >>  block/blk-flush.c                   |   4 +-
> >>  block/blk-iocost.c                  |  18 +-
> >>  block/blk-mq.c                      |   8 +-
> >>  block/blk-throttle.c                |   4 +-
> >>  block/kyber-iosched.c               |   4 +-
> >>  block/partitions/ldm.c              |   8 +-
> >>  block/sed-opal.c                    |   4 +-
> >>  include/linux/list.h                | 269 ++++++++++++++++++++++++----
> >>  include/linux/llist.h               |  81 +++++++--
> >>  init/initramfs.c                    |   5 +-
> >>  io_uring/cancel.c                   |   6 +-
> >>  io_uring/poll.c                     |   3 +-
> >>  io_uring/rw.c                       |   4 +-
> >>  io_uring/timeout.c                  |   8 +-
> >>  io_uring/uring_cmd.c                |   3 +-
> >>  kernel/audit_tree.c                 |   4 +-
> >>  kernel/audit_watch.c                |  16 +-
> >>  kernel/auditfilter.c                |   4 +-
> >>  kernel/auditsc.c                    |   4 +-
> >>  kernel/bpf/arena.c                  |  10 +-
> >>  kernel/bpf/arraymap.c               |   8 +-
> >>  kernel/bpf/bpf_local_storage.c      |   3 +-
> >>  kernel/bpf/bpf_lru_list.c           |  25 ++-
> >>  kernel/bpf/btf.c                    |  18 +-
> >>  kernel/bpf/cgroup.c                 |   7 +-
> >>  kernel/bpf/cpumap.c                 |   4 +-
> >>  kernel/bpf/devmap.c                 |  10 +-
> >>  kernel/bpf/helpers.c                |   8 +-
> >>  kernel/bpf/local_storage.c          |   4 +-
> >>  kernel/bpf/memalloc.c               |  16 +-
> >>  kernel/bpf/offload.c                |   8 +-
> >>  kernel/bpf/states.c                 |   4 +-
> >>  kernel/bpf/stream.c                 |   4 +-
> >>  kernel/bpf/verifier.c               |   6 +-
> >>  kernel/cgroup/cgroup-v1.c           |   4 +-
> >>  kernel/cgroup/cgroup.c              |  54 +++---
> >>  kernel/cgroup/dmem.c                |  12 +-
> >>  kernel/cgroup/rdma.c                |   8 +-
> >>  kernel/events/core.c                |  44 +++--
> >>  kernel/events/uprobes.c             |  12 +-
> >>  kernel/exit.c                       |   8 +-
> >>  kernel/fail_function.c              |   4 +-
> >>  kernel/gcov/clang.c                 |   4 +-
> >>  kernel/irq_work.c                   |   4 +-
> >>  kernel/kexec_core.c                 |   4 +-
> >>  kernel/kprobes.c                    |  16 +-
> >>  kernel/livepatch/core.c             |   4 +-
> >>  kernel/livepatch/core.h             |   4 +-
> >>  kernel/liveupdate/kho_block.c       |   4 +-
> >>  kernel/liveupdate/luo_flb.c         |   4 +-
> >>  kernel/locking/rwsem.c              |   2 +-
> >>  kernel/locking/test-ww_mutex.c      |   2 +-
> >>  kernel/module/main.c                |  11 +-
> >>  kernel/padata.c                     |   4 +-
> >>  kernel/power/snapshot.c             |   8 +-
> >>  kernel/power/wakelock.c             |   4 +-
> >>  kernel/printk/printk.c              |  11 +-
> >>  kernel/ptrace.c                     |   4 +-
> >>  kernel/rcu/rcutorture.c             |   3 +-
> >>  kernel/rcu/tasks.h                  |   9 +-
> >>  kernel/rcu/tree.c                   |   6 +-
> >>  kernel/resource.c                   |   4 +-
> >>  kernel/sched/core.c                 |   4 +-
> >>  kernel/sched/ext.c                  |  22 +--
> >>  kernel/sched/fair.c                 |  28 +--
> >>  kernel/sched/topology.c             |   4 +-
> >>  kernel/sched/wait.c                 |   4 +-
> >>  kernel/seccomp.c                    |   4 +-
> >>  kernel/signal.c                     |  11 +-
> >>  kernel/smp.c                        |   4 +-
> >>  kernel/taskstats.c                  |   8 +-
> >>  kernel/time/clockevents.c           |   6 +-
> >>  kernel/time/clocksource.c           |   4 +-
> >>  kernel/time/posix-cpu-timers.c      |   4 +-
> >>  kernel/time/posix-timers.c          |   3 +-
> >>  kernel/torture.c                    |   3 +-
> >>  kernel/trace/bpf_trace.c            |   4 +-
> >>  kernel/trace/ftrace.c               |  49 +++--
> >>  kernel/trace/ring_buffer.c          |  25 ++-
> >>  kernel/trace/trace.c                |  12 +-
> >>  kernel/trace/trace_dynevent.c       |   6 +-
> >>  kernel/trace/trace_dynevent.h       |   5 +-
> >>  kernel/trace/trace_events.c         |  35 ++--
> >>  kernel/trace/trace_events_filter.c  |   4 +-
> >>  kernel/trace/trace_events_hist.c    |   8 +-
> >>  kernel/trace/trace_events_trigger.c |  17 +-
> >>  kernel/trace/trace_events_user.c    |  16 +-
> >>  kernel/trace/trace_stat.c           |   4 +-
> >>  kernel/user-return-notifier.c       |   3 +-
> >>  kernel/workqueue.c                  |  16 +-
> >>  mm/backing-dev.c                    |   8 +-
> >>  mm/balloon.c                        |   8 +-
> >>  mm/cma.c                            |   4 +-
> >>  mm/compaction.c                     |   4 +-
> >>  mm/damon/core.c                     |   4 +-
> >>  mm/damon/sysfs-schemes.c            |   4 +-
> >>  mm/dmapool.c                        |   4 +-
> >>  mm/huge_memory.c                    |   8 +-
> >>  mm/hugetlb.c                        |  56 +++---
> >>  mm/hugetlb_vmemmap.c                |  16 +-
> >>  mm/khugepaged.c                     |  14 +-
> >>  mm/kmemleak.c                       |   7 +-
> >>  mm/ksm.c                            |  25 +--
> >>  mm/list_lru.c                       |   4 +-
> >>  mm/memcontrol-v1.c                  |   8 +-
> >>  mm/memory-failure.c                 |  12 +-
> >>  mm/memory-tiers.c                   |   4 +-
> >>  mm/migrate.c                        |  23 ++-
> >>  mm/mmu_notifier.c                   |   9 +-
> >>  mm/page_alloc.c                     |   8 +-
> >>  mm/page_reporting.c                 |   2 +-
> >>  mm/percpu.c                         |  11 +-
> >>  mm/pgtable-generic.c                |   4 +-
> >>  mm/rmap.c                           |  10 +-
> >>  mm/shmem.c                          |   9 +-
> >>  mm/slab_common.c                    |  14 +-
> >>  mm/slub.c                           |  33 ++--
> >>  mm/swapfile.c                       |   4 +-
> >>  mm/userfaultfd.c                    |  12 +-
> >>  mm/vmalloc.c                        |  24 +--
> >>  mm/vmscan.c                         |   7 +-
> >>  mm/zsmalloc.c                       |   4 +-
> >>  124 files changed, 875 insertions(+), 681 deletions(-)
> > 
> > Not sure what you were thinking, but this diff stat
> > is not landable.
> 
> [PATCH v3 1/7] and [PATCH v3 2/7] contain the main logic and can
> be merged directly. They are also compatible with the old API.
> [PATCH v3 3/7] through [PATCH v3 7/7] are just simple interface
> replacements and do not change any functional logic. They can be
> left unmerged for now; individual modules can pick them up later
> if needed.
> 
> In v2, Andy Shevchenko mentioned: "If it's done by Linus himself
> during the day when he prepares -rc1, it's fine."

Yes, but you need to get his blessing first to go with this.
Have you communicated with him on this?

> Even so, the
> changes in this patch series are indeed quite large and touch
> almost every subsystem. I have only converted part of them for
> now, so I wanted to send this out first and see what people think.

That's why it's better to provide a script to convert (e.g., coccinelle)
instead of tons of patches.

-- 
With Best Regards,
Andy Shevchenko



^ permalink raw reply

* Re: [Intel-wired-lan] [PATCH net v6 0/4] Fix i40e/ice/iavf VF bonding after netdev lock changes
From: Jose Ignacio Tornos Martinez @ 2026-06-22 10:51 UTC (permalink / raw)
  To: horms
  Cc: aleksandr.loktionov, anthony.l.nguyen, davem, edumazet,
	intel-wired-lan, jacob.e.keller, jesse.brandeburg, jtornosm, kuba,
	netdev, pabeni, przemyslaw.kitszel
In-Reply-To: <20260622103159.GZ827683@horms.kernel.org>

Hello Simon,

> Unfortunately the Netdev CI was unable to apply this series cleanly against net.
> Would you be able to rebase and repost?
Sure, I'll rebase on current net and send v7 shortly.

Thanks

Best regards
José Ignacio


^ permalink raw reply

* Re: [PATCH net] octeontx2-af: Block VFs from clobbering special CGX PKIND state
From: Simon Horman @ 2026-06-22 10:51 UTC (permalink / raw)
  To: rkannoth
  Cc: Simon Horman, davem, gakula, linux-kernel, netdev, sgoutham,
	andrew+netdev, edumazet, kuba, pabeni, hkelam
In-Reply-To: <20260619041002.1773822-1-rkannoth@marvell.com>

This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.
Full review at: https://sashiko.dev
---
(Note: Due to recitation restrictions, the standard unified diff is omitted 
and the findings are summarized below instead of inline quoting.)

commit 40480a8afd593111da0b00679b782cbe187572e6
Author: Hariprasad Kelam <hkelam@marvell.com>

octeontx2-af: Block VFs from clobbering special CGX PKIND state

This commit adds checks to prevent VFs from overwriting LMAC RX PKIND or
default TX parse configs when a shared LMAC uses HIGIG2 or EDSA parsing.
It introduces helpers to verify if the PKIND configuration is permitted
before applying updates.

[Severity: High]
Can a Time-of-Check to Time-of-Use (TOCTOU) race occur between a VF checking
the LMAC PKIND state and subsequently updating it?

Looking at rvu_cgx_is_pkind_config_permitted() in rvu_cgx.c, it reads the
hardware LMAC PKIND register locklessly to check if HIGIG or EDSA parsing
is enabled.

If the check returns true, the VF proceeds to write its own default PKIND
to the LMAC via cgx_set_pkind() in nix_interface_init(), or updates the TX
parse config in rvu_mbox_handler_nix_lf_alloc() in rvu_nix.c.

Since VF mailbox messages are handled by rvu_afvf_mbox_handler() on a separate
workqueue without holding rvu->mbox_lock, could a PF executing a mailbox
message like rvu_mbox_handler_npc_set_pkind interleave between the VF's
lockless read and its subsequent write?

If so, the VF might observe the default PKIND, the PF could then set
HIGIG_PKIND or EDSA_PKIND, and the VF would then erroneously overwrite the
PF's configuration. Does this sequence need to be protected by a lock such
as cgx_cfg_lock or rsrc_lock to prevent bypassing the new protection?

^ permalink raw reply

* RE: [PATCH net v3] rtase: Workaround for TX hang caused by short UDP packets entering hardware PTP parsing
From: Justin Lai @ 2026-06-22 10:46 UTC (permalink / raw)
  To: Simon Horman
  Cc: kuba@kernel.org, davem@davemloft.net, edumazet@google.com,
	pabeni@redhat.com, andrew+netdev@lunn.ch,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	stable@vger.kernel.org, richardcochran@gmail.com,
	david.laight.linux@gmail.com, aleksander.lobakin@intel.com,
	Ping-Ke Shih, Larry Chiu
In-Reply-To: <20260617085416.841731-2-horms@kernel.org>

Simon Horman <horms@kernel.org> wrote:
> 
> This is an AI-generated review of your patch. The human sending this
> email has considered the AI review valid, or at least plausible.
> Full review at: https://netdev-ai.bots.linux.dev/sashiko/
> ---
> rtase: Workaround for TX hang caused by short UDP packets entering
> hardware PTP parsing
> 
> This patch adds a workaround for a hardware TX hang that occurs when
> short UDP packets matching destination ports 319/320 enter a PTP parsing
> path that reads beyond the packet end. The workaround pads affected
> packets so the transport data reaches RTASE_MIN_PAD_LEN before
> transmission.
> 
> > diff --git a/drivers/net/ethernet/realtek/rtase/rtase_main.c
> b/drivers/net/ethernet/realtek/rtase/rtase_main.c
> > index 55105d34bc79..4c295a39c7a0 100644
> > --- a/drivers/net/ethernet/realtek/rtase/rtase_main.c
> > +++ b/drivers/net/ethernet/realtek/rtase/rtase_main.c
> 
> [ ... ]
> 
> > +static bool rtase_get_udp_offset(struct sk_buff *skb, u32 *udp_offset)
> > +{
> > +     int no = skb_network_offset(skb);
> > +     struct ipv6hdr *i6h, _i6h;
> > +     struct iphdr *ih, _ih;
> > +
> > +     switch (vlan_get_protocol(skb)) {
> > +     case htons(ETH_P_IP):
> > +             ih = skb_header_pointer(skb, no, sizeof(_ih), &_ih);
> > +             if (!ih)
> > +                     return false;
> > +
> > +             if (ih->ihl < 5)
> > +                     return false;
> > +
> > +             if (ih->protocol != IPPROTO_UDP)
> > +                     return false;
> > +
> > +             *udp_offset = no + ih->ihl * 4;
> 
> [Low]
> Should ih->ihl also be checked against the actual data present in the
> skb? With ihl up to 15 (60 bytes), if the skb only contains the basic
> 20-byte IP header, the computed udp_offset can exceed skb->len.
> 
> Downstream in rtase_skb_pad():
> 
>         trans_data_len = skb->len - udp_offset;
> 
> trans_data_len is u32, so when udp_offset > skb->len this underflows to
> a very large value, which trivially satisfies:
> 
>         trans_data_len >= RTASE_MIN_PAD_LEN
> 
> and rtase_skb_pad() silently skips the padding. A malformed packet from
> an AF_PACKET sender with CAP_NET_RAW could craft a mismatched ihl and
> bypass the workaround. Would clamping or validating udp_offset against
> skb->len before the subtraction be appropriate?
> 
Thanks for pointing this out.
 
I'll add a check for udp_offset > skb->len before calculating
trans_data_len to avoid the potential underflow.
 
If the computed UDP offset is beyond the packet length, the packet is
malformed and cannot be safely handled by this workaround, so it should
not be transmitted.

> > +
> > +             return true;
> > +     case htons(ETH_P_IPV6):
> > +             i6h = skb_header_pointer(skb, no, sizeof(_i6h), &_i6h);
> > +             if (!i6h)
> > +                     return false;
> > +
> > +             if (i6h->nexthdr != IPPROTO_UDP)
> > +                     return false;
> > +
> > +             *udp_offset = no + sizeof(*i6h);
> > +
> > +             return true;
> > +     default:
> > +             return false;
> > +     }
> > +}
> > +
> > +static bool rtase_skb_pad(struct sk_buff *skb)
> > +{
> > +     __be16 *dest, _dest;
> > +     u32 trans_data_len;
> > +     u32 udp_offset;
> > +     u16 dest_port;
> > +     u32 pad_len;
> > +
> > +     if (!rtase_get_udp_offset(skb, &udp_offset))
> > +             return true;
> > +
> > +     trans_data_len = skb->len - udp_offset;
> > +     if (trans_data_len < offsetof(struct udphdr, len) ||
> > +         trans_data_len >= RTASE_MIN_PAD_LEN)
> > +             return true;
> 
> [Low]
> When trans_data_len is less than offsetof(struct udphdr, len) (i.e., the
> transport region is under 4 bytes), this returns true and skips the
> padding entirely.
> 
> The commit message states the hardware "may access data beyond the end
> of the packet" during PTP parsing. If the hardware also reads out of
> bounds while classifying the packet, an IPv4 last fragment with 1-3
> trailing bytes could still be misclassified as PTP based on whatever
> garbage memory the hardware happens to read, and the workaround would
> be bypassed.
> 
> The implicit assumption here seems to be "if the driver cannot read the
> dest port from the skb, the hardware cannot classify it either", which
> appears to contradict the premise that the hardware reads beyond the
> packet boundary. Should these very short transport-data cases also be
> padded to be safe?
> 


Thanks for the review.

The out-of-bounds access occurs after the packet has already matched
the PTP classification criteria (UDP destination port 319/320) and
enters the hardware PTP parsing path.

If the transport data is shorter than the destination port field
itself, the packet cannot match the affected PTP classification path.

Thanks,
Justin

> > +
> > +     dest = skb_header_pointer(skb,
> > +                               udp_offset + offsetof(struct udphdr,
> dest),
> > +                               sizeof(_dest), &_dest);
> 
> [ ... ]

^ permalink raw reply

* [PATCH net] net: do not acquire dev->tx_global_lock in netdev_watchdog_up()
From: Eric Dumazet @ 2026-06-22 11:01 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, netdev, eric.dumazet, Eric Dumazet,
	Marek Szyprowski

Marek Szyprowski reported a deadlock during system resume when the
virtio_net driver is used.

The deadlock occurs because netif_device_attach() is called while holding
dev->tx_global_lock (via netif_tx_lock_bh() in virtnet_restore_up()).
netif_device_attach() calls netdev_watchdog_up(), which now also tries
to acquire dev->tx_global_lock to synchronize with dev_watchdog().

This recursive lock acquisition results in a deadlock.

Fix this by removing the tx_global_lock acquisition from netdev_watchdog_up().

The critical state (watchdog_timer and watchdog_ref_held) is already
protected by dev->watchdog_lock, which was introduced in the blamed commit.

Fixes: 8eed5519e496 ("net: watchdog: fix refcount tracking races")
Reported-by: Marek Szyprowski <m.szyprowski@samsung.com>
Closes: https://lore.kernel.org/netdev/a443376e-5187-4268-93b3-58047ef113a8@samsung.com/
Signed-off-by: Eric Dumazet <edumazet@google.com>
Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
---
 net/sched/sch_generic.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 3f1c510df850dbdbaf10d483547c7b1f3a5d5482..ef2b4bf51564173751c74fefe17e3913ed2fa056 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -594,9 +594,8 @@ void netdev_watchdog_up(struct net_device *dev)
 		return;
 	if (dev->watchdog_timeo <= 0)
 		dev->watchdog_timeo = 5*HZ;
-	spin_lock_bh(&dev->tx_global_lock);
 
-	spin_lock(&dev->watchdog_lock);
+	spin_lock_bh(&dev->watchdog_lock);
 	if (!mod_timer(&dev->watchdog_timer,
 		       round_jiffies(jiffies + dev->watchdog_timeo))) {
 		if (!dev->watchdog_ref_held) {
@@ -605,9 +604,7 @@ void netdev_watchdog_up(struct net_device *dev)
 			dev->watchdog_ref_held = true;
 		}
 	}
-	spin_unlock(&dev->watchdog_lock);
-
-	spin_unlock_bh(&dev->tx_global_lock);
+	spin_unlock_bh(&dev->watchdog_lock);
 }
 EXPORT_SYMBOL_GPL(netdev_watchdog_up);
 
-- 
2.55.0.rc0.786.g65d90a0328-goog


^ permalink raw reply related

* RE: [PATCH net v3] rtase: Workaround for TX hang caused by short UDP packets entering hardware PTP parsing
From: Justin Lai @ 2026-06-22 10:56 UTC (permalink / raw)
  To: Simon Horman
  Cc: kuba@kernel.org, davem@davemloft.net, edumazet@google.com,
	pabeni@redhat.com, andrew+netdev@lunn.ch,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	stable@vger.kernel.org, richardcochran@gmail.com,
	david.laight.linux@gmail.com, aleksander.lobakin@intel.com,
	Ping-Ke Shih, Larry Chiu
In-Reply-To: <20260617085942.843250-2-horms@kernel.org>

Simon Horman <horms@kernel.org> wrote:
> 
> This is an AI-generated review of your patch. The human sending this
> email has considered the AI review valid, or at least plausible.
> Full review at: https://sashiko.dev
> ---
> rtase: Workaround for TX hang caused by short UDP packets entering
> hardware PTP parsing
> 
> The patch adds a workaround for a hardware bug in rtase where short UDP
> packets triggering PTP parsing can cause a TX hang due to out-of-bounds
> reads.
> It pads affected packets so the transport data reaches a minimum length.
> 
> Cc: stable@vger.kernel.org
> 
> > diff --git a/drivers/net/ethernet/realtek/rtase/rtase_main.c
> b/drivers/net/ethernet/realtek/rtase/rtase_main.c
> > index 55105d34bc797..4c295a39c7a0c 100644
> > --- a/drivers/net/ethernet/realtek/rtase/rtase_main.c
> > +++ b/drivers/net/ethernet/realtek/rtase/rtase_main.c
> [ ... ]
> > @@ -1249,6 +1250,81 @@ static u32 rtase_tx_csum(struct sk_buff *skb,
> const struct net_device *dev)
> >       return csum_cmd;
> >  }
> >
> > +static bool rtase_get_udp_offset(struct sk_buff *skb, u32 *udp_offset)
> > +{
> > +     int no = skb_network_offset(skb);
> > +     struct ipv6hdr *i6h, _i6h;
> > +     struct iphdr *ih, _ih;
> > +
> > +     switch (vlan_get_protocol(skb)) {
> > +     case htons(ETH_P_IP):
> 
> [Severity: High]
> Does relying on vlan_get_protocol(skb) allow an unprivileged user to bypass
> this workaround?
> 
> If an attacker uses an AF_PACKET/SOCK_RAW socket to inject a raw Ethernet
> frame containing an IPv4/UDP packet, they could set skb->protocol to an
> arbitrary value like htons(ETH_P_ALL). The switch statement will fall
> through to the default case and return false, causing rtase_skb_pad() to
> skip the padding.
> 
> Since the hardware independently parses the raw frame based on the
> EtherType,
> could it still process the unpadded packet and trigger the TX hang?
> 

Thanks for pointing this out.
 
Since the hardware parser operates on the actual Ethernet frame
contents, relying on skb protocol metadata is not ideal for this
workaround.
 
I'll update the code to derive the L3 protocol and network offset from
the Ethernet/VLAN headers in the skb data before parsing IPv4/IPv6.
This makes the software check match what the hardware parser sees and
avoids depending on skb protocol metadata.

> > +             ih = skb_header_pointer(skb, no, sizeof(_ih), &_ih);
> > +             if (!ih)
> > +                     return false;
> [ ... ]
> > +static bool rtase_skb_pad(struct sk_buff *skb)
> > +{
> > +     __be16 *dest, _dest;
> > +     u32 trans_data_len;
> > +     u32 udp_offset;
> > +     u16 dest_port;
> > +     u32 pad_len;
> > +
> > +     if (!rtase_get_udp_offset(skb, &udp_offset))
> > +             return true;
> > +
> > +     trans_data_len = skb->len - udp_offset;
> > +     if (trans_data_len < offsetof(struct udphdr, len) ||
> > +         trans_data_len >= RTASE_MIN_PAD_LEN)
> > +             return true;
> 
> [Severity: High]
> Does this logic inadvertently skip padding for underflowed lengths and tiny
> payloads?
> 
> If the IP header is malformed such that skb->len < udp_offset, the unsigned
> subtraction for trans_data_len will underflow to a large positive value.
> This causes trans_data_len >= RTASE_MIN_PAD_LEN to be true, bypassing the
> padding.
> 
> Additionally, if the packet has fewer than 4 bytes of UDP payload, meaning
> trans_data_len < offsetof(struct udphdr, len), it explicitly returns true.
> 
> Since the hardware blindly fetches the UDP destination port at
> udp_offset + 2 via DMA, won't transmitting these short or malformed packets
> without padding still expose the hardware to out-of-bounds reads?
> 

Thanks for the review.
 
I'll add a check for udp_offset > skb->len before calculating
trans_data_len to avoid the potential underflow.
 
If the computed UDP offset is beyond the packet length, the packet is
malformed and cannot be safely handled by this workaround, so it should
not be transmitted.
 
The out-of-bounds access occurs after the packet has already matched
the PTP classification criteria (UDP destination port 319/320) and
enters the hardware PTP parsing path.
 
If the transport data is shorter than the destination port field
itself, the packet cannot match the affected PTP classification path.

> > +
> > +     dest = skb_header_pointer(skb,
> > +                               udp_offset + offsetof(struct udphdr,
> dest),
> > +                               sizeof(_dest), &_dest);
> [ ... ]
> > @@ -1362,6 +1438,9 @@ static netdev_tx_t rtase_start_xmit(struct sk_buff
> *skb,
> >               opts2 |= rtase_tx_csum(skb, dev);
> >       }
> >
> > +     if (!rtase_skb_pad(skb))
> > +             goto err_dma_0;
> > +
> >       frags = rtase_xmit_frags(ring, skb, opts1, opts2);
> >       if (unlikely(frags < 0))
> >               goto err_dma_0;
> 
> [Severity: High]
> This is a pre-existing issue, but does the err_dma_1 error path in
> rtase_start_xmit() contain a double free of the skb?
> 
> When transmitting a fragmented packet, rtase_xmit_frags() maps the
> fragments and assigns ring->skbuff[entry] = skb for the last fragment
> descriptor. Back in rtase_start_xmit(), if dma_map_single() fails for the
> linear part, it jumps to err_dma_1:
> 
> err_dma_1:
>         ring->skbuff[entry] = NULL;
>         rtase_tx_clear_range(ring, ring->cur_idx + 1, frags);
> 
> err_dma_0:
>         tp->stats.tx_dropped++;
>         dev_kfree_skb_any(skb);
>         return NETDEV_TX_OK;
> 
> rtase_tx_clear_range() iterates over the mapped fragments and calls
> dev_kfree_skb_any() when it finds the skb pointer. Execution then falls
> through to err_dma_0, which unconditionally calls dev_kfree_skb_any(skb) a
> second time on the same skb pointer.

Thanks for pointing this out.

This appears to be a pre-existing issue and is unrelated to the change
in this patch.

I'll investigate the reported double free scenario further and address
it separately if a fix is needed.

Thanks,
Justin

^ permalink raw reply

* Re: [patch V2 18/25] timekeeping: Prepare for cross timestamps on arbitrary clock IDs
From: Thomas Gleixner @ 2026-06-22 11:07 UTC (permalink / raw)
  To: David Woodhouse, LKML
  Cc: Miroslav Lichvar, John Stultz, Stephen Boyd, Anna-Maria Behnsen,
	Frederic Weisbecker, thomas.weissschuh, Arthur Kiyanovski,
	Rodolfo Giometti, Vincent Donnefort, Marc Zyngier, Oliver Upton,
	kvmarm, Oliver Upton, Richard Cochran, netdev, Takashi Iwai,
	Miri Korenblit, Johannes Berg, Jacob Keller, Tony Nguyen,
	Saeed Mahameed, Peter Hilber, Michael S. Tsirkin, virtualization,
	linux-wireless, linux-sound, Vadim Fedorenko
In-Reply-To: <b296182e2e2c1ed2fe1c4879fd6f12d67a7ad22f.camel@infradead.org>

On Mon, Jun 22 2026 at 09:55, David Woodhouse wrote:
> On Fri, 2026-05-29 at 22:01 +0200, Thomas Gleixner wrote:
>> From: Thomas Gleixner <tglx@kernel.org>
>> 
>> PTP device system crosstime stamps support only CLOCK_REALTIME, which is
>> meaningless for AUX clocks. The PTP core hands in the clock ID already, so
>> prepare the core code to honor it.
>> 
>>  - Add a new sys_systime field to struct system_device_crosststamp which
>>    aliases the sys_realtime field. Once all users are converted
>>    sys_realtime can be removed.
>> 
>>  - Prepare get_device_system_crosststamp() and the related code for it by
>>    switching to sys_systime and providing the initial changes to utilize
>>    different time keepers.
>> 
>> No functional change intended.
>
> We ended up with ktime_get_snapshot_id() also supporting CLOCK_BOOTTIME
> and CLOCK_MONOTONIC_RAW, but not get_device_system_crosststamp().
> Should we make that consistent?

Maybe. The BOOTTIME support is only there for that ARM64 hyper trace muck,
but has no other relevance.

MONORAW is there for the PTP EXTENDED IOCTL, but with PRECISE the
snapshot already contains the raw value and you'd have to prevent the
historical adjustment part for RAW. So I don't see the actual value, but
I don't have a strong opinion either.

Thanks,

        tglx




^ permalink raw reply

* [PATCH] fsl/fman: Free init resources on KeyGen failure in fman_init()
From: Haoxiang Li @ 2026-06-22 11:16 UTC (permalink / raw)
  To: madalin.bucur, sean.anderson, andrew+netdev, davem, edumazet,
	kuba, pabeni, florinel.iordache
  Cc: netdev, linux-kernel, Haoxiang Li, stable

fman_muram_alloc() allocates initialization resources before
initializing the KeyGen block. If keygen_init() fails, the
function returns -EINVAL directly and leaves those resources
allocated. Free the initialization resources before returning
from the KeyGen failure path.

While at it, drop the unnecessary error check around enable(), which
always returns 0.

Fixes: 7472f4f281d0 ("fsl/fman: enable FMan Keygen")
Cc: stable@kernel.org
Signed-off-by: Haoxiang Li <haoxiang_li2024@163.com>
---
 drivers/net/ethernet/freescale/fman/fman.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/freescale/fman/fman.c b/drivers/net/ethernet/freescale/fman/fman.c
index 013273a2de32..3a2a57207e55 100644
--- a/drivers/net/ethernet/freescale/fman/fman.c
+++ b/drivers/net/ethernet/freescale/fman/fman.c
@@ -1995,12 +1995,12 @@ static int fman_init(struct fman *fman)
 
 	/* Init KeyGen */
 	fman->keygen = keygen_init(fman->kg_regs);
-	if (!fman->keygen)
+	if (!fman->keygen) {
+		free_init_resources(fman);
 		return -EINVAL;
+	}
 
-	err = enable(fman, cfg);
-	if (err != 0)
-		return err;
+	enable(fman, cfg);
 
 	enable_time_stamp(fman);
 
-- 
2.25.1


^ permalink raw reply related

* [PATCH net] veth: fix NAPI leak in XDP enable error path
From: Eric Dumazet @ 2026-06-22 11:18 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, netdev, eric.dumazet, Eric Dumazet, Guenter Roeck,
	Björn Töpel, Daniel Borkmann, Ilias Apalodimas,
	Michael S. Tsirkin, Tariq Toukan

During XDP enablement in veth, if xdp_rxq_info_reg() or
xdp_rxq_info_reg_mem_model() fails, the driver rolls back the changes.

However, the rollback loop:
	for (i--; i >= start; i--) {

decrements the loop index 'i' before the first iteration. This
correctly skips unregistering the rxq for the failed index 'i' (as
registration failed or was already cleaned up), but it also
erroneously skips calling netif_napi_del() for rq[i].xdp_napi.

Since netif_napi_add() was already called for index 'i', this leaves
a dangling napi_struct in the device's napi_list. When the veth
device is later destroyed, the freed queue memory (which contains the
leaked NAPI structure) can be reused.

The subsequent device teardown iterates the NAPI list and
corrupts the reallocated memory, leading to UAF.

Fix this by explicitly deleting the NAPI association for the failed
index 'i' before rolling back the successfully configured queues.

Fixes: b02e5a0ebb17 ("xsk: Propagate napi_id to XDP socket Rx path")
Reported-by: Guenter Roeck <groeck@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Björn Töpel <bjorn.topel@intel.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Tariq Toukan <tariqt@nvidia.com>
---
 drivers/net/veth.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 0cfb19b760dd54eb896f469c02bb02ecf5eef504..1c5142149175369a642342849addfbb9c07404bc 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -1137,6 +1137,8 @@ static int veth_enable_xdp_range(struct net_device *dev, int start, int end,
 err_reg_mem:
 	xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);
 err_rxq_reg:
+	if (!napi_already_on)
+		netif_napi_del(&priv->rq[i].xdp_napi);
 	for (i--; i >= start; i--) {
 		struct veth_rq *rq = &priv->rq[i];

-- 
2.55.0.rc0.786.g65d90a0328-goog

^ permalink raw reply related

* RE: [PATCH net v6 4/4] ice: skip unnecessary VF reset when setting trust
From: Loktionov, Aleksandr @ 2026-06-22 11:18 UTC (permalink / raw)
  To: Jose Ignacio Tornos Martinez, netdev@vger.kernel.org
  Cc: intel-wired-lan@lists.osuosl.org, Kitszel, Przemyslaw,
	Keller, Jacob E, horms@kernel.org, jesse.brandeburg@intel.com,
	Nguyen, Anthony L, davem@davemloft.net, edumazet@google.com,
	kuba@kernel.org, pabeni@redhat.com
In-Reply-To: <20260619061321.8554-5-jtornosm@redhat.com>



> -----Original Message-----
> From: Jose Ignacio Tornos Martinez <jtornosm@redhat.com>
> Sent: Friday, June 19, 2026 8:13 AM
> To: netdev@vger.kernel.org
> Cc: intel-wired-lan@lists.osuosl.org; Kitszel, Przemyslaw
> <przemyslaw.kitszel@intel.com>; Loktionov, Aleksandr
> <aleksandr.loktionov@intel.com>; Keller, Jacob E
> <jacob.e.keller@intel.com>; horms@kernel.org;
> jesse.brandeburg@intel.com; Nguyen, Anthony L
> <anthony.l.nguyen@intel.com>; davem@davemloft.net;
> edumazet@google.com; kuba@kernel.org; pabeni@redhat.com; Jose Ignacio
> Tornos Martinez <jtornosm@redhat.com>
> Subject: [PATCH net v6 4/4] ice: skip unnecessary VF reset when
> setting trust
> 
> Similar to the i40e fix, ice_set_vf_trust() unconditionally calls
> ice_reset_vf() when the trust setting changes. While the delay is
> smaller than i40e, this reset is still unnecessary in most cases.
> 
> When granting trust, no reset is needed - we can just set the
> capability flag to allow privileged operations.
> 
> When revoking trust, we only need to reset (conservative approach) if
> the VF has actually configured advanced features that require cleanup
> (MAC LLDP filters, promiscuous mode). For VFs in a clean state, we can
> safely change the trust setting without the disruptive reset.
> 
> When we do reset, we maintain the original ice pattern that has been
> reliable in production: cleanup LLDP filters first, then set vf-
> >trusted, then reset. This ensures the privilege capability bit is
> handled correctly during reset rebuild.
> 
> When we don't reset, we manually handle the capability flag via helper
> function, eliminating the delay.
> 
> Signed-off-by: Jose Ignacio Tornos Martinez <jtornosm@redhat.com>
> ---
> v6: AI review identified issues with v5's reset-before-cleanup
> approach. Revert
>     to original reset procedure (cleanup before reset) which has
> proven reliable,
>     just adding the conditional check to skip reset when VF has no
> advanced
>     features configured.
> v5: https://lore.kernel.org/all/20260429102426.210750-5-
> jtornosm@redhat.com/
> 
>  drivers/net/ethernet/intel/ice/ice_sriov.c | 33 +++++++++++++++++++--
> -
>  1 file changed, 29 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/ice/ice_sriov.c
> b/drivers/net/ethernet/intel/ice/ice_sriov.c
> index 7e00e091756d..XXXXXXXXXXXXXXXX 100644
> --- a/drivers/net/ethernet/intel/ice/ice_sriov.c
> +++ b/drivers/net/ethernet/intel/ice/ice_sriov.c
> @@ -1364,6 +1364,23 @@ int ice_set_vf_mac(struct net_device *netdev,
> int vf_id, u8 *mac)
>  	return __ice_set_vf_mac(ice_netdev_to_pf(netdev), vf_id, mac);
> }
> 
> +/**
> + * ice_setup_vf_trust - Enable/disable VF trust mode without reset
> + * @vf: VF to configure
> + * @setting: trust setting
> + *
> + * Update VF flags when changing trust without performing a VF reset.
> + * This is only called when it's safe to skip the reset (VF has no
> +advanced
> + * features configured that need cleanup).
> + */
> +static void ice_setup_vf_trust(struct ice_vf *vf, bool setting) {
> +	if (setting)
> +		set_bit(ICE_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps);
> +	else
> +		clear_bit(ICE_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps);
> }
> +
>  /**
>   * ice_set_vf_trust
>   * @netdev: network interface device structure @@ -1399,11 +1416,19
> @@ int ice_set_vf_trust(struct net_device *netdev, int vf_id, bool
> trusted)
> 
>  	mutex_lock(&vf->cfg_lock);
> 
> -	while (!trusted && vf->num_mac_lldp)
> -		ice_vf_update_mac_lldp_num(vf, ice_get_vf_vsi(vf),
> false);
> -
> +	/* Reset only if revoking trust and VF has advanced features
> configured */
> +	if (!trusted &&
> +	    (vf->num_mac_lldp > 0 ||
> +	     test_bit(ICE_VF_STATE_UC_PROMISC, vf->vf_states) ||
> +	     test_bit(ICE_VF_STATE_MC_PROMISC, vf->vf_states))) {
> +		while (vf->num_mac_lldp)
> +			ice_vf_update_mac_lldp_num(vf,
> ice_get_vf_vsi(vf), false);
> +		vf->trusted = trusted;
> +		ice_reset_vf(vf, ICE_VF_RESET_NOTIFY);
> +	} else {
> +		vf->trusted = trusted;
> +		ice_setup_vf_trust(vf, trusted);
> +	}
> -	vf->trusted = trusted;
> -	ice_reset_vf(vf, ICE_VF_RESET_NOTIFY);
>  	dev_info(ice_pf_to_dev(pf), "VF %u is now %strusted\n",
>  		 vf_id, trusted ? "" : "un");
> 
> --
> 2.43.0

Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>


^ permalink raw reply

* RE: [PATCH net v6 4/4] ice: skip unnecessary VF reset when setting trust
From: Loktionov, Aleksandr @ 2026-06-22 11:19 UTC (permalink / raw)
  To: Jose Ignacio Tornos Martinez, netdev@vger.kernel.org
  Cc: intel-wired-lan@lists.osuosl.org, Kitszel, Przemyslaw,
	Keller, Jacob E, horms@kernel.org, jesse.brandeburg@intel.com,
	Nguyen, Anthony L, davem@davemloft.net, edumazet@google.com,
	kuba@kernel.org, pabeni@redhat.com
In-Reply-To: <20260619061321.8554-5-jtornosm@redhat.com>



> -----Original Message-----
> From: Jose Ignacio Tornos Martinez <jtornosm@redhat.com>
> Sent: Friday, June 19, 2026 8:13 AM
> To: netdev@vger.kernel.org
> Cc: intel-wired-lan@lists.osuosl.org; Kitszel, Przemyslaw
> <przemyslaw.kitszel@intel.com>; Loktionov, Aleksandr
> <aleksandr.loktionov@intel.com>; Keller, Jacob E
> <jacob.e.keller@intel.com>; horms@kernel.org;
> jesse.brandeburg@intel.com; Nguyen, Anthony L
> <anthony.l.nguyen@intel.com>; davem@davemloft.net;
> edumazet@google.com; kuba@kernel.org; pabeni@redhat.com; Jose Ignacio
> Tornos Martinez <jtornosm@redhat.com>
> Subject: [PATCH net v6 4/4] ice: skip unnecessary VF reset when
> setting trust
> 
> Similar to the i40e fix, ice_set_vf_trust() unconditionally calls
> ice_reset_vf() when the trust setting changes. While the delay is
> smaller than i40e, this reset is still unnecessary in most cases.
> 
> When granting trust, no reset is needed - we can just set the
> capability flag to allow privileged operations.
> 
> When revoking trust, we only need to reset (conservative approach) if
> the VF has actually configured advanced features that require cleanup
> (MAC LLDP filters, promiscuous mode). For VFs in a clean state, we can
> safely change the trust setting without the disruptive reset.
> 
> When we do reset, we maintain the original ice pattern that has been
> reliable in production: cleanup LLDP filters first, then set vf-
> >trusted, then reset. This ensures the privilege capability bit is
> handled correctly during reset rebuild.
> 
> When we don't reset, we manually handle the capability flag via helper
> function, eliminating the delay.
> 
> Signed-off-by: Jose Ignacio Tornos Martinez <jtornosm@redhat.com>
> ---
> v6: AI review identified issues with v5's reset-before-cleanup
> approach. Revert
>     to original reset procedure (cleanup before reset) which has
> proven reliable,
>     just adding the conditional check to skip reset when VF has no
> advanced
>     features configured.
> v5: https://lore.kernel.org/all/20260429102426.210750-5-
> jtornosm@redhat.com/
> 
>  drivers/net/ethernet/intel/ice/ice_sriov.c | 33 +++++++++++++++++++--
> -
>  1 file changed, 29 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/ice/ice_sriov.c
> b/drivers/net/ethernet/intel/ice/ice_sriov.c
> index 7e00e091756d..XXXXXXXXXXXXXXXX 100644
> --- a/drivers/net/ethernet/intel/ice/ice_sriov.c
> +++ b/drivers/net/ethernet/intel/ice/ice_sriov.c
> @@ -1364,6 +1364,23 @@ int ice_set_vf_mac(struct net_device *netdev,
> int vf_id, u8 *mac)
>  	return __ice_set_vf_mac(ice_netdev_to_pf(netdev), vf_id, mac);
> }
> 
> +/**
> + * ice_setup_vf_trust - Enable/disable VF trust mode without reset
> + * @vf: VF to configure
> + * @setting: trust setting
> + *
> + * Update VF flags when changing trust without performing a VF reset.
> + * This is only called when it's safe to skip the reset (VF has no
> +advanced
> + * features configured that need cleanup).
> + */
> +static void ice_setup_vf_trust(struct ice_vf *vf, bool setting) {
> +	if (setting)
> +		set_bit(ICE_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps);
> +	else
> +		clear_bit(ICE_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps);
> }
> +
>  /**
>   * ice_set_vf_trust
>   * @netdev: network interface device structure @@ -1399,11 +1416,19
> @@ int ice_set_vf_trust(struct net_device *netdev, int vf_id, bool
> trusted)
> 
>  	mutex_lock(&vf->cfg_lock);
> 
> -	while (!trusted && vf->num_mac_lldp)
> -		ice_vf_update_mac_lldp_num(vf, ice_get_vf_vsi(vf),
> false);
> -
> +	/* Reset only if revoking trust and VF has advanced features
> configured */
> +	if (!trusted &&
> +	    (vf->num_mac_lldp > 0 ||
> +	     test_bit(ICE_VF_STATE_UC_PROMISC, vf->vf_states) ||
> +	     test_bit(ICE_VF_STATE_MC_PROMISC, vf->vf_states))) {
> +		while (vf->num_mac_lldp)
> +			ice_vf_update_mac_lldp_num(vf,
> ice_get_vf_vsi(vf), false);
> +		vf->trusted = trusted;
> +		ice_reset_vf(vf, ICE_VF_RESET_NOTIFY);
> +	} else {
> +		vf->trusted = trusted;
> +		ice_setup_vf_trust(vf, trusted);
> +	}
> -	vf->trusted = trusted;
> -	ice_reset_vf(vf, ICE_VF_RESET_NOTIFY);
>  	dev_info(ice_pf_to_dev(pf), "VF %u is now %strusted\n",
>  		 vf_id, trusted ? "" : "un");
> 
> --
> 2.43.0

Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>

^ permalink raw reply

* Re: [PATCH v3 0/7] Prepare mutable list iterators to cache cursor state
From: David Hildenbrand (Arm) @ 2026-06-22 11:27 UTC (permalink / raw)
  To: Alexei Starovoitov, Kaitao Cheng
  Cc: Andrew Morton, Jens Axboe, Tejun Heo, Alexander Viro,
	Christian Brauner, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, Johannes Weiner, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Namhyung Kim, Thomas Gleixner,
	Juri Lelli, Vincent Guittot, Paul Moore, Andy Shevchenko,
	Paul E. McKenney, Shakeel Butt, Christian König,
	David Howells, Simona Vetter, Randy Dunlap, Luca Ceresoli,
	Philipp Stanner, linux-block, LKML,
	open list:CONTROL GROUP (CGROUP), linux-ntfs-dev, Linux-Fsdevel,
	io-uring, audit, bpf, Network Development, dri-devel,
	linux-perf-use., linux-trace-kernel, kexec, live-patching,
	linux-modules, Linux Crypto Mailing List, Linux Power Management,
	rcu, sched-ext, linux-mm, virtualization, damon,
	clang-built-linux, chengkaitao
In-Reply-To: <CAADnVQJmPWFT01b7DuLdtafv=8FyB84GYHNZ8zSTck+9Aw0JpA@mail.gmail.com>

On 6/22/26 07:28, Alexei Starovoitov wrote:
> On Sun, Jun 21, 2026 at 9:06 PM Kaitao Cheng <kaitao.cheng@linux.dev> wrote:
>>
>> From: chengkaitao <chengkaitao@kylinos.cn>
>>
>> The list_for_each*_safe() helpers are used when the loop body may remove
>> the current entry.  Their current interface, however, forces every caller
>> to define a temporary cursor outside the macro and pass it in, even when
>> the caller never uses that cursor directly.  For most call sites this
>> extra cursor is just boilerplate required by the macro implementation.
>>
>> This is awkward because the saved next pointer is an internal detail of
>> the iteration.  Callers that only remove or move the current entry do not
>> need to spell it out.
>>
>> The _safe() suffix has also caused confusion.  Christian Koenig pointed
>> out that the name is easy to read as a thread-safe variant, especially
>> for beginners, even though it only means that the iterator keeps enough
>> state to tolerate removal of the current entry.  He suggested _mutable()
>> as a clearer description of what the loop permits.
>>
>> Add *_mutable() iterator variants for list, hlist and llist.  The new
>> helpers are variadic and support both forms.  In the common case, the
>> caller omits the temporary cursor and the macro creates a unique internal
>> cursor with typeof(pos) and __UNIQUE_ID().  If a loop really needs an
>> explicit temporary cursor, the caller can still pass it and the helper
>> keeps the existing *_safe() behaviour.
>>
>> For example, a call site may use the shorter form:
>>
>>   list_for_each_entry_mutable(pos, head, member)
>>
>> or keep the explicit temporary cursor form:
>>
>>   list_for_each_entry_mutable(pos, tmp, head, member)
>>
>> The existing *_safe() helpers remain available for compatibility.  This
>> series only converts users in mm, block, kernel, init and io_uring.  If
>> this approach looks acceptable, the remaining users can be converted in
>> follow-up series.
>>
>> Changes in v3 (Christian König, Andy Shevchenko):
>> - Convert safe list walks to mutable iterators
>>
>> Changes in v2 (Muchun Song, Andy Shevchenko):
>> - Drop the list_for_each_entry_mutable*() helpers from v1 and make the
>>   cursor change directly in the existing list_for_each_entry*() helpers.
>> - Open-code special list walks that rely on updating the loop cursor in
>>   the body, preserving their existing traversal semantics.
>>
>> Link to v2:
>> https://lore.kernel.org/all/20260609061347.93688-1-kaitao.cheng@linux.dev/
>>
>> Link to v1:
>> https://lore.kernel.org/all/20260529082149.76764-1-kaitao.cheng@linux.dev/
>>
>> Kaitao Cheng (7):
>>   list: Add mutable iterator variants
>>   llist: Add mutable iterator variants
>>   mm: Use mutable list iterators
>>   block: Use mutable list iterators
>>   kernel: Use mutable list iterators
>>   initramfs: Use mutable list iterator
>>   io_uring: Use mutable list iterators
>>
>>  block/bfq-iosched.c                 |  17 +-
>>  block/blk-cgroup.c                  |  12 +-
>>  block/blk-flush.c                   |   4 +-
>>  block/blk-iocost.c                  |  18 +-
>>  block/blk-mq.c                      |   8 +-
>>  block/blk-throttle.c                |   4 +-
>>  block/kyber-iosched.c               |   4 +-
>>  block/partitions/ldm.c              |   8 +-
>>  block/sed-opal.c                    |   4 +-
>>  include/linux/list.h                | 269 ++++++++++++++++++++++++----
>>  include/linux/llist.h               |  81 +++++++--
>>  init/initramfs.c                    |   5 +-
>>  io_uring/cancel.c                   |   6 +-
>>  io_uring/poll.c                     |   3 +-
>>  io_uring/rw.c                       |   4 +-
>>  io_uring/timeout.c                  |   8 +-
>>  io_uring/uring_cmd.c                |   3 +-
>>  kernel/audit_tree.c                 |   4 +-
>>  kernel/audit_watch.c                |  16 +-
>>  kernel/auditfilter.c                |   4 +-
>>  kernel/auditsc.c                    |   4 +-
>>  kernel/bpf/arena.c                  |  10 +-
>>  kernel/bpf/arraymap.c               |   8 +-
>>  kernel/bpf/bpf_local_storage.c      |   3 +-
>>  kernel/bpf/bpf_lru_list.c           |  25 ++-
>>  kernel/bpf/btf.c                    |  18 +-
>>  kernel/bpf/cgroup.c                 |   7 +-
>>  kernel/bpf/cpumap.c                 |   4 +-
>>  kernel/bpf/devmap.c                 |  10 +-
>>  kernel/bpf/helpers.c                |   8 +-
>>  kernel/bpf/local_storage.c          |   4 +-
>>  kernel/bpf/memalloc.c               |  16 +-
>>  kernel/bpf/offload.c                |   8 +-
>>  kernel/bpf/states.c                 |   4 +-
>>  kernel/bpf/stream.c                 |   4 +-
>>  kernel/bpf/verifier.c               |   6 +-
>>  kernel/cgroup/cgroup-v1.c           |   4 +-
>>  kernel/cgroup/cgroup.c              |  54 +++---
>>  kernel/cgroup/dmem.c                |  12 +-
>>  kernel/cgroup/rdma.c                |   8 +-
>>  kernel/events/core.c                |  44 +++--
>>  kernel/events/uprobes.c             |  12 +-
>>  kernel/exit.c                       |   8 +-
>>  kernel/fail_function.c              |   4 +-
>>  kernel/gcov/clang.c                 |   4 +-
>>  kernel/irq_work.c                   |   4 +-
>>  kernel/kexec_core.c                 |   4 +-
>>  kernel/kprobes.c                    |  16 +-
>>  kernel/livepatch/core.c             |   4 +-
>>  kernel/livepatch/core.h             |   4 +-
>>  kernel/liveupdate/kho_block.c       |   4 +-
>>  kernel/liveupdate/luo_flb.c         |   4 +-
>>  kernel/locking/rwsem.c              |   2 +-
>>  kernel/locking/test-ww_mutex.c      |   2 +-
>>  kernel/module/main.c                |  11 +-
>>  kernel/padata.c                     |   4 +-
>>  kernel/power/snapshot.c             |   8 +-
>>  kernel/power/wakelock.c             |   4 +-
>>  kernel/printk/printk.c              |  11 +-
>>  kernel/ptrace.c                     |   4 +-
>>  kernel/rcu/rcutorture.c             |   3 +-
>>  kernel/rcu/tasks.h                  |   9 +-
>>  kernel/rcu/tree.c                   |   6 +-
>>  kernel/resource.c                   |   4 +-
>>  kernel/sched/core.c                 |   4 +-
>>  kernel/sched/ext.c                  |  22 +--
>>  kernel/sched/fair.c                 |  28 +--
>>  kernel/sched/topology.c             |   4 +-
>>  kernel/sched/wait.c                 |   4 +-
>>  kernel/seccomp.c                    |   4 +-
>>  kernel/signal.c                     |  11 +-
>>  kernel/smp.c                        |   4 +-
>>  kernel/taskstats.c                  |   8 +-
>>  kernel/time/clockevents.c           |   6 +-
>>  kernel/time/clocksource.c           |   4 +-
>>  kernel/time/posix-cpu-timers.c      |   4 +-
>>  kernel/time/posix-timers.c          |   3 +-
>>  kernel/torture.c                    |   3 +-
>>  kernel/trace/bpf_trace.c            |   4 +-
>>  kernel/trace/ftrace.c               |  49 +++--
>>  kernel/trace/ring_buffer.c          |  25 ++-
>>  kernel/trace/trace.c                |  12 +-
>>  kernel/trace/trace_dynevent.c       |   6 +-
>>  kernel/trace/trace_dynevent.h       |   5 +-
>>  kernel/trace/trace_events.c         |  35 ++--
>>  kernel/trace/trace_events_filter.c  |   4 +-
>>  kernel/trace/trace_events_hist.c    |   8 +-
>>  kernel/trace/trace_events_trigger.c |  17 +-
>>  kernel/trace/trace_events_user.c    |  16 +-
>>  kernel/trace/trace_stat.c           |   4 +-
>>  kernel/user-return-notifier.c       |   3 +-
>>  kernel/workqueue.c                  |  16 +-
>>  mm/backing-dev.c                    |   8 +-
>>  mm/balloon.c                        |   8 +-
>>  mm/cma.c                            |   4 +-
>>  mm/compaction.c                     |   4 +-
>>  mm/damon/core.c                     |   4 +-
>>  mm/damon/sysfs-schemes.c            |   4 +-
>>  mm/dmapool.c                        |   4 +-
>>  mm/huge_memory.c                    |   8 +-
>>  mm/hugetlb.c                        |  56 +++---
>>  mm/hugetlb_vmemmap.c                |  16 +-
>>  mm/khugepaged.c                     |  14 +-
>>  mm/kmemleak.c                       |   7 +-
>>  mm/ksm.c                            |  25 +--
>>  mm/list_lru.c                       |   4 +-
>>  mm/memcontrol-v1.c                  |   8 +-
>>  mm/memory-failure.c                 |  12 +-
>>  mm/memory-tiers.c                   |   4 +-
>>  mm/migrate.c                        |  23 ++-
>>  mm/mmu_notifier.c                   |   9 +-
>>  mm/page_alloc.c                     |   8 +-
>>  mm/page_reporting.c                 |   2 +-
>>  mm/percpu.c                         |  11 +-
>>  mm/pgtable-generic.c                |   4 +-
>>  mm/rmap.c                           |  10 +-
>>  mm/shmem.c                          |   9 +-
>>  mm/slab_common.c                    |  14 +-
>>  mm/slub.c                           |  33 ++--
>>  mm/swapfile.c                       |   4 +-
>>  mm/userfaultfd.c                    |  12 +-
>>  mm/vmalloc.c                        |  24 +--
>>  mm/vmscan.c                         |   7 +-
>>  mm/zsmalloc.c                       |   4 +-
>>  124 files changed, 875 insertions(+), 681 deletions(-)
> 
> Not sure what you were thinking, but this diff stat
> is not landable.

Agreed. If we decide we want this, I guess we should target per-subsystem
conversions.

If this goes through the MM tree, I would even appreciate doing this on a per-MM
component granularity.

(unless we have some magic "Linus converts all of them" script, which I doubt we
will have)

Is there a way forward to replace list_for_each_*_safe entirely, possibly just
reusing the old name but simply dropping the parameter?

-- 
Cheers,

David

^ permalink raw reply

* [PATCH net 0/3] net/mlx5e: Report zero bandwidth for non-ETS traffic
From: Tariq Toukan @ 2026-06-22 11:29 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	netdev, Paolo Abeni
  Cc: Alexei Lazar, Carolina Jubran, Leon Romanovsky, linux-kernel,
	linux-rdma, Mark Bloch, Saeed Mahameed, Tariq Toukan,
	Gal Pressman

Hi,

The IEEE 802.1Qaz standard restricts bandwidth allocation percentages
to Enhanced Transmission Selection (ETS) traffic classes; STRICT,
VENDOR, and CB Shaper TSA types carry no bandwidth semantics.  Two
problems exist in the mlx5e DCBNL ETS implementation: the get path
reports 100% bandwidth for all TCs regardless of TSA type due to a
hardware limitation, and the set path neither rejects non-zero
bandwidth values for non-ETS TCs nor rejects the unsupported CB
Shaper TSA altogether.  The first issue was introduced by commit
820c2c5e773d ("net/mlx5e: Read ETS settings directly from firmware")
and the latter two by commit 08fb1dacdd76 ("net/mlx5e: Support DCBNL
IEEE ETS").

This series by Alexei Lazar fixes the get path to report zero
bandwidth for non-ETS traffic classes, adds validation to reject
non-zero bandwidth for non-ETS TCs on the set path, and rejects CB
Shaper TSA configurations that the driver does not support.

Regards,
Tariq

Alexei Lazar (3):
  net/mlx5e: Report zero bandwidth for non-ETS traffic classes
  net/mlx5e: Validate bandwidth for non-ETS traffic classes
  net/mlx5e: Reject unsupported CB Shaper TSA in ETS validation

 .../ethernet/mellanox/mlx5/core/en_dcbnl.c    | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)

base-commit: d07d80b6a129a44538cda1549b7acf95154fb197
-- 
2.44.0

^ permalink raw reply

* [PATCH net 1/3] net/mlx5e: Report zero bandwidth for non-ETS traffic classes
From: Tariq Toukan @ 2026-06-22 11:29 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	netdev, Paolo Abeni
  Cc: Alexei Lazar, Carolina Jubran, Leon Romanovsky, linux-kernel,
	linux-rdma, Mark Bloch, Saeed Mahameed, Tariq Toukan,
	Gal Pressman
In-Reply-To: <20260622112925.624795-1-tariqt@nvidia.com>

From: Alexei Lazar <alazar@nvidia.com>

The IEEE 802.1Qaz standard defines that bandwidth allocation percentages
only apply to Enhanced Transmission Selection (ETS) traffic classes.
For STRICT and VENDOR transmission selection algorithms, bandwidth
percentage values are not applicable.

Currently, due to a hardware limitation, the get operation reports 100%
bandwidth for all traffic classes, regardless of their TSA type, so
non-ETS traffic classes are reported with a meaningless bandwidth value.

Fix this by reporting 0 for non-ETS traffic classes.

Fixes: 820c2c5e773d ("net/mlx5e: Read ETS settings directly from firmware")
Signed-off-by: Alexei Lazar <alazar@nvidia.com>
Reviewed-by: Carolina Jubran <cjubran@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
index 4b86df6d5b9e..762f0a46c120 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
@@ -173,6 +173,13 @@ static int mlx5e_dcbnl_ieee_getets(struct net_device *netdev,
 	}
 	memcpy(ets->tc_tsa, priv->dcbx.tc_tsa, sizeof(ets->tc_tsa));
 
+	/* Report 0 for non ETS TSA */
+	for (i = 0; i < ets->ets_cap; i++) {
+		if (ets->tc_tx_bw[i] == MLX5E_MAX_BW_ALLOC &&
+		    priv->dcbx.tc_tsa[i] != IEEE_8021QAZ_TSA_ETS)
+			ets->tc_tx_bw[i] = 0;
+	}
+
 	return err;
 }
 
-- 
2.44.0


^ permalink raw reply related

* [PATCH net 2/3] net/mlx5e: Validate bandwidth for non-ETS traffic classes
From: Tariq Toukan @ 2026-06-22 11:29 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	netdev, Paolo Abeni
  Cc: Alexei Lazar, Carolina Jubran, Leon Romanovsky, linux-kernel,
	linux-rdma, Mark Bloch, Saeed Mahameed, Tariq Toukan,
	Gal Pressman
In-Reply-To: <20260622112925.624795-1-tariqt@nvidia.com>

From: Alexei Lazar <alazar@nvidia.com>

The IEEE 802.1Qaz standard defines that bandwidth allocation percentages
only apply to ETS traffic classes.

Reject ETS configurations that specify non-zero bandwidth for non-ETS
traffic classes.

Fixes: 08fb1dacdd76 ("net/mlx5e: Support DCBNL IEEE ETS")
Signed-off-by: Alexei Lazar <alazar@nvidia.com>
Reviewed-by: Carolina Jubran <cjubran@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
index 762f0a46c120..e4161603cdc0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
@@ -324,6 +324,17 @@ static int mlx5e_dbcnl_validate_ets(struct net_device *netdev,
 		}
 	}
 
+	/* Validate Non ETS BW */
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		if (ets->tc_tsa[i] != IEEE_8021QAZ_TSA_ETS &&
+		    ets->tc_tx_bw[i]) {
+			netdev_err(netdev,
+				   "Failed to validate ETS: tc=%d BW is not 0 for non-ETS TC (tsa=%u, bw=%u)\n",
+				   i, ets->tc_tsa[i], ets->tc_tx_bw[i]);
+			return -EINVAL;
+		}
+	}
+
 	/* Validate Bandwidth Sum */
 	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
 		if (ets->tc_tsa[i] == IEEE_8021QAZ_TSA_ETS) {
-- 
2.44.0


^ permalink raw reply related

* [PATCH net 3/3] net/mlx5e: Reject unsupported CB Shaper TSA in ETS validation
From: Tariq Toukan @ 2026-06-22 11:29 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	netdev, Paolo Abeni
  Cc: Alexei Lazar, Carolina Jubran, Leon Romanovsky, linux-kernel,
	linux-rdma, Mark Bloch, Saeed Mahameed, Tariq Toukan,
	Gal Pressman
In-Reply-To: <20260622112925.624795-1-tariqt@nvidia.com>

From: Alexei Lazar <alazar@nvidia.com>

The Credit-Based Shaper (CB Shaper) TSA is not supported by the mlx5
driver, so reject any configuration that specifies it.

Fixes: 08fb1dacdd76 ("net/mlx5e: Support DCBNL IEEE ETS")
Signed-off-by: Alexei Lazar <alazar@nvidia.com>
Reviewed-by: Carolina Jubran <cjubran@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
index e4161603cdc0..602b982b1bbf 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
@@ -326,6 +326,12 @@ static int mlx5e_dbcnl_validate_ets(struct net_device *netdev,
 
 	/* Validate Non ETS BW */
 	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		if (ets->tc_tsa[i] == IEEE_8021QAZ_TSA_CB_SHAPER) {
+			netdev_err(netdev,
+				   "Failed to validate ETS: CB Shaper is not supported\n");
+			return -EOPNOTSUPP;
+		}
+
 		if (ets->tc_tsa[i] != IEEE_8021QAZ_TSA_ETS &&
 		    ets->tc_tx_bw[i]) {
 			netdev_err(netdev,
-- 
2.44.0


^ permalink raw reply related

* [syzbot] [net?] INFO: task hung in netdev_rx_mode_work
From: syzbot @ 2026-06-22 11:32 UTC (permalink / raw)
  To: davem, edumazet, horms, kuba, linux-kernel, netdev, pabeni,
	syzkaller-bugs

Hello,

syzbot found the following issue on:

HEAD commit:    390d73adf896 Merge tag 'for-v7.2' of git://git.kernel.org/..
git tree:       upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=12fad766580000
kernel config:  https://syzkaller.appspot.com/x/.config?x=bef29eb7c0231032
dashboard link: https://syzkaller.appspot.com/bug?extid=cb67c392b0b8f0fd0fc1
compiler:       Debian clang version 22.1.6 (++20260514074242+fc4aad7b5db3-1~exp1~20260514074407.73), Debian LLD 22.1.6

Unfortunately, I don't have any reproducer for this issue yet.

Downloadable assets:
disk image: https://storage.googleapis.com/syzbot-assets/5511a10e958e/disk-390d73ad.raw.xz
vmlinux: https://storage.googleapis.com/syzbot-assets/bbdc93bd1239/vmlinux-390d73ad.xz
kernel image: https://storage.googleapis.com/syzbot-assets/7d72a4665dd8/bzImage-390d73ad.xz

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+cb67c392b0b8f0fd0fc1@syzkaller.appspotmail.com

INFO: task kworker/0:5:5689 blocked for more than 143 seconds.
      Tainted: G             L      syzkaller #0
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:kworker/0:5     state:D stack:21448 pid:5689  tgid:5689  ppid:2      task_flags:0x4208060 flags:0x00080000
Workqueue: events netdev_rx_mode_work
Call Trace:
 <TASK>
 context_switch kernel/sched/core.c:5504 [inline]
 __schedule+0x17d9/0x56c0 kernel/sched/core.c:7228
 __schedule_loop kernel/sched/core.c:7307 [inline]
 schedule+0x164/0x360 kernel/sched/core.c:7322
 schedule_preempt_disabled+0x13/0x30 kernel/sched/core.c:7379
 __mutex_lock_common kernel/locking/mutex.c:726 [inline]
 __mutex_lock+0x7bf/0x1550 kernel/locking/mutex.c:821
 netdev_rx_mode_work+0x18/0x3c0 net/core/dev_addr_lists.c:1346
 process_one_work kernel/workqueue.c:3322 [inline]
 process_scheduled_works+0xa8e/0x14e0 kernel/workqueue.c:3405
 worker_thread+0xa47/0xfb0 kernel/workqueue.c:3486
 kthread+0x388/0x470 kernel/kthread.c:436
 ret_from_fork+0x514/0xb70 arch/x86/kernel/process.c:158
 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245
 </TASK>
INFO: task kworker/u8:45:10240 blocked for more than 143 seconds.
      Tainted: G             L      syzkaller #0
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:kworker/u8:45   state:D stack:22648 pid:10240 tgid:10240 ppid:2      task_flags:0x4208060 flags:0x00080000
Workqueue: events_unbound linkwatch_event
Call Trace:
 <TASK>
 context_switch kernel/sched/core.c:5504 [inline]
 __schedule+0x17d9/0x56c0 kernel/sched/core.c:7228
 __schedule_loop kernel/sched/core.c:7307 [inline]
 schedule+0x164/0x360 kernel/sched/core.c:7322
 schedule_preempt_disabled+0x13/0x30 kernel/sched/core.c:7379
 __mutex_lock_common kernel/locking/mutex.c:726 [inline]
 __mutex_lock+0x7bf/0x1550 kernel/locking/mutex.c:821
 linkwatch_event+0xe/0x60 net/core/link_watch.c:313
 process_one_work kernel/workqueue.c:3322 [inline]
 process_scheduled_works+0xa8e/0x14e0 kernel/workqueue.c:3405
 worker_thread+0xa47/0xfb0 kernel/workqueue.c:3486
 kthread+0x388/0x470 kernel/kthread.c:436
 ret_from_fork+0x514/0xb70 arch/x86/kernel/process.c:158
 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245
 </TASK>
INFO: task syz.2.4918:21109 blocked for more than 143 seconds.
      Tainted: G             L      syzkaller #0
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:syz.2.4918      state:D stack:22328 pid:21109 tgid:21108 ppid:14919  task_flags:0x400140 flags:0x00080002
Call Trace:
 <TASK>
 context_switch kernel/sched/core.c:5504 [inline]
 __schedule+0x17d9/0x56c0 kernel/sched/core.c:7228
 __schedule_loop kernel/sched/core.c:7307 [inline]
 schedule+0x164/0x360 kernel/sched/core.c:7322
 schedule_preempt_disabled+0x13/0x30 kernel/sched/core.c:7379
 __mutex_lock_common kernel/locking/mutex.c:726 [inline]
 __mutex_lock+0x7bf/0x1550 kernel/locking/mutex.c:821
 netdev_lock include/linux/netdevice.h:2837 [inline]
 netdev_lock_ops include/net/netdev_lock.h:42 [inline]
 __ethtool_get_link_ksettings+0x109/0x250 net/ethtool/ioctl.c:463
 bond_update_speed_duplex drivers/net/bonding/bond_main.c:801 [inline]
 bond_slave_netdev_event drivers/net/bonding/bond_main.c:3982 [inline]
 bond_netdev_event+0x643/0xf80 drivers/net/bonding/bond_main.c:4089
 notifier_call_chain+0x1a5/0x3d0 kernel/notifier.c:85
 call_netdevice_notifiers_extack net/core/dev.c:2288 [inline]
 call_netdevice_notifiers net/core/dev.c:2302 [inline]
 __dev_notify_flags+0x1aa/0x310 net/core/dev.c:9793
 netif_change_flags+0xde/0x1b0 net/core/dev.c:9822
 dev_change_flags+0x128/0x260 net/core/dev_api.c:68
 vlan_device_event+0x1b4e/0x1f00 net/8021q/vlan.c:494
 notifier_call_chain+0x1a5/0x3d0 kernel/notifier.c:85
 call_netdevice_notifiers_extack net/core/dev.c:2288 [inline]
 call_netdevice_notifiers net/core/dev.c:2302 [inline]
 __dev_notify_flags+0x1aa/0x310 net/core/dev.c:9793
 netif_change_flags+0xde/0x1b0 net/core/dev.c:9822
 do_setlink+0xdd6/0x4670 net/core/rtnetlink.c:3207
 rtnl_changelink net/core/rtnetlink.c:3841 [inline]
 __rtnl_newlink net/core/rtnetlink.c:4014 [inline]
 rtnl_newlink+0x15c2/0x1bd0 net/core/rtnetlink.c:4151
 rtnetlink_rcv_msg+0x802/0xc00 net/core/rtnetlink.c:7068
 netlink_rcv_skb+0x226/0x4a0 net/netlink/af_netlink.c:2556
 netlink_unicast_kernel net/netlink/af_netlink.c:1319 [inline]
 netlink_unicast+0x7bb/0x940 net/netlink/af_netlink.c:1345
 netlink_sendmsg+0x813/0xb40 net/netlink/af_netlink.c:1900
 sock_sendmsg_nosec+0x13a/0x180 net/socket.c:775
 __sock_sendmsg net/socket.c:790 [inline]
 ____sys_sendmsg+0x54e/0x850 net/socket.c:2684
 ___sys_sendmsg+0x2a5/0x360 net/socket.c:2738
 __sys_sendmsg net/socket.c:2770 [inline]
 __do_sys_sendmsg net/socket.c:2775 [inline]
 __se_sys_sendmsg net/socket.c:2773 [inline]
 __x64_sys_sendmsg+0x1b1/0x290 net/socket.c:2773
 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
 do_syscall_64+0x174/0x580 arch/x86/entry/syscall_64.c:94
 entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7fdcc019ce59
RSP: 002b:00007fdcc1070028 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
RAX: ffffffffffffffda RBX: 00007fdcc0415fa0 RCX: 00007fdcc019ce59
RDX: 0000000000000000 RSI: 0000200000000340 RDI: 0000000000000004
RBP: 00007fdcc0232e6f R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 00007fdcc0416038 R14: 00007fdcc0415fa0 R15: 00007fdcc053fa48
 </TASK>
INFO: task syz.3.4923:21119 blocked for more than 144 seconds.
      Tainted: G             L      syzkaller #0
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:syz.3.4923      state:D stack:25976 pid:21119 tgid:21119 ppid:14189  task_flags:0x400040 flags:0x00080002
Call Trace:
 <TASK>
 context_switch kernel/sched/core.c:5504 [inline]
 __schedule+0x17d9/0x56c0 kernel/sched/core.c:7228
 __schedule_loop kernel/sched/core.c:7307 [inline]
 schedule+0x164/0x360 kernel/sched/core.c:7322
 schedule_preempt_disabled+0x13/0x30 kernel/sched/core.c:7379
 __mutex_lock_common kernel/locking/mutex.c:726 [inline]
 __mutex_lock+0x7bf/0x1550 kernel/locking/mutex.c:821
 tun_detach drivers/net/tun.c:641 [inline]
 tun_chr_close+0x3e/0x1c0 drivers/net/tun.c:3506
 __fput+0x418/0xa50 fs/file_table.c:512
 task_work_run+0x1d9/0x270 kernel/task_work.c:233
 resume_user_mode_work include/linux/resume_user_mode.h:50 [inline]
 __exit_to_user_mode_loop kernel/entry/common.c:70 [inline]
 exit_to_user_mode_loop+0x1fa/0x730 kernel/entry/common.c:101
 __exit_to_user_mode_prepare include/linux/irq-entry-common.h:207 [inline]
 syscall_exit_to_user_mode_prepare include/linux/irq-entry-common.h:230 [inline]
 syscall_exit_to_user_mode include/linux/entry-common.h:318 [inline]
 do_syscall_64+0x353/0x580 arch/x86/entry/syscall_64.c:100
 entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7fc966b9ce59
RSP: 002b:00007fc966f3fba8 EFLAGS: 00000246 ORIG_RAX: 00000000000001b4
RAX: 0000000000000000 RBX: 00007fc966e17da0 RCX: 00007fc966b9ce59
RDX: 0000000000000000 RSI: 000000000000001e RDI: 0000000000000003
RBP: 00007fc966e17da0 R08: 0000000000000006 R09: 0000000000000000
R10: 00007fc966e17cb0 R11: 0000000000000246 R12: 00000000000b3ba4
R13: 00007fc966e1609c R14: 00000000000b38d2 R15: 00007fc966e16090
 </TASK>
INFO: task syz.3.4923:21121 blocked for more than 144 seconds.
      Tainted: G             L      syzkaller #0
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:syz.3.4923      state:D stack:28792 pid:21121 tgid:21119 ppid:14189  task_flags:0x400140 flags:0x00080002
Call Trace:
 <TASK>
 context_switch kernel/sched/core.c:5504 [inline]
 __schedule+0x17d9/0x56c0 kernel/sched/core.c:7228
 __schedule_loop kernel/sched/core.c:7307 [inline]
 schedule+0x164/0x360 kernel/sched/core.c:7322
 schedule_preempt_disabled+0x13/0x30 kernel/sched/core.c:7379
 __mutex_lock_common kernel/locking/mutex.c:726 [inline]
 __mutex_lock+0x7bf/0x1550 kernel/locking/mutex.c:821
 rtnl_net_lock include/linux/rtnetlink.h:130 [inline]
 devinet_ioctl+0x2fb/0x1c50 net/ipv4/devinet.c:1120
 inet_ioctl+0x42a/0x560 net/ipv4/af_inet.c:1009
 sock_do_ioctl+0x101/0x320 net/socket.c:1300
 sock_ioctl+0x57a/0x7e0 net/socket.c:1421
 vfs_ioctl fs/ioctl.c:51 [inline]
 __do_sys_ioctl fs/ioctl.c:597 [inline]
 __se_sys_ioctl+0xfc/0x170 fs/ioctl.c:583
 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
 do_syscall_64+0x174/0x580 arch/x86/entry/syscall_64.c:94
 entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7fc966b9ce59
RSP: 002b:00007fc9679b8028 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
RAX: ffffffffffffffda RBX: 00007fc966e15fa0 RCX: 00007fc966b9ce59
RDX: 0000200000000040 RSI: 0000000000008914 RDI: 0000000000000004
RBP: 00007fc966c32e6f R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 00007fc966e16038 R14: 00007fc966e15fa0 R15: 00007fc966f3fa48
 </TASK>
INFO: task syz.5.4925:21126 blocked for more than 145 seconds.
      Tainted: G             L      syzkaller #0
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:syz.5.4925      state:D stack:26936 pid:21126 tgid:21125 ppid:20740  task_flags:0x400140 flags:0x00080002
Call Trace:
 <TASK>
 context_switch kernel/sched/core.c:5504 [inline]
 __schedule+0x17d9/0x56c0 kernel/sched/core.c:7228
 __schedule_loop kernel/sched/core.c:7307 [inline]
 schedule+0x164/0x360 kernel/sched/core.c:7322
 schedule_preempt_disabled+0x13/0x30 kernel/sched/core.c:7379
 __mutex_lock_common kernel/locking/mutex.c:726 [inline]
 __mutex_lock+0x7bf/0x1550 kernel/locking/mutex.c:821
 rtnl_lock net/core/rtnetlink.c:80 [inline]
 rtnl_nets_lock net/core/rtnetlink.c:341 [inline]
 rtnl_newlink+0x8a3/0x1bd0 net/core/rtnetlink.c:4150
 rtnetlink_rcv_msg+0x802/0xc00 net/core/rtnetlink.c:7068
 netlink_rcv_skb+0x226/0x4a0 net/netlink/af_netlink.c:2556
 netlink_unicast_kernel net/netlink/af_netlink.c:1319 [inline]
 netlink_unicast+0x7bb/0x940 net/netlink/af_netlink.c:1345
 netlink_sendmsg+0x813/0xb40 net/netlink/af_netlink.c:1900
 sock_sendmsg_nosec+0x13a/0x180 net/socket.c:775
 __sock_sendmsg net/socket.c:790 [inline]
 ____sys_sendmsg+0x54e/0x850 net/socket.c:2684
 ___sys_sendmsg+0x2a5/0x360 net/socket.c:2738
 __sys_sendmsg net/socket.c:2770 [inline]
 __do_sys_sendmsg net/socket.c:2775 [inline]
 __se_sys_sendmsg net/socket.c:2773 [inline]
 __x64_sys_sendmsg+0x1b1/0x290 net/socket.c:2773
 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
 do_syscall_64+0x174/0x580 arch/x86/entry/syscall_64.c:94
 entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7f1b77d9ce59
RSP: 002b:00007f1b78c2d028 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
RAX: ffffffffffffffda RBX: 00007f1b78015fa0 RCX: 00007f1b77d9ce59
RDX: 0000000000000000 RSI: 0000200000000040 RDI: 0000000000000008
RBP: 00007f1b77e32e6f R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 00007f1b78016038 R14: 00007f1b78015fa0 R15: 00007f1b7813fa48
 </TASK>
INFO: task syz.1.4926:21132 blocked for more than 145 seconds.
      Tainted: G             L      syzkaller #0
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:syz.1.4926      state:D stack:25448 pid:21132 tgid:21129 ppid:14306  task_flags:0x400140 flags:0x00080002
Call Trace:
 <TASK>
 context_switch kernel/sched/core.c:5504 [inline]
 __schedule+0x17d9/0x56c0 kernel/sched/core.c:7228
 __schedule_loop kernel/sched/core.c:7307 [inline]
 schedule+0x164/0x360 kernel/sched/core.c:7322
 schedule_preempt_disabled+0x13/0x30 kernel/sched/core.c:7379
 __mutex_lock_common kernel/locking/mutex.c:726 [inline]
 __mutex_lock+0x7bf/0x1550 kernel/locking/mutex.c:821
 rtnl_net_lock include/linux/rtnetlink.h:130 [inline]
 dev_ioctl+0x7a4/0x1150 net/core/dev_ioctl.c:815
 sock_do_ioctl+0x23e/0x320 net/socket.c:1314
 sock_ioctl+0x57a/0x7e0 net/socket.c:1421
 vfs_ioctl fs/ioctl.c:51 [inline]
 __do_sys_ioctl fs/ioctl.c:597 [inline]
 __se_sys_ioctl+0xfc/0x170 fs/ioctl.c:583
 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
 do_syscall_64+0x174/0x580 arch/x86/entry/syscall_64.c:94
 entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7f41c799ce59
RSP: 002b:00007f41c5bb4028 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
RAX: ffffffffffffffda RBX: 00007f41c7c16180 RCX: 00007f41c799ce59
RDX: 0000200000000080 RSI: 0000000000008993 RDI: 000000000000000b
RBP: 00007f41c7a32e6f R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 00007f41c7c16218 R14: 00007f41c7c16180 R15: 00007f41c7d3fa48
 </TASK>
INFO: lockdep is turned off.
NMI backtrace for cpu 1
CPU: 1 UID: 0 PID: 30 Comm: khungtaskd Tainted: G             L      syzkaller #0 PREEMPT(full) 
Tainted: [L]=SOFTLOCKUP
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/09/2026
Call Trace:
 <TASK>
 dump_stack_lvl+0xe8/0x150 lib/dump_stack.c:120
 nmi_cpu_backtrace+0x274/0x2d0 lib/nmi_backtrace.c:113
 nmi_trigger_cpumask_backtrace+0x17a/0x300 lib/nmi_backtrace.c:62
 trigger_all_cpu_backtrace include/linux/nmi.h:162 [inline]
 __sys_info lib/sys_info.c:157 [inline]
 sys_info+0x135/0x170 lib/sys_info.c:165
 check_hung_uninterruptible_tasks kernel/hung_task.c:353 [inline]
 watchdog+0xfd7/0x1030 kernel/hung_task.c:561
 kthread+0x388/0x470 kernel/kthread.c:436
 ret_from_fork+0x514/0xb70 arch/x86/kernel/process.c:158
 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245
 </TASK>
Sending NMI from CPU 1 to CPUs 0:
NMI backtrace for cpu 0
CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Tainted: G             L      syzkaller #0 PREEMPT(full) 
Tainted: [L]=SOFTLOCKUP
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/09/2026
RIP: 0010:pv_native_safe_halt+0xf/0x20 arch/x86/kernel/paravirt.c:64
Code: bc 7a 02 e9 13 05 03 00 cc cc cc 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa 66 90 0f 00 2d 33 28 1b 00 fb f4 <c3> cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc 90 90 90 90 90
RSP: 0018:ffffffff8e607de0 EFLAGS: 00000246
RAX: ffff888125253000 RBX: ffffffff819a72e0 RCX: 0000000080000001
RDX: 0000000000000001 RSI: ffffffff8c295400 RDI: ffffffff819a72e0
RBP: ffffffff8e607eb8 R08: ffff8880b86338db R09: 1ffff110170c671b
R10: dffffc0000000000 R11: ffffed10170c671c R12: 0000000000000000
R13: 1ffffffff1cd1de8 R14: 1ffffffff1cc0fc4 R15: dffffc0000000000
FS:  0000000000000000(0000) GS:ffff888125253000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000563c978b0ee8 CR3: 000000000e746000 CR4: 00000000003526f0
DR0: ffffffffffffffff DR1: 8000000000000000 DR2: 00000000000003ff
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Call Trace:
 <TASK>
 arch_safe_halt arch/x86/kernel/process.c:766 [inline]
 default_idle+0x9/0x20 arch/x86/kernel/process.c:767
 default_idle_call+0x72/0xb0 kernel/sched/idle.c:122
 cpuidle_idle_call kernel/sched/idle.c:199 [inline]
 do_idle+0x2e0/0x540 kernel/sched/idle.c:355
 cpu_startup_entry+0x43/0x60 kernel/sched/idle.c:454
 rest_init+0x2de/0x300 init/main.c:717
 start_kernel+0x38a/0x3e0 init/main.c:1175
 x86_64_start_reservations+0x24/0x30 arch/x86/kernel/head64.c:310
 x86_64_start_kernel+0x137/0x1b0 arch/x86/kernel/head64.c:291
 common_startup_64+0x13e/0x157
 </TASK>


---
This report is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this issue. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.

If the report is already addressed, let syzbot know by replying with:
#syz fix: exact-commit-title

If you want to overwrite report's subsystems, reply with:
#syz set subsystems: new-subsystem
(See the list of subsystem names on the web dashboard)

If the report is a duplicate of another one, reply with:
#syz dup: exact-subject-of-another-report

If you want to undo deduplication, reply with:
#syz undup

^ permalink raw reply

* Re: [PATCH net v2] octeontx2-af: npc: cn20k: Fix subbank free list indexing for search order
From: Simon Horman @ 2026-06-22 11:34 UTC (permalink / raw)
  To: Ratheesh Kannoth
  Cc: kuba, linux-kernel, netdev, andrew+netdev, davem, edumazet,
	pabeni, sgoutham
In-Reply-To: <20260619095100.1864440-1-rkannoth@marvell.com>

On Fri, Jun 19, 2026 at 03:21:00PM +0530, Ratheesh Kannoth wrote:
> subbank_srch_order[i] is the physical subbank at search-order slot i,
> so each subbank's arr_idx must be i (its slot), not
> subbank_srch_order[sb->idx].  The old logic mis-keyed xa_sb_free
> and broke allocation traversal order.
> 
> Populate arr_idx and xa_sb_free in a single pass over the search
> order after subbank structs are initialized.
> 
> Fixes: 7ac9d4c4075c ("octeontx2-af: npc: cn20k: add subbank search order control")
> Signed-off-by: Ratheesh Kannoth <rkannoth@marvell.com>
> 
> ---
> v1 -> v2: Addressed Simon's comments
> 	https://lore.kernel.org/netdev/20260619091341.918165-1-horms@kernel.org/

Thanks for the update.

Reviewed-by: Simon Horman <horms@kernel.org>

FTR, I think the issue flagged in the AI-generated review of this
patch on sashiko.org can be treated in the context of a possible follow-up.
I don't think it should impede the progress of this patch.


^ permalink raw reply

* [PATCH iwl-net v2 0/2] ice: fix DFLT Rx rule handling for promisc and switchdev
From: Petr Oros @ 2026-06-22 11:34 UTC (permalink / raw)
  To: netdev; +Cc: Petr Oros

Two fixes for the uplink default VSI Rx rule (DFLT) on E810 when the
netdev is in IFF_PROMISC.

Patch 1 drops the redundant per-VLAN promisc expansion that exhausts
the FLU pool on a wide VLAN trunk across several PFs.

Patch 2 keeps the DFLT Rx rule across a switchdev teardown instead of
clobbering the promisc state the operator asked for.

Changes since v1:
- Patch 2: reworked to avoid the service task entirely. v1 scheduled a
  filter sync in ice_eswitch_disable_switchdev(); that work could run
  after ice_remove() freed the uplink VSI (use-after-free) and was not
  guaranteed to fire if ice_set_rx_mode() never ran again. v2 keeps or
  drops the DFLT Rx rule synchronously in ice_eswitch_release_env() by
  testing the live netdev->flags IFF_PROMISC, the same value
  ice_cfg_vlan_pruning() already keys on. No service task is scheduled
  and no symbol is exported. Dropped Aleksandr's Reviewed-by since the
  fix mechanism changed.
- Patch 1: no functional changes, collected Aleksandr's Reviewed-by.

Link to v1:
https://lore.kernel.org/all/cover.1781786935.git.poros@redhat.com/

Petr Oros (2):
  ice: skip per-VLAN promisc rules when default VSI Rx rule is set
  ice: preserve uplink DFLT Rx rule on switchdev release

 drivers/net/ethernet/intel/ice/ice_eswitch.c | 18 +++-
 drivers/net/ethernet/intel/ice/ice_main.c    | 90 +++++++++++++++-----
 2 files changed, 84 insertions(+), 24 deletions(-)

-- 
2.53.0


^ permalink raw reply

* [PATCH iwl-net v2 2/2] ice: preserve uplink DFLT Rx rule on switchdev release
From: Petr Oros @ 2026-06-22 11:34 UTC (permalink / raw)
  To: netdev; +Cc: Petr Oros
In-Reply-To: <20260622113428.2565255-1-poros@redhat.com>

ice_eswitch_setup_env() calls ice_set_dflt_vsi() to install the
ICE_SW_LKUP_DFLT Rx rule on the uplink VSI. The helper returns 0 even
when the rule is already in place, so the call is a no-op if
ice_vsi_sync_fltr() had previously installed the DFLT rule in response
to IFF_PROMISC on the uplink netdev. ice_remove_vsi_fltr() called
earlier in ice_eswitch_setup_env() does not affect this rule because
ice_remove_vsi_lkup_fltr() lacks a case for ICE_SW_LKUP_DFLT and falls
into its default branch which only logs. Switchdev mode then adds an
ICE_FLTR_TX leg via ice_cfg_dflt_vsi() on the same VSI handle.

ice_eswitch_release_env() unconditionally removed both the Rx and Tx
DFLT rules. When the Rx DFLT was installed by ice_vsi_sync_fltr()
before the switchdev session started, this clobbered promisc state the
operator had asked for: the DFLT Rx rule disappeared while IFF_PROMISC
was still set on the netdev, and the IFF_PROMISC sync path was not
retriggered, so the uplink ended the session without the catch-all
rule the netdev flags requested.

Skip the Rx DFLT removal when the uplink is promiscuous, both in
ice_eswitch_release_env() and in the err_def_tx unwind of
ice_eswitch_setup_env(). The Tx leg installed by switchdev is always
removed since switchdev owns it.

Test the live netdev->flags for this decision. The ena_rx_filtering()
call right above in ice_eswitch_release_env() reaches
ice_cfg_vlan_pruning(), which already keys on the live netdev->flags
IFF_PROMISC bit, so reusing the same value keeps the preserved DFLT
rule and the VLAN pruning state mutually consistent across every
promisc transition, including one the operator made while switchdev
ran: ice_set_rx_mode() is gated off for the uplink during the session,
so such a change never reaches the filter sync, but it is reflected in
netdev->flags and is therefore honored here on release.

Fixes: 1a1c40df2e80 ("ice: set and release switchdev environment")
Signed-off-by: Petr Oros <poros@redhat.com>
---
v2:
- Reworked the fix to avoid the service task entirely. v1 scheduled a
  filter sync in ice_eswitch_disable_switchdev() to reconcile the uplink
  DFLT Rx rule; that work could run after ice_remove() freed the uplink
  VSI (use-after-free) and was not guaranteed to fire if ice_set_rx_mode()
  never ran again. v2 keeps or drops the DFLT Rx rule synchronously in
  ice_eswitch_release_env() (and the setup_env error unwind) by testing
  the live netdev->flags IFF_PROMISC, the same value ice_cfg_vlan_pruning()
  already keys on, so the preserved rule and the pruning state stay
  consistent. No service task is scheduled and no symbol is exported.
- Dropped the Reviewed-by since the fix mechanism changed.

v1: https://lore.kernel.org/all/deef5756e534ef06c12d910c5305d3fd205d30a0.1781786935.git.poros@redhat.com/
---
 drivers/net/ethernet/intel/ice/ice_eswitch.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_eswitch.c b/drivers/net/ethernet/intel/ice/ice_eswitch.c
index 2e4f0969035f77..48273ef9f69dc8 100644
--- a/drivers/net/ethernet/intel/ice/ice_eswitch.c
+++ b/drivers/net/ethernet/intel/ice/ice_eswitch.c
@@ -66,8 +66,10 @@ static int ice_eswitch_setup_env(struct ice_pf *pf)
 	ice_cfg_dflt_vsi(uplink_vsi->port_info, uplink_vsi->idx, false,
 			 ICE_FLTR_TX);
 err_def_tx:
-	ice_cfg_dflt_vsi(uplink_vsi->port_info, uplink_vsi->idx, false,
-			 ICE_FLTR_RX);
+	/* keep the Rx DFLT rule if the uplink is promiscuous (see release_env) */
+	if (!(uplink_vsi->netdev->flags & IFF_PROMISC))
+		ice_cfg_dflt_vsi(uplink_vsi->port_info, uplink_vsi->idx,
+				 false, ICE_FLTR_RX);
 err_def_rx:
 	ice_vsi_del_vlan_zero(uplink_vsi);
 err_vlan_zero:
@@ -278,8 +280,16 @@ static void ice_eswitch_release_env(struct ice_pf *pf)
 	vlan_ops->ena_rx_filtering(uplink_vsi);
 	ice_cfg_dflt_vsi(uplink_vsi->port_info, uplink_vsi->idx, false,
 			 ICE_FLTR_TX);
-	ice_cfg_dflt_vsi(uplink_vsi->port_info, uplink_vsi->idx, false,
-			 ICE_FLTR_RX);
+
+	/* Keep the Rx DFLT rule if the uplink is promiscuous; it must outlive
+	 * the session. Test the live netdev->flags, the same value
+	 * ena_rx_filtering() -> ice_cfg_vlan_pruning() above keys its decision
+	 * on, so the preserved DFLT rule and the pruning state stay consistent.
+	 */
+	if (!(uplink_vsi->netdev->flags & IFF_PROMISC))
+		ice_cfg_dflt_vsi(uplink_vsi->port_info, uplink_vsi->idx,
+				 false, ICE_FLTR_RX);
+
 	ice_fltr_add_mac_and_broadcast(uplink_vsi,
 				       uplink_vsi->port_info->mac.perm_addr,
 				       ICE_FWD_TO_VSI);
-- 
2.53.0

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox