Netdev List

Netdev List
 help / color / mirror / Atom feed

* [RFC PATCH v3 4/4] [DO NOT MERGE] ptp: ptp_vmclock: Add simulated 1PPS support
From: David Woodhouse @ 2026-06-22 20:36 UTC (permalink / raw)
  To: Rodolfo Giometti, David Woodhouse, Richard Cochran, Andrew Lunn,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	John Stultz, Thomas Gleixner, Stephen Boyd, Miroslav Lichvar,
	linux-kernel, netdev, Alexander Gordeev
  Cc: David Woodhouse
In-Reply-To: <20260622211822.1056437-1-dwmw2@infradead.org>

From: David Woodhouse <dwmw@amazon.co.uk>

Before the timekeeping_set_reference() work, the simplest way to
synchronise the kernel against vmclock was to simulate a 1PPS signal.
Restore that hack here, for testing CONFIG_NTP_PPS in NOHZ mode.

Set up an hrtimer to fire at each vmclock second boundary, and teach
vmclock_get_crosststamp() to return the cycle counter and the
corresponding { systime, monoraw } at the *start* of the current second,
because hardpps() expects the timestamps it is given for phase and
frequency adjustment to be the kernel's clock readings at the moment the
true time is at the top of a second (i.e. when the pulse arrives).

The timer feeds a PTP_CLOCK_PPSUSR event; with PTP_ENABLE_PPS, the PPS
source bound to the in-kernel hardpps() consumer and STA_PPSTIME/PPSFREQ
set, the kernel disciplines CLOCK_REALTIME directly from vmclock. The
second-boundary cycle is recovered from get_device_system_crosststamp()
using a history snapshot for interpolation.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Assisted-by: Kiro:claude-opus-4.8
---
 drivers/ptp/ptp_vmclock.c | 196 +++++++++++++++++++++++++++++++++++---
 1 file changed, 185 insertions(+), 11 deletions(-)

diff --git a/drivers/ptp/ptp_vmclock.c b/drivers/ptp/ptp_vmclock.c
index eebdcd5ebc08..1f56c29b3d6b 100644
--- a/drivers/ptp/ptp_vmclock.c
+++ b/drivers/ptp/ptp_vmclock.c
@@ -13,6 +13,7 @@
 #include <linux/err.h>
 #include <linux/file.h>
 #include <linux/fs.h>
+#include <linux/hrtimer.h>
 #include <linux/init.h>
 #include <linux/io.h>
 #include <linux/interrupt.h>
@@ -51,6 +52,10 @@ struct vmclock_state {
 	enum clocksource_ids cs_id, sys_cs_id;
 	int index;
 	char *name;
+	struct hrtimer pps_timer;
+	bool pps_enabled;
+	struct system_time_snapshot history_snap;
+	bool history_valid;
 };
 
 #define VMCLOCK_MAX_WAIT ms_to_ktime(100)
@@ -98,10 +103,13 @@ static bool tai_adjust(struct vmclock_abi *clk, uint64_t *sec)
 static int vmclock_get_crosststamp(struct vmclock_state *st,
 				   struct ptp_system_timestamp *sts,
 				   struct system_counterval_t *system_counter,
-				   struct timespec64 *tspec)
+				   struct timespec64 *tspec,
+				   bool on_second)
 {
 	ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT);
 	uint64_t cycle, delta, seq, frac_sec;
+	uint64_t period_frac_sec;
+	uint8_t period_shift;
 
 #ifdef CONFIG_X86
 	/*
@@ -154,11 +162,46 @@ static int vmclock_get_crosststamp(struct vmclock_state *st,
 
 		delta = cycle - le64_to_cpu(st->clk->counter_value);
 
+		period_frac_sec = le64_to_cpu(st->clk->counter_period_frac_sec);
+		period_shift = st->clk->counter_period_shift;
+
 		frac_sec = mul_u64_u64_shr_add_u64(&tspec->tv_sec, delta,
-						   le64_to_cpu(st->clk->counter_period_frac_sec),
-						   st->clk->counter_period_shift,
+						   period_frac_sec, period_shift,
 						   le64_to_cpu(st->clk->time_frac_sec));
-		tspec->tv_nsec = mul_u64_u64_shr(frac_sec, NSEC_PER_SEC, 64);
+
+		/* For simulated PPS, adjust to the most recent second boundary */
+		if (on_second) {
+			uint64_t delta_cycles;
+			int frac_shift, shift_remain;
+
+			if (tspec->tv_sec == 0)
+				return -EAGAIN;  /* No second boundary crossed yet */
+
+			/*
+			 * Roll the counter back to the top of the current second.
+			 * frac_sec == 0 means we are already exactly on the
+			 * boundary (and __builtin_clzll(0) is undefined).
+			 */
+			if (frac_sec) {
+				/* Shift frac_sec left until top bit is set */
+				frac_shift = __builtin_clzll(frac_sec);
+				frac_sec <<= frac_shift;
+
+				/* Shift period right by the remaining bits */
+				shift_remain = period_shift - frac_shift;
+				if (shift_remain > 0)
+					period_frac_sec >>= shift_remain;
+				else
+					frac_sec >>= -shift_remain;
+
+				delta_cycles = frac_sec / period_frac_sec;
+				cycle -= delta_cycles;
+			}
+			tspec->tv_nsec = 0;
+		} else {
+			tspec->tv_nsec = mul_u64_u64_shr(frac_sec, NSEC_PER_SEC, 64);
+		}
+
 		tspec->tv_sec += le64_to_cpu(st->clk->time_sec);
 
 		if (!tai_adjust(st->clk, &tspec->tv_sec))
@@ -193,7 +236,8 @@ static int vmclock_get_crosststamp(struct vmclock_state *st,
 static int vmclock_get_crosststamp_kvmclock(struct vmclock_state *st,
 					    struct ptp_system_timestamp *sts,
 					    struct system_counterval_t *system_counter,
-					    struct timespec64 *tspec)
+					    struct timespec64 *tspec,
+					    bool on_second)
 {
 	struct pvclock_vcpu_time_info *pvti = this_cpu_pvti();
 	unsigned int pvti_ver;
@@ -204,7 +248,7 @@ static int vmclock_get_crosststamp_kvmclock(struct vmclock_state *st,
 	do {
 		pvti_ver = pvclock_read_begin(pvti);
 
-		ret = vmclock_get_crosststamp(st, sts, system_counter, tspec);
+		ret = vmclock_get_crosststamp(st, sts, system_counter, tspec, on_second);
 		if (ret)
 			break;
 
@@ -240,10 +284,10 @@ static int ptp_vmclock_get_time_fn(ktime_t *device_time,
 #ifdef SUPPORT_KVMCLOCK
 	if (READ_ONCE(st->sys_cs_id) == CSID_X86_KVM_CLK)
 		ret = vmclock_get_crosststamp_kvmclock(st, NULL, system_counter,
-						       &tspec);
+						       &tspec, false);
 	else
 #endif
-		ret = vmclock_get_crosststamp(st, NULL, system_counter, &tspec);
+		ret = vmclock_get_crosststamp(st, NULL, system_counter, &tspec, false);
 
 	if (!ret)
 		*device_time = timespec64_to_ktime(tspec);
@@ -280,6 +324,98 @@ static int ptp_vmclock_getcrosststamp(struct ptp_clock_info *ptp,
 	return ret;
 }
 
+static int ptp_vmclock_get_time_fn_pps(ktime_t *device_time,
+				       struct system_counterval_t *system_counter,
+				       void *ctx)
+{
+	struct vmclock_state *st = ctx;
+	struct timespec64 tspec;
+	int ret;
+
+#ifdef SUPPORT_KVMCLOCK
+	if (st->history_valid && st->history_snap.cs_id == CSID_X86_KVM_CLK)
+		ret = vmclock_get_crosststamp_kvmclock(st, NULL, system_counter,
+						       &tspec, true);
+	else
+#endif
+		ret = vmclock_get_crosststamp(st, NULL, system_counter, &tspec, true);
+
+	if (!ret)
+		*device_time = timespec64_to_ktime(tspec);
+
+	return ret;
+}
+
+/*
+ * Generate simulated PPS events for feeding __hardpps(), which expects to be
+ * given both CLOCK_REALTIME and CLOCK_MONOTONIC_RAW values for when a 1PPS
+ * signal actually happened (i.e. at the top of a second).
+ *
+ * vmclock_get_crosststamp(..., on_second=true) reads the vmclock and both
+ * system clocks from the same TSC value, then rolls the TSC back to the value
+ * it would have had at the start of the current second so the timestamps line
+ * up with a real pulse. The hrtimer reschedules itself for the top of the next
+ * second according to *vmclock*, not necessarily CLOCK_REALTIME.
+ */
+static enum hrtimer_restart ptp_vmclock_pps_timer(struct hrtimer *timer)
+{
+	struct vmclock_state *st = container_of(timer, struct vmclock_state, pps_timer);
+	struct system_device_crosststamp xtstamp = { .clock_id = CLOCK_REALTIME };
+	struct ptp_clock_event event;
+	ktime_t next, now_rt;
+	s64 delta_ns;
+	int ret;
+
+	if (!st->pps_enabled)
+		return HRTIMER_NORESTART;
+
+	/* Only report PPS if we have a valid history snapshot to interpolate from */
+	ret = -EINVAL;
+	if (st->history_valid) {
+		ret = get_device_system_crosststamp(ptp_vmclock_get_time_fn_pps, st,
+						    &st->history_snap, &xtstamp);
+		if (!ret) {
+			event.type = PTP_CLOCK_PPSUSR;
+			event.pps_times.ts_real = ktime_to_timespec64(xtstamp.sys_systime);
+#ifdef CONFIG_NTP_PPS
+			event.pps_times.ts_raw = ktime_to_timespec64(xtstamp.sys_monoraw);
+#endif
+			ptp_clock_event(st->ptp_clock, &event);
+		}
+	}
+
+	/* Capture a snapshot to bound the next interpolation */
+	ktime_get_snapshot_id(CLOCK_REALTIME, &st->history_snap);
+	st->history_valid = true;
+
+	/*
+	 * Schedule the next timer for the top of the next second according to
+	 * vmclock. If we reported a PPS event, xtstamp.sys_systime is already
+	 * at the second boundary, so just add a second; otherwise read the
+	 * current vmclock time and work out when it next hits a boundary.
+	 */
+	if (!ret) {
+		next = ktime_add_ns(xtstamp.sys_systime, NSEC_PER_SEC);
+	} else {
+		struct timespec64 ts;
+
+		if (vmclock_get_crosststamp(st, NULL, NULL, &ts, false))
+			return HRTIMER_NORESTART;
+
+		delta_ns = NSEC_PER_SEC - ts.tv_nsec;
+		next = ktime_add_ns(st->history_snap.systime, delta_ns);
+	}
+
+	/* Never reschedule in the past, or the timer tight-loops */
+	now_rt = ktime_get_real();
+	if (ktime_compare(next, now_rt) <= 0)
+		next = ktime_add_ns(now_rt, NSEC_PER_SEC);
+
+	hrtimer_set_expires(timer, next);
+
+	return HRTIMER_RESTART;
+}
+
 /*
  * PTP clock operations
  */
@@ -306,12 +442,43 @@ static int ptp_vmclock_gettimex(struct ptp_clock_info *ptp, struct timespec64 *t
 	struct vmclock_state *st = container_of(ptp, struct vmclock_state,
 						ptp_clock_info);
 
-	return vmclock_get_crosststamp(st, sts, NULL, ts);
+	return vmclock_get_crosststamp(st, sts, NULL, ts, false);
 }
 
 static int ptp_vmclock_enable(struct ptp_clock_info *ptp,
 			  struct ptp_clock_request *rq, int on)
 {
+	struct vmclock_state *st = container_of(ptp, struct vmclock_state,
+						ptp_clock_info);
+
+	switch (rq->type) {
+	case PTP_CLK_REQ_PPS:
+		st->pps_enabled = !!on;
+		if (on) {
+			struct timespec64 ts;
+			s64 delta_ns;
+
+			/* Snapshot to bound the first interpolation */
+			ktime_get_snapshot_id(CLOCK_REALTIME, &st->history_snap);
+			st->history_valid = true;
+
+			if (vmclock_get_crosststamp(st, NULL, NULL, &ts, false))
+				return -EIO;
+
+			/* When will vmclock next reach a second boundary? */
+			delta_ns = NSEC_PER_SEC - ts.tv_nsec;
+
+			hrtimer_start(&st->pps_timer,
+				      ktime_add_ns(st->history_snap.systime, delta_ns),
+				      HRTIMER_MODE_ABS);
+		} else {
+			hrtimer_cancel(&st->pps_timer);
+		}
+		return 0;
+	default:
+		break;
+	}
+
 	return -EOPNOTSUPP;
 }
 
@@ -320,7 +487,7 @@ static const struct ptp_clock_info ptp_vmclock_info = {
 	.max_adj	= 0,
 	.n_ext_ts	= 0,
 	.n_pins		= 0,
-	.pps		= 0,
+	.pps		= 1,
 	.adjfine	= ptp_vmclock_adjfine,
 	.adjtime	= ptp_vmclock_adjtime,
 	.gettimex64	= ptp_vmclock_gettimex,
@@ -356,6 +523,10 @@ static struct ptp_clock *vmclock_ptp_register(struct device *dev,
 	st->ptp_clock_info = ptp_vmclock_info;
 	strscpy(st->ptp_clock_info.name, st->name);
 
+	hrtimer_setup(&st->pps_timer, ptp_vmclock_pps_timer, CLOCK_REALTIME,
+		      HRTIMER_MODE_ABS);
+	st->pps_enabled = false;
+
 	return ptp_clock_register(&st->ptp_clock_info, dev);
 }
 
@@ -637,8 +808,11 @@ static void vmclock_remove(void *data)
 					   vmclock_acpi_notification_handler);
 #endif
 
-	if (st->ptp_clock)
+	if (st->ptp_clock) {
+		st->pps_enabled = false;
+		hrtimer_cancel(&st->pps_timer);
 		ptp_clock_unregister(st->ptp_clock);
+	}
 
 	if (st->miscdev.minor != MISC_DYNAMIC_MINOR)
 		misc_deregister(&st->miscdev);
-- 
2.54.0


^ permalink raw reply related

* [RFC PATCH v3 3/4] pps: Always use ktime_get_snapshot_id() for pps_get_ts()
From: David Woodhouse @ 2026-06-22 20:36 UTC (permalink / raw)
  To: Rodolfo Giometti, David Woodhouse, Richard Cochran, Andrew Lunn,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	John Stultz, Thomas Gleixner, Stephen Boyd, Miroslav Lichvar,
	linux-kernel, netdev, Alexander Gordeev
  Cc: David Woodhouse
In-Reply-To: <20260622211822.1056437-1-dwmw2@infradead.org>

From: David Woodhouse <dwmw@amazon.co.uk>

A recent commit changed ktime_get_snapshot_id() to return a corrected
::systime value which takes into account the divergence of the normal
per-tick timekeeping from the ideal NTP-disciplined clock.

Rather than using that more accurate timestamp *only* in the case where
CONFIG_NTP_PPS is enabled, do so unconditionally.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 include/linux/pps_kernel.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/include/linux/pps_kernel.h b/include/linux/pps_kernel.h
index 9f088c9023b1..5eeed6695882 100644
--- a/include/linux/pps_kernel.h
+++ b/include/linux/pps_kernel.h
@@ -99,14 +99,12 @@ static inline void timespec_to_pps_ktime(struct pps_ktime *kt,
 
 static inline void pps_get_ts(struct pps_event_time *ts)
 {
-#ifdef CONFIG_NTP_PPS
 	struct system_time_snapshot snap;
 
 	ktime_get_snapshot_id(CLOCK_REALTIME, &snap);
 	ts->ts_real = ktime_to_timespec64(snap.systime);
+#ifdef CONFIG_NTP_PPS
 	ts->ts_raw = ktime_to_timespec64(snap.monoraw);
-#else
-	ktime_get_real_ts64(&ts->ts_real);
 #endif
 }
 
-- 
2.54.0


^ permalink raw reply related

* [PATCH net-next] selftests: tls: size splice_short pipe by page size
From: Nirmoy Das @ 2026-06-22 20:28 UTC (permalink / raw)
  To: Jakub Kicinski, Sabrina Dubroca, John Fastabend
  Cc: netdev, linux-kernel, Nirmoy Das

splice_short grows its pipe with (MAX_FRAGS + 1) * 0x1000 so it can
queue one short vmsplice() buffer for each fragment before draining the
pipe. That assumes 4K pipe buffers.

On 64K-page kernels the request is rounded to 262144 bytes, which
provides only four pipe buffers. The fifth one-byte vmsplice() blocks in
pipe_wait_writable and the test times out before it reaches the TLS path.

Request enough bytes for the same number of pipe buffers using the
runtime page size, and assert that the kernel granted at least that much.
If an unprivileged run cannot raise the pipe above the system
pipe-max-size limit, skip the test because it cannot exercise the
intended path.

Fixes: 3667e9b442b9 ("selftests: tls: add test for short splice due to full skmsg")
Assisted-by: Codex:gpt-5
Signed-off-by: Nirmoy Das <nirmoyd@nvidia.com>
---
 tools/testing/selftests/net/tls.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c
index 30a236b8e9f73..e3bf4ade0f770 100644
--- a/tools/testing/selftests/net/tls.c
+++ b/tools/testing/selftests/net/tls.c
@@ -997,6 +997,8 @@ TEST_F(tls, splice_short)
 	char sendbuf[0x100];
 	char sendchar = 'S';
 	int pipefds[2];
+	int pipe_sz;
+	int ret;
 	int i;

 	sendchar_iov.iov_base = &sendchar;
@@ -1005,7 +1007,12 @@ TEST_F(tls, splice_short)
 	memset(sendbuf, 's', sizeof(sendbuf));

 	ASSERT_GE(pipe2(pipefds, O_NONBLOCK), 0);
-	ASSERT_GE(fcntl(pipefds[0], F_SETPIPE_SZ, (MAX_FRAGS + 1) * 0x1000), 0);
+	pipe_sz = (MAX_FRAGS + 1) * getpagesize();
+	ret = fcntl(pipefds[0], F_SETPIPE_SZ, pipe_sz);
+	if (ret < 0 && errno == EPERM)
+		SKIP(return, "insufficient pipe capacity");
+	ASSERT_GE(ret, 0);
+	ASSERT_GE(ret, pipe_sz);

 	for (i = 0; i < MAX_FRAGS; i++)
 		ASSERT_GE(vmsplice(pipefds[1], &sendchar_iov, 1, 0), 0);
-- 
2.43.0

^ permalink raw reply related

* Re: [PATCH bpf-next] bpf, unix: Guard sk_msg-dependent code behind CONFIG_NET_SOCK_MSG
From: Jakub Sitnicki @ 2026-06-22 20:23 UTC (permalink / raw)
  To: Kuniyuki Iwashima
  Cc: ast, bpf, daniel, jiayuan.chen, john.fastabend, kernel-team, kuba,
	netdev
In-Reply-To: <20260622161221.1742161-1-kuniyu@google.com>

On Mon, Jun 22, 2026 at 04:11 PM GMT, Kuniyuki Iwashima wrote:
> From: Jakub Sitnicki <jakub@cloudflare.com>
> Date: Mon, 22 Jun 2026 14:58:34 +0200
>> Prepare to decouple BPF_SYSCALL config option from NET_SOCK_MSG.
>> 
>> Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
>> ---
>>  net/unix/unix_bpf.c | 6 ++++++
>
> AFAIU, everyhing in this file is for BPF_SYSCALL && NET_SOCK_MSG,
> or am I missing something ?
>
> I feel that it would be cleaner to add a new Kconfig that depends
> on BPF_SYSCALL and NET_SOCK_MSG, change Makefile obj-$(CONFIG_XXX),
> and guard .psock_update_sk_prot in af_unix.c

What I'm aiming for is to have all code for sockmap-based redirects
behind BPF_SYSCALL && NET_SOCK_MSG, and the rest, which is the
implementation of sockmap container for socket refs, behind just
BPF_SYSCALL.

You're right that in the unix_bpf case, the whole file could be behind
BPF_SYSCALL && NET_SOCK_MSG because while you can keep refs to Unix
sockets in sockmap without doing redirects, it won't be of much use
(there's no sk_lookup or tc-sk_assign).

We can add a new config but I won't be able to follow that pattern for
either tcp_bpf or udp_bpf, which are next in line.

^ permalink raw reply

* Re: [PATCH net] seg6: validate SRH length before reading fixed fields
From: Andrea Mayer @ 2026-06-22 19:33 UTC (permalink / raw)
  To: Nuoqi Gui
  Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, netdev, bpf, linux-kernel, stefano.salsano,
	Andrea Mayer
In-Reply-To: <20260620-f01-17-seg6-srh-len-v1-1-36cbb29c12f1@mails.tsinghua.edu.cn>

On Sat, 20 Jun 2026 23:55:51 +0800
Nuoqi Gui <gnq25@mails.tsinghua.edu.cn> wrote:

Hi Nuoqi,
Thanks for the patch.

> seg6_validate_srh() reads fixed SRH fields such as srh->type and
> srh->hdrlen before checking that the supplied length covers the fixed
> struct ipv6_sr_hdr fields.  Callers that pass a length smaller than
> sizeof(struct ipv6_sr_hdr) therefore expose those reads to memory
> outside the validated range.
>
> The BPF SEG6 encap path (bpf_lwt_push_encap() -> bpf_push_seg6_encap())
> is one such caller: it forwards a BPF program-supplied pointer and
> length straight to seg6_validate_srh() with no minimum-size guard, so a
> 2-byte SEG6 encap header lets the validator read srh->type at offset 2
> beyond the caller-supplied buffer.

Besides the BPF use case, is there a caller that can reach it with
len < sizeof(*srh)? The ones I found all pass at least the fixed header.

>
> Reject lengths shorter than the fixed SRH at the top of
> seg6_validate_srh(), before any field is read.  This fixes the BPF helper
> path and hardens the common validator for any other caller that reaches it
> with a too-short SRH.
>
> Fixes: fe94cc290f53 ("bpf: Add IPv6 Segment Routing helpers")
> Signed-off-by: Nuoqi Gui <gnq25@mails.tsinghua.edu.cn>
> ---
>  net/ipv6/seg6.c | 3 +++
>  1 file changed, 3 insertions(+)
>
> diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c
> index 1c3ad25700c4c..d2cb32a1058af 100644
> --- a/net/ipv6/seg6.c
> +++ b/net/ipv6/seg6.c
> @@ -29,6 +29,9 @@ bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len, bool reduced)
>       int max_last_entry;
>       int trailing;
>
> +     if (len < (int)sizeof(*srh))
> +             return false;
> +

The (int) cast only changes the result when len < 0, which is not a meaningful
byte length. Plain "len < sizeof(*srh)" would be enough.

>       if (srh->type != IPV6_SRCRT_TYPE_4)
>               return false;
>
>
> ---
> base-commit: 96e7f9122aae0ed000ee321f324b812a447906d9
> change-id: 20260619-f01-17-seg6-srh-len-a85f35427e0b
>
> Best regards,
> --
> Nuoqi Gui <gnq25@mails.tsinghua.edu.cn>
>

Regards,
Andrea

^ permalink raw reply

* Re: [syzbot] [wireguard?] KCSAN: data-race in wg_socket_send_skb_to_peer / wg_socket_send_skb_to_peer (9)
From: Rafael Passos @ 2026-06-22 19:34 UTC (permalink / raw)
  To: Jason, andrew+netdev, davem, edumazet, kuba, linux-kernel, netdev,
	pabeni, syzkaller-bugs, wireguard, syzbot
In-Reply-To: <6a1d983b.b111c304.35cd64.0028.GAE@google.com>

Hi,

I started investigating this KCSAN warning by syzbot, and would like to
ask a few questions.

On Mon Jun 1, 2026 at 11:33 AM -03, syzbot wrote:
> ==================================================================
> BUG: KCSAN: data-race in wg_socket_send_skb_to_peer / wg_socket_send_skb_to_peer
>
> read-write to 0xffff88811af99028 of 8 bytes by task 310 on cpu 1:
>  wg_socket_send_skb_to_peer+0xe8/0x130 drivers/net/wireguard/socket.c:182
>  wg_socket_send_buffer_to_peer+0xf1/0x120 drivers/net/wireguard/socket.c:199
>  wg_packet_send_handshake_initiation drivers/net/wireguard/send.c:40 [inline]
>  wg_packet_handshake_send_worker+0x10d/0x160 drivers/net/wireguard/send.c:51
>  process_one_work kernel/workqueue.c:3314 [inline]
>  process_scheduled_works+0x4f0/0x9c0 kernel/workqueue.c:3397
>  worker_thread+0x58a/0x780 kernel/workqueue.c:3478
>  kthread+0x22a/0x280 kernel/kthread.c:436
>  ret_from_fork+0x146/0x330 arch/x86/kernel/process.c:158
>  ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245
>
> read-write to 0xffff88811af99028 of 8 bytes by task 15360 on cpu 0:
>  wg_socket_send_skb_to_peer+0xe8/0x130 drivers/net/wireguard/socket.c:182
>  wg_packet_create_data_done drivers/net/wireguard/send.c:251 [inline]
>  wg_packet_tx_worker+0x12d/0x330 drivers/net/wireguard/send.c:276
>  process_one_work kernel/workqueue.c:3314 [inline]
>  process_scheduled_works+0x4f0/0x9c0 kernel/workqueue.c:3397
>  worker_thread+0x58a/0x780 kernel/workqueue.c:3478
>  kthread+0x22a/0x280 kernel/kthread.c:436
>  ret_from_fork+0x146/0x330 arch/x86/kernel/process.c:158
>  ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245
>
> value changed: 0x0000000000000a2c -> 0x0000000000000ac0
>
> Reported by Kernel Concurrency Sanitizer on:
> CPU: 0 UID: 0 PID: 15360 Comm: kworker/0:2 Tainted: G        W           syzkaller #0 PREEMPT(lazy) 
> Tainted: [W]=WARN
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 04/18/2026
> Workqueue: wg-crypt-wg2 wg_packet_tx_worker

I tracked the change to this counter increment in `wg_socket_send_skb_to_peer`

+++ b/drivers/net/wireguard/socket.c
@@ -179,7 +179,8 @@ int wg_socket_send_skb_to_peer(struct wg_peer *peer, struct sk_buff *skb, u8 ds)
 	else
 		dev_kfree_skb(skb);
 	if (likely(!ret))
->		peer->tx_bytes += skb_len;  <- protected by a read_lock_bh only
 	read_unlock_bh(&peer->endpoint_lock);

It is protected by the read-part of a rwlock.
However, if the stack trace makes sense, this `wg_socket_send_skb_to_peer`
is being called after a handshake (wg_packet_send_handshake_initiation) and
a send worker call (wg_packet_tx_worker).

Does this make sense? Can such calls really happen outside of fuzzing?

Out of curiosity, I changed `tx_bytes` and `rx_bytes` from u64 to atomic64_t
in peer.h, and also the r/w ops in netlink.c, receive.c and socket.c files.
I ran the wireguard kselftest suite with and without this patch, and it
worked fine. Iperf results seem fine (on amd64).
I'm not sure if this should be the solution, or if this is even a real issue in the first place.

Any comments ?

Eager to learn.
Thanks,

Rafael Passos

^ permalink raw reply

* [PATCH net] net: pse-pd: scope pse_control regulator handle to kref lifetime
From: Carlo Szelinsky @ 2026-06-22 19:28 UTC (permalink / raw)
  To: Oleksij Rempel, Kory Maincent, Andrew Lunn, David S . Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni
  Cc: Corey Leavitt, Heiner Kallweit, Russell King, netdev,
	linux-kernel, Carlo Szelinsky

From: Corey Leavitt <corey@leavitt.info>

__pse_control_release() drops psec->ps via devm_regulator_put(), which
only succeeds if the devres entry added by the matching
devm_regulator_get_exclusive() is still present on pcdev->dev at the
time the pse_control's kref hits zero.

In practice that assumption does not hold when the controller is
unbound while any pse_control still has consumers: pcdev->dev's
devres list is released LIFO, so every per-attach regulator-GET
devres runs (and regulator_put()s the underlying regulator) before
pse_controller_unregister() itself is invoked. Any later
pse_control_put() from that unbind path then reads psec->ps as a
dangling pointer inside devm_regulator_put() and WARNs at
drivers/regulator/devres.c:232 (devres_release() fails to find the
already-released match).

The pse_control's consumer handle is logically scoped to the
pse_control's refcount, not to pcdev->dev's devres lifetime. Switch
to the plain regulator_get_exclusive() / regulator_put() pair so
__pse_control_release() does the right put regardless of whether
the controller's devres has already been unwound.

No change to the regulator-framework-visible refcount or lifetime of
the underlying regulator: a single get paired with a single put. The
existing devm_regulator_register() for the per-PI rails is unchanged
(those ARE correctly scoped to the controller's lifetime).

Fixes: d83e13761d5b ("net: pse-pd: Use regulator framework within PSE framework")
Signed-off-by: Corey Leavitt <corey@leavitt.info>
Acked-by: Kory Maincent <kory.maincent@bootlin.com>
Signed-off-by: Carlo Szelinsky <github@szelinsky.de>
---
This was patch 1 of the "decouple controller lookup from MDIO probe"
series [1]. Reposting it standalone for net as suggested, since it is a
self-contained fix. The rest of the series targets net-next and will be
resent once net-next reopens.

[1] https://lore.kernel.org/netdev/20260620112440.1734404-1-github@szelinsky.de/
---
 drivers/net/pse-pd/pse_core.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/pse-pd/pse_core.c b/drivers/net/pse-pd/pse_core.c
index 69dbdbde9d71..a5e6d7b26b9f 100644
--- a/drivers/net/pse-pd/pse_core.c
+++ b/drivers/net/pse-pd/pse_core.c
@@ -1367,7 +1367,7 @@ static void __pse_control_release(struct kref *kref)

 	if (psec->pcdev->pi[psec->id].admin_state_enabled)
 		regulator_disable(psec->ps);
-	devm_regulator_put(psec->ps);
+	regulator_put(psec->ps);

 	module_put(psec->pcdev->owner);

@@ -1436,8 +1436,8 @@ pse_control_get_internal(struct pse_controller_dev *pcdev, unsigned int index,
 		goto free_psec;

 	pcdev->pi[index].admin_state_enabled = ret;
-	psec->ps = devm_regulator_get_exclusive(pcdev->dev,
-						rdev_get_name(pcdev->pi[index].rdev));
+	psec->ps = regulator_get_exclusive(pcdev->dev,
+					   rdev_get_name(pcdev->pi[index].rdev));
 	if (IS_ERR(psec->ps)) {
 		ret = PTR_ERR(psec->ps);
 		goto put_module;

base-commit: d07d80b6a129a44538cda1549b7acf95154fb197
-- 
2.43.0

^ permalink raw reply related

* Re: Ethtool : PRBS feature
From: Lee Trager @ 2026-06-22 18:11 UTC (permalink / raw)
  To: Das, Shubham, Maxime Chevallier, Andrew Lunn
  Cc: Alexander H Duyck, netdev@vger.kernel.org, mkubecek@suse.cz,
	D H, Siddaraju, Chintalapalle, Balaji, Lindberg, Magnus,
	niklas.damberg@ericsson.com
In-Reply-To: <SN7PR11MB8109149608172808784CDCBEFFEF2@SN7PR11MB8109.namprd11.prod.outlook.com>

On 6/22/26 8:38 AM, Das, Shubham wrote:

> Hi Maxime,
>
>> Can you elaborate on what you have in mind for now ? what would the "ethtool --
>> phy-test" command look like in terms of its behaviour and parameters ?
> We are trying to converge on a userspace uAPI for PRBS/BERT functionality that can work across
> different hardware models (PHY-managed, MAC/NIC-offloaded, or firmware-based implementations),
> without exposing those differences to userspace.

This was my original thought as well. Create a well defined uAPI for 
PRBS testing/TX FIR tuning and allow the driver to implement support 
however it sees fit. Since our target is for ethernet devices ethtool 
was a natural spot for the uAPI.

I presented this at netdev last year and received strong push back 
against associating PRBS testing/TX FIR tuning with ethtool. The 
argument being any new uAPI added to the kernel should be generic enough 
to handle future use cases so duplicate uAPIs don't have to be added. 
Since PRBS testing/TX FIR tuning can be done on many phys(Ethernet, 
PCIE, USB, etc) the uAPI does not belong in ethtool and needs to be 
structured to support other use cases.

As drivers/phy is the base phy library the thought was support should be 
added in drivers/phy and a new phytool should be created to interact 
with a uAPI. This would be generic enough to support all use cases, with 
the downside being existing drivers would have to onboard to drivers/phy.

I do wonder if the best path forward would be to create phytool in a way 
that allows the driver to implement PRBS testing/TX FIR tuning as it 
sees fit instead of being strictly tied to drivers/phy.

>
> Based on the functionality we currently have, we proposed below commands in first email :
>
> PRBS Transmitter/Checker Pattern Configuration:
> ethtool --phy-test eth1 tx-prbs prbs7
> ethtool --phy-test eth2 rx-prbs prbs7
>
> BERT Test:
> ethtool --phy-test eth2 bert start
> ethtool --phy-test eth2 bert stop
>
> BERT Test Counter Read/ PRBS Lock Status:
> ethtool --phy-test eth2 stats
>
> BERT Clear stats - Symbol and Error counter:
> ethtool --phy-test eth2 clear-stats
>
> TX Error Injection:
> ethtool --phy-test eth1 inject-error 1
> ethtool --phy-test eth1 inject-error 1e-3
>
> Disable PRBS Pattern : TX/RX
> ethtool --phy-test eth1 tx-prbs off
> ethtool --phy-test eth2 rx-prbs off

The goal of running these tests is to validate TX FIR values. If testing 
fails we need a uAPI to change those values.

Also, the uAPI needs to support per-lane testing. One thing hardware 
engineers at Meta did was test each lane with a different set of TX FIR 
values which allowed them to quickly determine the best set of values.

>
> Approach would be to add a generic ethtool netlink API for PHY/SerDes and allow drivers to implement the operations directly.
> Conceptually:
>         ethtool ⇒ ethtool netlink ⇒ driver-specific implementation
>
> We would appreciate your input on whether a command-based model is suitable for a uAPI, and how we should design
> it to accommodate different implementation models, such as PHY-based, phylib-based, and MAC/firmware-offloaded PRBS.
>
> - Shubham D
>
>> -----Original Message-----
>> From: Maxime Chevallier <maxime.chevallier@bootlin.com>
>> Sent: 20 June 2026 20:09
>> To: Das, Shubham <shubham.das@intel.com>; Andrew Lunn <andrew@lunn.ch>
>> Cc: Alexander H Duyck <alexander.duyck@gmail.com>; lee@trager.us;
>> netdev@vger.kernel.org; mkubecek@suse.cz; D H, Siddaraju
>> <siddaraju.dh@intel.com>; Chintalapalle, Balaji
>> <balaji.chintalapalle@intel.com>; Lindberg, Magnus
>> <magnus.k.lindberg@ericsson.com>; niklas.damberg@ericsson.com
>> Subject: Re: Ethtool : PRBS feature
>>
>> Hi,
>>
>> On 6/20/26 15:48, Das, Shubham wrote:
>>>> Can you change the firmware to expose the 802.3 registers for PRBS?
>>>> You can then write a library which both plylib and your driver can use.
>>> Andrew,
>>>
>>> No, exposing the PRBS registers to drivers is not possible in our design (the
>> registers are buried deep within the Accelerator/NIC/PHY/Analog IP hierarchy).
>>> Additionally, the PHY PRBS registers are not in accordance with the IEEE Clause
>> 45 definitions. For instance, the PRBS registers are paged and 32-bit wide.
>>> Given these constraints, we think ethtool --phy-test is a reasonable starting
>> point for exposing the long-established Ethernet PRBS functionality to Linux
>> userspace, as it aligns well with the driver-owned NIC architecture model. If you
>> think a more generic layered approach would be preferable, we would appreciate
>> guidance on the expected architecture. That would help us better understand the
>> implementation complexity, required effort, and delivery timelines.
>>
>> Can you elaborate on what you have in mind for now ? what would the "ethtool --
>> phy-test" command look like in terms of its behaviour and parameters ?
>>
>> This feature is interesting for multiple people, each having different hardware
>> designs and constraints. It's good to consider an iterative approach to build this,
>> however we need to have in mind that this is uAPI, so once we commit to a design
>> choice, we have to live with it.
>>
>> We do have flexibility on the kernel side of the API. We can implement PRBS in
>> generic PHY, phylib, some MAC driver that talks to a firmware, etc. and hide away
>> these implementation details to userspace, but we need to make sure the uAPI
>> we come up with allows us to support all of that.
>>
>> Let's figure this out together, if you already have some ideas in mind we can use
>> that as a starting point for the discussion :)
>>
>> Maxime
>>
>>> Thanks,
>>> Shubham D
>>>
>>>> -----Original Message-----
>>>> From: Andrew Lunn <andrew@lunn.ch>
>>>> Sent: 20 June 2026 00:07
>>>> To: Das, Shubham <shubham.das@intel.com>
>>>> Cc: Alexander H Duyck <alexander.duyck@gmail.com>; lee@trager.us;
>>>> netdev@vger.kernel.org; mkubecek@suse.cz; D H, Siddaraju
>>>> <siddaraju.dh@intel.com>; Chintalapalle, Balaji
>>>> <balaji.chintalapalle@intel.com>; Lindberg, Magnus
>>>> <magnus.k.lindberg@ericsson.com>; niklas.damberg@ericsson.com
>>>> Subject: Re: Ethtool : PRBS feature
>>>>
>>>>> The host driver does not directly access any registers but requests
>>>>> the PHY FW to manage PRBS on behalf of it.
>>>> Maybe a dumb question. Why?
>>>>
>>>> Can you change the firmware to expose the 802.3 registers for PRBS?
>>>> You can then write a library which both phylib and your driver can use.
>>>>
>>>> 	Andrew

^ permalink raw reply

* Re: [PATCH v3 5/7] kernel: Use mutable list iterators
From: Eduard Zingerman @ 2026-06-22 19:03 UTC (permalink / raw)
  To: Kaitao Cheng, Paul Moore, Eric Paris, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Kumar Kartikeya Dwivedi,
	David S. Miller, Jakub Kicinski, Jesper Dangaard Brouer,
	John Fastabend, Tejun Heo, Johannes Weiner, Michal Koutný,
	Maarten Lankhorst, Maxime Ripard, Natalie Vock, Peter Zijlstra,
	Ingo Molnar, Arnaldo Carvalho de Melo, Namhyung Kim,
	Masami Hiramatsu, Oleg Nesterov, Peter Oberparleiter,
	Andrew Morton, Baoquan He, Mike Rapoport, Pasha Tatashin,
	Pratyush Yadav, Naveen N Rao, Josh Poimboeuf, Jiri Kosina,
	Miroslav Benes, Petr Mladek, Will Deacon, Boqun Feng,
	Luis Chamberlain, Petr Pavlu, Daniel Gomez, Sami Tolvanen,
	Steffen Klassert, Daniel Jordan, Rafael J. Wysocki,
	Davidlohr Bueso, Paul E. McKenney, Josh Triplett,
	Frederic Weisbecker, Neeraj Upadhyay, Joel Fernandes,
	Uladzislau Rezki, Juri Lelli, Vincent Guittot, Kees Cook,
	Balbir Singh, Anna-Maria Behnsen, Thomas Gleixner, John Stultz,
	KP Singh, Matt Bobrowski, Nathan Chancellor, Martin KaFai Lau,
	Song Liu, Mark Rutland, Mathieu Desnoyers, Dietmar Eggemann,
	David Vernet, Steven Rostedt
  Cc: audit, linux-kernel, bpf, netdev, cgroups, dri-devel,
	linux-perf-users, linux-trace-kernel, kexec, live-patching,
	linux-modules, linux-crypto, linux-pm, rcu, sched-ext, llvm,
	Kaitao Cheng
In-Reply-To: <20260622042811.31684-1-kaitao.cheng@linux.dev>

On Mon, 2026-06-22 at 12:28 +0800, Kaitao Cheng wrote:
> From: Kaitao Cheng <chengkaitao@kylinos.cn>
> 
> The safe list iteration helpers require callers to provide a temporary
> cursor even when the cursor is only used internally by the loop. This
> leaves many functions with otherwise unused variables whose only purpose
> is to satisfy the old iterator interface.
> 
> Use the mutable list iteration helpers for those cases. The mutable
> helpers keep the same removal-safe traversal semantics, while allowing
> the temporary cursor to be internal to the macro when the caller does
> not need to observe it.
> 
> Convert list, hlist and llist users under kernel/ where the temporary
> cursor is not used outside the iteration. Keep the explicit cursor form
> where the next entry is still needed by the surrounding code.
> 
> No functional change intended.
> 
> Signed-off-by: Kaitao Cheng <chengkaitao@kylinos.cn>
> ---

Besides the fact that this does not apply,
I don't see a reason why this is needed for the BPF sub-tree.

[...]

^ permalink raw reply

* [PATCH net 2/2] selftests/net: Add TCP-AO key shadowing test
From: Eric Dumazet @ 2026-06-22 18:52 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, Dmitry Safonov, Neal Cardwell, Kuniyuki Iwashima,
	netdev, eric.dumazet, Eric Dumazet
In-Reply-To: <20260622185248.1717846-1-edumazet@google.com>

Add a new selftest shadowing.c to tools/testing/selftests/net/tcp_ao
to verify that more specific keys are correctly preferred over less
specific ones (shadowing prevention), regardless of their insertion order.

The test configures a server with a specific host key, and a client with
both a specific host key and a wildcard subnet key, inserted in the
"wrong" order (wildcard last, which would shadow the specific one under
the bug). It then verifies that the client can still successfully
connect to the server, which only succeeds if the client correctly
selects the more specific key for the outbound connection.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Assisted-by: Gemini:gemini-3.1-pro
---
 tools/testing/selftests/net/tcp_ao/Makefile   |  1 +
 .../testing/selftests/net/tcp_ao/shadowing.c  | 93 +++++++++++++++++++
 2 files changed, 94 insertions(+)
 create mode 100644 tools/testing/selftests/net/tcp_ao/shadowing.c

diff --git a/tools/testing/selftests/net/tcp_ao/Makefile b/tools/testing/selftests/net/tcp_ao/Makefile
index 5b0205c70c3983815315048c0ec1275525b7a29a..0c601d7049320be2310f9ff32988ae229584222e 100644
--- a/tools/testing/selftests/net/tcp_ao/Makefile
+++ b/tools/testing/selftests/net/tcp_ao/Makefile
@@ -2,6 +2,7 @@
 TEST_BOTH_AF := bench-lookups
 TEST_BOTH_AF += connect
 TEST_BOTH_AF += connect-deny
+TEST_BOTH_AF += shadowing
 TEST_BOTH_AF += icmps-accept icmps-discard
 TEST_BOTH_AF += key-management
 TEST_BOTH_AF += restore
diff --git a/tools/testing/selftests/net/tcp_ao/shadowing.c b/tools/testing/selftests/net/tcp_ao/shadowing.c
new file mode 100644
index 0000000000000000000000000000000000000000..da14b13e032d5a0632f398b7eaa72b8045e61ffe
--- /dev/null
+++ b/tools/testing/selftests/net/tcp_ao/shadowing.c
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <inttypes.h>
+#include "aolib.h"
+
+static void *server_fn(void *arg)
+{
+	int sk, lsk;
+	ssize_t bytes;
+
+	lsk = test_listen_socket(this_ip_addr, test_server_port, 1);
+
+	/* Server only has the specific key for the client.
+	 * It expects KeyID 100, signed with "pass_specific".
+	 */
+	if (test_add_key(lsk, "pass_specific", this_ip_dest, -1, 100, 100))
+		test_error("setsockopt(TCP_AO_ADD_KEY)");
+
+	synchronize_threads(); /* 1: Server ready and key added */
+
+	if (test_wait_fd(lsk, TEST_TIMEOUT_SEC, 0))
+		test_error("test_wait_fd()");
+
+	sk = accept(lsk, NULL, NULL);
+	if (sk < 0)
+		test_error("accept()");
+
+	synchronize_threads(); /* 2: Connection accepted */
+
+	/* Verify we can receive data from the client */
+	bytes = test_server_run(sk, 0, 0);
+	if (bytes < 0) {
+		test_fail("server: failed to receive data");
+	} else {
+		test_ok("server: connection authenticated successfully");
+	}
+
+	close(sk);
+	close(lsk);
+	return NULL;
+}
+
+static void *client_fn(void *arg)
+{
+	int sk = socket(test_family, SOCK_STREAM, IPPROTO_TCP);
+	union tcp_addr wildcard_addr = {};
+
+	if (sk < 0)
+		test_error("socket()");
+
+	/* Client adds keys in the "wrong" order (wildcard last) to trigger shadowing.
+	 * 1. Specific key (Key B, ID 100)
+	 * 2. Wildcard key (Key A, ID 101)
+	 *
+	 * Without the fix, the wildcard key will be at the head of the list
+	 * and will shadow the specific key during outbound lookup, causing
+	 * the client to send a SYN with KeyID 101 (which the server doesn't have).
+	 */
+
+	/* 1. Add specific key */
+	if (test_add_key(sk, "pass_specific", this_ip_dest, -1, 100, 100))
+		test_error("setsockopt(TCP_AO_ADD_KEY) specific");
+
+	/* 2. Add wildcard key (any address, prefix 0) */
+	if (test_add_key(sk, "pass_wildcard", wildcard_addr, 0, 101, 101))
+		test_error("setsockopt(TCP_AO_ADD_KEY) wildcard");
+
+	synchronize_threads(); /* 1: Client ready and keys added => connect() */
+
+	if (test_connect_socket(sk, this_ip_dest, test_server_port) <= 0) {
+		test_fail("client: failed to connect (shadowing bug present?)");
+		close(sk);
+		return NULL;
+	}
+
+	synchronize_threads(); /* 2: Connection established */
+
+	/* Send some data to verify the connection works */
+	if (test_client_verify(sk, 100, 20)) {
+		test_fail("client: verify failed");
+	} else {
+		test_ok("client: connection established and verified (precedence correct)");
+	}
+
+	close(sk);
+	return NULL;
+}
+
+int main(int argc, char *argv[])
+{
+	/* We expect 2 test results: 1 from server, 1 from client */
+	test_init(2, server_fn, client_fn);
+	return 0;
+}
-- 
2.55.0.rc0.799.gd6f94ed593-goog


^ permalink raw reply related

* [PATCH net 1/2] tcp: fix TCP-AO key lookup precedence (shadowing)
From: Eric Dumazet @ 2026-06-22 18:52 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, Dmitry Safonov, Neal Cardwell, Kuniyuki Iwashima,
	netdev, eric.dumazet, Eric Dumazet
In-Reply-To: <20260622185248.1717846-1-edumazet@google.com>

TCP-AO implementation stores Master Key Tuples (MKTs) in an unsorted
doubly-linked list (ao_info->head) and inserts new keys at the head.
When looking up a key, __tcp_ao_do_lookup() walks this list and returns
the first match it finds.

Because the list is unsorted, a newer, less-specific key can shadow an
older, more-specific key if it happens to be inserted later. This leads
to incorrect key selection in two scenarios:

1. VRF Shadowing: A wildcard VRF key (not bound to an interface) added
   after a VRF-specific key will shadow the VRF-specific key for traffic
   arriving on that VRF.
2. Prefix Shadowing: A less-specific prefix key (e.g., /24) added after
   a more-specific prefix key (e.g., /32) will shadow the more-specific
   key during outbound connection establishment.

Unlike TCP MD5, which walks the entire list and evaluates the "best
match" using better_md5_match(), TCP-AO expects the list order to
determine precedence.

Fix this by implementing sorted insertion in tcp_ao_link_mkt(). Keys
are inserted in descending order of specificity:
  - VRF-bound keys take precedence over unbound keys.
  - Longer prefix matches (LPM) take precedence over shorter ones.

This preserves the performance of the lockless RX lookup path (early
return on first match) while ensuring correct precedence.

Fixes: 4954f17ddefc ("net/tcp: Introduce TCP_AO setsockopt()s")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Assisted-by: Gemini:gemini-3.1-pro
---
 net/ipv4/tcp_ao.c | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c
index 2f69bcecae78a677f33033a2d30e09a8ff858ad8..2d10fb1dd4cf87cc79ef5b5ead80eb3048218250 100644
--- a/net/ipv4/tcp_ao.c
+++ b/net/ipv4/tcp_ao.c
@@ -341,9 +341,34 @@ static struct tcp_ao_info *tcp_ao_alloc_info(gfp_t flags)
 	return ao;
 }

+static bool tcp_ao_key_is_more_specific(const struct tcp_ao_key *a,
+					const struct tcp_ao_key *b)
+{
+	bool a_vrf = !!(a->keyflags & TCP_AO_KEYF_IFINDEX);
+	bool b_vrf = !!(b->keyflags & TCP_AO_KEYF_IFINDEX);
+
+	if (a_vrf != b_vrf)
+		return a_vrf; /* VRF-bound is more specific */
+
+	return a->prefixlen > b->prefixlen; /* Longer prefix is more specific */
+}
+
 static void tcp_ao_link_mkt(struct tcp_ao_info *ao, struct tcp_ao_key *mkt)
 {
-	hlist_add_head_rcu(&mkt->node, &ao->head);
+	struct tcp_ao_key *pos;
+	struct hlist_node *last = NULL;
+
+	hlist_for_each_entry(pos, &ao->head, node) {
+		if (tcp_ao_key_is_more_specific(mkt, pos)) {
+			hlist_add_before_rcu(&mkt->node, &pos->node);
+			return;
+		}
+		last = &pos->node;
+	}
+	if (last)
+		hlist_add_behind_rcu(&mkt->node, last);
+	else
+		hlist_add_head_rcu(&mkt->node, &ao->head);
 }

 static struct tcp_ao_key *tcp_ao_copy_key(struct sock *sk,
-- 
2.55.0.rc0.799.gd6f94ed593-goog

^ permalink raw reply related

* [PATCH net 0/2] tcp: make TCP-AO lookups more predictable
From: Eric Dumazet @ 2026-06-22 18:52 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, Dmitry Safonov, Neal Cardwell, Kuniyuki Iwashima,
	netdev, eric.dumazet, Eric Dumazet

This series fixes a TCP-AO key lookup precedence bug.

TCP-AO stores MKTs in an unsorted list and returns the first match. This
allows newer, less-specific keys (wildcard VRF or shorter prefixes) to
shadow older, more-specific keys if inserted later.

Fix this by implementing sorted insertion in tcp_ao_link_mkt() based on
key specificity (VRF binding, then prefix length). This keeps the RX
lookup path fast while ensuring correctness.

The second patch adds a selftest to verify this behavior.

Eric Dumazet (2):
  tcp: fix TCP-AO key lookup precedence (shadowing)
  selftests/net: Add TCP-AO key shadowing test

 net/ipv4/tcp_ao.c                             | 27 +++++-
 tools/testing/selftests/net/tcp_ao/Makefile   |  1 +
 .../testing/selftests/net/tcp_ao/shadowing.c  | 93 +++++++++++++++++++
 3 files changed, 120 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/net/tcp_ao/shadowing.c

-- 
2.55.0.rc0.799.gd6f94ed593-goog

^ permalink raw reply

* [PATCH] MAINTAINERS: Orphan SUNPLUS ETHERNET DRIVER
From: Wells Lu @ 2026-06-22 18:07 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: netdev, linux-kernel, Shitalkumar Gandhi, Andrew Lunn,
	David S . Miller, Eric Dumazet, Paolo Abeni, Simon Horman,
	Shitalkumar Gandhi, Wells Lu

I have left Sunplus and no longer have access to the relevant hardware
to test or maintain this driver. Mark the driver as orphaned.

Signed-off-by: Wells Lu <wellslutw@gmail.com>
---
 MAINTAINERS | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 3f843d20b..30040fd65 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -25928,9 +25928,8 @@ S:	Maintained
 F:	drivers/net/ethernet/dlink/sundance.c
 
 SUNPLUS ETHERNET DRIVER
-M:	Wells Lu <wellslutw@gmail.com>
 L:	netdev@vger.kernel.org
-S:	Maintained
+S:	Orphan
 W:	https://sunplus.atlassian.net/wiki/spaces/doc/overview
 F:	Documentation/devicetree/bindings/net/sunplus,sp7021-emac.yaml
 F:	drivers/net/ethernet/sunplus/
-- 
2.43.0


^ permalink raw reply related

* [PATCH v2 4/4] vhost/vsock: add VHOST_RESET_OWNER ioctl
From: Andrey Drobyshev @ 2026-06-22 17:58 UTC (permalink / raw)
  To: linux-kernel
  Cc: kvm, virtualization, netdev, sgarzare, mst, stefanha,
	dongli.zhang, maciej.szmigiero, bchaney, mark.kanda, ptikhomirov,
	den, andrey.drobyshev
In-Reply-To: <20260622175808.508084-1-andrey.drobyshev@virtuozzo.com>

From: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>

This ioctl is needed for QEMU's CPR (checkpoint-restore) migration of
the guest with vhost-vsock device.  For this to work, we need to reset
the device ownership on the source side by calling RESET_OWNER, and then
claim it on the dest side by calling SET_OWNER.  We expect not to lose any
AF_VSOCK connection while this happens.

RESET_OWNER keeps the guest CID hashed, so that connections survive. That
leaves the device reachable by the lockless send path while the worker is
being torn down: a concurrent vhost_transport_send_pkt() can call
vhost_vq_work_queue() as vhost_workers_free() frees the worker.  That might
cause a use-after-free of vq->worker.  In addition, any work queued onto
the dying worker leaves VHOST_WORK_QUEUED stuck, stalling send_pkt_queue
after resume.

Fence the send path around the teardown: send_pkt() only kicks the worker
while the backend is alive (otherwise the skb stays queued and
vhost_vsock_start() drains it on resume).  And reset_owner() calls
synchronize_rcu() after drop_backends() so in-flight senders finish before
the worker is freed.

Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
---
 drivers/vhost/vsock.c | 41 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 81d4f7209719..7d0146cd38d2 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -318,7 +318,14 @@ vhost_transport_send_pkt(struct sk_buff *skb, struct net *net)
 		atomic_inc(&vsock->queued_replies);
 
 	virtio_vsock_skb_queue_tail(&vsock->send_pkt_queue, skb);
-	vhost_vq_work_queue(&vsock->vqs[VSOCK_VQ_RX], &vsock->send_pkt_work);
+
+	/* Skip the kick once the backend is gone (stop/RESET_OWNER); the skb
+	 * stays queued and vhost_vsock_start() drains it. Pairs with the
+	 * synchronize_rcu() in vhost_vsock_reset_owner().
+	 */
+	if (data_race(vhost_vq_get_backend(&vsock->vqs[VSOCK_VQ_RX])))
+		vhost_vq_work_queue(&vsock->vqs[VSOCK_VQ_RX],
+				    &vsock->send_pkt_work);
 
 	rcu_read_unlock();
 	return len;
@@ -903,6 +910,36 @@ static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features)
 	return -EFAULT;
 }
 
+static int vhost_vsock_reset_owner(struct vhost_vsock *vsock)
+{
+	struct vhost_iotlb *umem;
+	long err;
+
+	mutex_lock(&vsock->dev.mutex);
+	err = vhost_dev_check_owner(&vsock->dev);
+	if (err)
+		goto done;
+	umem = vhost_dev_reset_owner_prepare();
+	if (!umem) {
+		err = -ENOMEM;
+		goto done;
+	}
+	vhost_vsock_drop_backends(vsock);
+
+	/* Let in-flight send_pkt() callers stop touching the worker before the
+	 * flush + free below. Pairs with the backend check in
+	 * vhost_transport_send_pkt().
+	 */
+	synchronize_rcu();
+
+	vhost_vsock_flush(vsock);
+	vhost_dev_stop(&vsock->dev);
+	vhost_dev_reset_owner(&vsock->dev, umem);
+done:
+	mutex_unlock(&vsock->dev.mutex);
+	return err;
+}
+
 static long vhost_vsock_dev_ioctl(struct file *f, unsigned int ioctl,
 				  unsigned long arg)
 {
@@ -946,6 +983,8 @@ static long vhost_vsock_dev_ioctl(struct file *f, unsigned int ioctl,
 			return -EOPNOTSUPP;
 		vhost_set_backend_features(&vsock->dev, features);
 		return 0;
+	case VHOST_RESET_OWNER:
+		return vhost_vsock_reset_owner(vsock);
 	default:
 		mutex_lock(&vsock->dev.mutex);
 		r = vhost_dev_ioctl(&vsock->dev, ioctl, argp);
-- 
2.47.1


^ permalink raw reply related

* [PATCH v2 3/4] vhost/vsock: re-scan TX virtqueue on device start
From: Andrey Drobyshev @ 2026-06-22 17:58 UTC (permalink / raw)
  To: linux-kernel
  Cc: kvm, virtualization, netdev, sgarzare, mst, stefanha,
	dongli.zhang, maciej.szmigiero, bchaney, mark.kanda, ptikhomirov,
	den, andrey.drobyshev
In-Reply-To: <20260622175808.508084-1-andrey.drobyshev@virtuozzo.com>

During QEMU CPR live-update (and VHOST_RESET_OWNER in general) the guest
keeps running while the host drops and later re-attaches vhost backends.
If the guest adds a buffer to the TX virtqueue (guest->host) and kicks
while the backend is temporarily NULL (between vhost_vsock_drop_backends()
and the next vhost_vsock_start()), then the kick is delivered to the
vhost worker, handle_tx_kick() sees a NULL backend and returns, and the
kick signal is consumed.  The buffer is then left in the ring.

Then upon device start vhost_vsock_start() only re-kicks the RX send
worker, never the TX VQ, so the buffer is processed only if the guest
happens to kick again.  But if the guest itself is now waiting for data
from the host, it will never kick TX VQ again, and we end up in a
deadlock.

The issue itself is pre-existing, but it only manifests during a brief
pause caused by VHOST_RESET_OWNER.  Namely, the deadlock is reproduced
during active host->guest socat data transfer under multiple consecutive
CPR live-updates.

To fix this, in vhost_vsock_start(), after kicking the RX send worker, also
queue the TX vq poll so any buffers the guest enqueued while we were paused
get scanned.

Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
---
 drivers/vhost/vsock.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index bec6bcfd885f..81d4f7209719 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -646,6 +646,13 @@ static int vhost_vsock_start(struct vhost_vsock *vsock)
 	 */
 	vhost_vq_work_queue(&vsock->vqs[VSOCK_VQ_RX], &vsock->send_pkt_work);

+	/*
+	 * Some packets might've also been queued in TX VQ.  That is the case
+	 * during the brief device pause caused by VHOST_RESET_OWNER.  Re-scan
+	 * the TX VQ here, mirroring the RX send-worker kick above.
+	 */
+	vhost_poll_queue(&vsock->vqs[VSOCK_VQ_TX].poll);
+
 	mutex_unlock(&vsock->dev.mutex);
 	return 0;

-- 
2.47.1

^ permalink raw reply related

* [PATCH v2 2/4] vhost/vsock: suppress EHOSTUNREACH fast-fail during CPR pause
From: Andrey Drobyshev @ 2026-06-22 17:58 UTC (permalink / raw)
  To: linux-kernel
  Cc: kvm, virtualization, netdev, sgarzare, mst, stefanha,
	dongli.zhang, maciej.szmigiero, bchaney, mark.kanda, ptikhomirov,
	den, andrey.drobyshev
In-Reply-To: <20260622175808.508084-1-andrey.drobyshev@virtuozzo.com>

Earlier commit bb26ed5f3a8b ("vhost/vsock: Refuse the connection
immediately when guest isn't ready") added a fast-fail in
vhost_transport_send_pkt().  It rejects every host send with -EHOSTUNREACH
until the destination calls SET_RUNNING(1).  The fast-fail condition checks
whether the device's backends are dropped, and if they are, the guest is
considered to be not ready.

However, there might be other reasons for backends to be nulled.  In
particular, when QEMU is performing CPR (checkpoint-restore) migration,
device ownership is being RESET and SET again, which causes the backends to
be dropped and reattached.  If we end up connecting during this window, an
AF_VSOCK client gets -EHOSTUNREACH, which is wrong.

Add a 'started' flag which is set once in vhost_vsock_start() and is
never cleared.  The behaviour changes to:

  * When device was never started -> flag is unset -> no listener can
    exist yet -> fast-fail;
  * Once the device starts -> flag is set -> we don't fast-fail ->
    we queue and preserve during any later stop / CPR pause.

Important caveat: after the first start, a connect during any stopped
window is queued instead of fast-failed.  That was the behaviour before
the patch bb26ed5f3a8b, and we're restoring it now.  However we still
keep the behaviour originally intended by that commit (i.e. fast-fail if
there's no real listener yet) while fixing the CPR path.

Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
---
 drivers/vhost/vsock.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index b12221ce6faf..bec6bcfd885f 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -61,6 +61,7 @@ struct vhost_vsock {
 
 	u32 guest_cid;
 	bool seqpacket_allow;
+	bool started;		/* set on first SET_RUNNING(1); never cleared */
 };
 
 static u32 vhost_transport_get_local_cid(void)
@@ -302,17 +303,12 @@ vhost_transport_send_pkt(struct sk_buff *skb, struct net *net)
 		return -ENODEV;
 	}
 
-	/* Fast-fail if the guest hasn't enabled the RX vq yet. Queuing the packet
-	 * and making the caller wait is pointless: even if the guest manages to init
-	 * within the timeout, it'll immediately reply with RST, because there's no
-	 * listener on the port yet.
-	 *
-	 * vhost_vq_get_backend() without vq->mutex is acceptable here: locking
-	 * the mutex would be too expensive in this hot path, and we already have
-	 * all the outcomes covered: if the backend becomes NULL right after the check,
-	 * vhost_transport_do_send_pkt() will check it under the mutex anyway.
+	/* Fast-fail until the guest first enables the device (SET_RUNNING(1)).
+	 * Before that there is no listener, so queuing is pointless. 'started'
+	 * is never cleared, so once we're up we keep queuing across later
+	 * stop / CPR-pause windows.
 	 */
-	if (unlikely(!data_race(vhost_vq_get_backend(&vsock->vqs[VSOCK_VQ_RX])))) {
+	if (unlikely(!READ_ONCE(vsock->started))) {
 		rcu_read_unlock();
 		kfree_skb(skb);
 		return -EHOSTUNREACH;
@@ -640,6 +636,11 @@ static int vhost_vsock_start(struct vhost_vsock *vsock)
 		mutex_unlock(&vq->mutex);
 	}
 
+	/* Set 'started' flag on the first start; never cleared, so send_pkt
+	 * keeps queuing (instead of fast-failing) on later stop / CPR pauses.
+	 */
+	WRITE_ONCE(vsock->started, true);
+
 	/* Some packets may have been queued before the device was started,
 	 * let's kick the send worker to send them.
 	 */
@@ -728,6 +729,7 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file)
 
 	vsock->guest_cid = 0; /* no CID assigned yet */
 	vsock->seqpacket_allow = false;
+	vsock->started = false;
 
 	atomic_set(&vsock->queued_replies, 0);
 
-- 
2.47.1


^ permalink raw reply related

* [PATCH v2 1/4] vhost/vsock: split out vhost_vsock_drop_backends helper
From: Andrey Drobyshev @ 2026-06-22 17:58 UTC (permalink / raw)
  To: linux-kernel
  Cc: kvm, virtualization, netdev, sgarzare, mst, stefanha,
	dongli.zhang, maciej.szmigiero, bchaney, mark.kanda, ptikhomirov,
	den, andrey.drobyshev
In-Reply-To: <20260622175808.508084-1-andrey.drobyshev@virtuozzo.com>

From: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>

Split the actual backend dropping part from vhost_vsock_stop.  We're
going to need it for the VHOST_RESET_OWNER implementation in the
following patch, when vsock->dev.mutex is already taken and owner is
checked.

Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
---
 drivers/vhost/vsock.c | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 9aaab6bb8061..b12221ce6faf 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -664,9 +664,24 @@ static int vhost_vsock_start(struct vhost_vsock *vsock)
 	return ret;
 }
 
-static int vhost_vsock_stop(struct vhost_vsock *vsock, bool check_owner)
+static void vhost_vsock_drop_backends(struct vhost_vsock *vsock)
 {
+	struct vhost_virtqueue *vq;
 	size_t i;
+
+	lockdep_assert_held(&vsock->dev.mutex);
+
+	for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
+		vq = &vsock->vqs[i];
+
+		mutex_lock(&vq->mutex);
+		vhost_vq_set_backend(vq, NULL);
+		mutex_unlock(&vq->mutex);
+	}
+}
+
+static int vhost_vsock_stop(struct vhost_vsock *vsock, bool check_owner)
+{
 	int ret = 0;
 
 	mutex_lock(&vsock->dev.mutex);
@@ -677,14 +692,7 @@ static int vhost_vsock_stop(struct vhost_vsock *vsock, bool check_owner)
 			goto err;
 	}
 
-	for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
-		struct vhost_virtqueue *vq = &vsock->vqs[i];
-
-		mutex_lock(&vq->mutex);
-		vhost_vq_set_backend(vq, NULL);
-		mutex_unlock(&vq->mutex);
-	}
-
+	vhost_vsock_drop_backends(vsock);
 err:
 	mutex_unlock(&vsock->dev.mutex);
 	return ret;
-- 
2.47.1


^ permalink raw reply related

* [PATCH v2 0/4] vhost/vsock: add support for VHOST_RESET_OWNER and CPR migration
From: Andrey Drobyshev @ 2026-06-22 17:58 UTC (permalink / raw)
  To: linux-kernel
  Cc: kvm, virtualization, netdev, sgarzare, mst, stefanha,
	dongli.zhang, maciej.szmigiero, bchaney, mark.kanda, ptikhomirov,
	den, andrey.drobyshev

v1 -> v2:

  * Patch 2 (suppress EHOSTUNREACH): replace 'cpr_paused' + backend check
    with a single 'started' latch;
  * Patch 3 (re-scan TX virtqueue): reword commit message;
  * Patch 4 (VHOST_RESET_OWNER):
      - fix a vhost_worker use-after-free / stuck VHOST_WORK_QUEUED stall
        against the lockless send path;
      - drop the no-op vsock_for_each_connected_socket() iteration;
  * Shuffle the patches, keep RESET_OWNER implementation last to preserve
    bisectability;
  * Reword the cover letter.

v1: https://lore.kernel.org/virtualization/20260612165718.433546-1-andrey.drobyshev@virtuozzo.com

Host<-->guest connections via AF_VSOCK sockets aren't supposed to
outlive VM migration, since VM is moving to another host.  However
there's a special case, which is QEMU live-update, or CPR
(checkpoint-restore) migration.  In this case, VM remains on the same
host, and we'd like such connections to persist.

For this to work, we need to be able to transfer device ownership from
source QEMU to dest QEMU.  Namely, source needs to reset ownership by
issuing VHOST_RESET_OWNER ioctl, and then target has to claim it by
calling VHOST_SET_OWNER.

Since VHOST_RESET_OWNER isn't yet implemented for vhost-vsock, let's add
such implementation.  Patch 1 is a preliminary helper.  Patches 2 and 3
fix the pre-existing issues which do manifest during CPR / RESET_OWNER.
Patch 4 is the ioctl's implementation itself - we keep it last to
preserve bisectability.

There's a complementary series for QEMU [0] adding support of vhost-vsock
devices during CPR migration.

I've tested this (patched QEMU + patched kernel) approximately as follows:

  * Run listener in the guest:
  socat -u VSOCK-LISTEN:9999 - >/tmp/recv.bin

  * Run data transfer from host to guest:
  socat -u FILE:/root/bigfile.bin VSOCK-CONNECT:CID:9999

  * Perform CPR migration during transfer (either cpr-exec or cpr-transfer)
  * Check that file hash sum matches

[0] https://lore.kernel.org/qemu-devel/20260619105514.128812-1-andrey.drobyshev@virtuozzo.com

Andrey Drobyshev (2):
  vhost/vsock: suppress EHOSTUNREACH fast-fail during CPR pause
  vhost/vsock: re-scan TX virtqueue on device start

Pavel Tikhomirov (2):
  vhost/vsock: split out vhost_vsock_drop_backends helper
  vhost/vsock: add VHOST_RESET_OWNER ioctl

 drivers/vhost/vsock.c | 96 ++++++++++++++++++++++++++++++++++---------
 1 file changed, 76 insertions(+), 20 deletions(-)

-- 
2.47.1

^ permalink raw reply

* Re: [PATCH net-next v3 1/2] net: dsa: realtek: rtl8365mb: add SGMII support for RTL8367S
From: Johan Alvarado @ 2026-06-22 17:53 UTC (permalink / raw)
  To: Luiz Angelo Daros de Luca
  Cc: netdev, linusw, alsi, andrew, olteanv, davem, edumazet, kuba,
	pabeni, linux, maxime.chevallier, namiltd, linux-kernel
In-Reply-To: <CAJq09z4cVBuRUMyCt8NopNePmjbcj=ycvq95gSXgh581kk4zDw@mail.gmail.com>

Hi Luiz,

Sorry for the slow reply — I wanted to test the changes on hardware
before answering rather than reply blind. I'm also adding the list
back to CC, since this is review of an on-list patch and the
discussion is useful for the archive; hope that's OK.

Thanks for the very thorough review. Almost everything is addressed in
v4, which I'll post once net-next reopens. Replies inline.

> You might want to omit any specific model as there are multiple
> possible supported models that have SGMII. Just use "only the RGMII
> and SGMII interface...". Determining which device supports what is a
> job for chip_info.

Done — the NOTE comment no longer names a model, it just lists the
implemented interfaces.

> > +#define   RTL8365MB_SDS_INDACS_CMD_INDEX_MASK  0x0007
>
> Isn't this MASK larger? I was expecting 0x003F.
>
> Please use GENMASK/BIT whenever possible. It makes it clearer when
> there are holes or overlaps in the reg.
> Once a register macro is added with a bunch of bits described, I think
> it is better to describe all bits we can, even if not in use in this
> driver. Realtek normally maps bits sequentially and, with GENMASK/BIT,
> it is visually easier to spot errors.

The CMD index field is gone in v4: I dropped the always-zero SerDes
index argument, so that mask was removed with it.

On GENMASK/BIT and documenting all bits: agreed in principle, but the
driver currently uses raw hex masks almost everywhere (~75 raw _MASK
defines vs ~22 BIT/GENMASK), so converting only the new SDS defines
would make the file more inconsistent rather than less. Options as I
see them: (1) convert just the new defines, (2) keep raw hex to match
the surrounding style, or (3) a separate file-wide cleanup patch on
top of this series. I'd lean towards 3, keeping this series focused on
the feature and doing the style modernization as its own patch, but
I'm happy to do 1 if you'd rather the new code lead by example. Same
question for documenting currently-unused bits. Which would you prefer?

> > +#define RTL8365MB_SDS_INDACS_ADR_REG           0x6601
>
> This reg is formed by two parts but, in this case, it might be
> pedantic to add the descriptions as well.
> PAGE_MASK    0x7E0
> REGAD_MASK    0x1F

Noted. Since the addresses are passed as whole values and the page/reg
split isn't used in the driver, I've left it as a single address for
now, but I can add the sub-field masks if you'd prefer them documented.

> > +#define   RTL8365MB_SDS_BMCR_DPRST_PHASE1      0x1401
> > +#define   RTL8365MB_SDS_BMCR_DPRST_PHASE2      0x1403
>
> I do not like magic numbers. You could do the BMCR_ANENABLE |
> BMCR_ISOLATE in the macro or code instead of just keeping it in a
> comment. It would give more semantics to the code.

Done — the phase values are now built from the standard bits:

  #define RTL8365MB_SDS_BMCR_DPRST_PHASE1 (BMCR_ANENABLE | BMCR_ISOLATE | 0x1)
  #define RTL8365MB_SDS_BMCR_DPRST_PHASE2 (BMCR_ANENABLE | BMCR_ISOLATE | 0x3)

> > +static const struct rtl8365mb_jam_tbl_entry rtl8365mb_sds_jam_sgmii[] = {
>
> I guess you got this from vendor's redData. However, that sequence is
> for the case when RTL8365MB_CHIP_OPTION_REG(0x13C1) == 0. In my tests,
> rtl8367s returns 1 for that reg, which would select the redDataSB
> variant in the vendor's code. Did you test both or check register
> 0x13C1 [...] HSGMII also has a similar test in vendor's code.

Good catch. I checked 0x13C1 on the MR80X (with the magic
unlock/relock via 0x13C0): it reads option = 1, so the committed
tables are already the SB/HB variants. For SGMII the only difference
between redData and redDataSB is reg 0x482 (0x21A2 for option 0 vs
0x2420 for option 1), and my table has 0x2420; the HSGMII table
matches redDataHB likewise. I captured the full vendor write sequence
on hardware by chainloading a patched U-Boot, so both tables are
confirmed against the live silicon.

v4 reads 0x13C1 at runtime and returns -EOPNOTSUPP for the option-0
variant rather than driving the SerDes with values I cannot verify on
available hardware.

> > +       if (extint->id != 1)
> > +               return -EOPNOTSUPP;
>
> The model RTL8370MB is also a member of RTL8367C [...] Can't you just
> check extint supported_interfaces? [...] This type of hardcoded
> assumption just makes the job harder.

In v4 this hardcoded check is gone. With the phylink_pcs conversion
(see below), the SerDes is gated through supported_interfaces in
get_caps() and mac_select_pcs(), so whether a port uses the SerDes is
driven by chip_info rather than a hardcoded id.

> > +       ret = regmap_update_bits(priv->map, RTL8365MB_BYPASS_LINE_RATE_REG,
> > +                                BIT(extint->id), 0);
>
> BYPASS_LINE_RATE is actually indexed by port number starting at 5
> [...] Describe [it] with a parametric macro, receiving the port number
> and returning the BIT(5-port) as mask [...] For RTL8367R [...] it uses
> bit 0 for int 1 and bit 1 for int 0 or 2.

Done — it's now a parametric macro:

  #define RTL8365MB_BYPASS_LINE_RATE_MASK(_port)  BIT((_port) - 5)

with a comment noting port 5 is the base and that other families (e.g.
the RTL8367R's (id + 1) % 2) index it differently, so this mapping
only holds for the RTL8367C-style parts the driver supports. One small
thing: your mail had BIT(5 - port), which inverts it (port 6 would be
BIT(-1)); I used BIT(port - 5) so port 5 maps to bit 0 — let me know if
you meant something different.

> > +       usleep_range(10, 50);
>
> An arbitrary wait is not ideal but Mieczyslaw already suggested a
> better solution.

Done — the usleep is gone. Writes are fire-and-forget and reads poll
the self-clearing BUSY bit with regmap_read_poll_timeout(), matching
the vendor's getAsicSdsReg. I instrumented it on the MR80X: the BUSY
bit is never even observed set (the access completes within the
register transaction over MDIO), so no sleep is needed.

> > +       ret = regmap_update_bits(
> > +               priv->map, RTL8365MB_DIGITAL_INTERFACE_SELECT_REG(extint->id),
> > +               [...]
>
> Sometimes it is just easier to use a temp variable instead of fighting
> with the 80-col limit.

Done, the mode value goes into a temporary now.

> The lack of test devices is holding me back from making further
> improvements. [...] I think that, with some limitations, the rtl8365mb
> driver [...] could support the full range from RTL8370/RTL8367 up to
> RTL8367D.

Makes sense, and I tried to keep v4 from baking in RTL8367S-specific
assumptions where I could (the SerDes gating and the bypass macro
above). I can only test on the MR80X (RTL8367S) myself, so I've kept
the scope to what I can verify, but I'm happy to keep the code friendly
to that wider range.

One more thing from the wider discussion: following Maxime's review I
converted the whole SerDes path to a phylink_pcs in v4, so the SerDes
handling now lives in pcs_config()/pcs_get_state()/pcs_link_up() rather
than the ext/sds split in mac_link_up/down you saw in v3. pcs_get_state()
now reads the real SerDes link status (reg 0x3d) instead of reporting
the forced value.

Thanks again — this review made the series substantially better.

Best regards,
Johan

^ permalink raw reply

* Re: [PATCH] net: stmmac: fix missed le32_to_cpu()
From: Maxime Chevallier @ 2026-06-22 17:51 UTC (permalink / raw)
  To: Ben Dooks, Andrew Lunn, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Maxime Coquelin, Alexandre Torgue,
	Russell King (Oracle), netdev, linux-stm32, linux-arm-kernel,
	linux-kernel
In-Reply-To: <20260622143707.497198-1-ben.dooks@codethink.co.uk>

Hi Ben,

On 6/22/26 16:37, Ben Dooks wrote:
> The print in ndesc_display_ring() sends the des2 and des3
> to the pr_info() without passing them through the relevant
> conversion to cpu order.
> 
> Fix the (prototype) sparse warnings by using le32_to_cpu():
> drivers/net/ethernet/stmicro/stmmac/norm_desc.c:258:17: warning: incorrect type in argument 6 (different base types)
> drivers/net/ethernet/stmicro/stmmac/norm_desc.c:258:17:    expected unsigned int
> drivers/net/ethernet/stmicro/stmmac/norm_desc.c:258:17:    got restricted __le32 [usertype] des2
> drivers/net/ethernet/stmicro/stmmac/norm_desc.c:258:17: warning: incorrect type in argument 7 (different base types)
> drivers/net/ethernet/stmicro/stmmac/norm_desc.c:258:17:    expected unsigned int
> drivers/net/ethernet/stmicro/stmmac/norm_desc.c:258:17:    got restricted __le32 [usertype] des3
> 
> Signed-off-by: Ben Dooks <ben.dooks@codethink.co.uk>

I agree on the principle, but this isn't a fix so this'll have to wait
until net-next re-opens :)

Thanks,

Maxime

> ---
>  drivers/net/ethernet/stmicro/stmmac/norm_desc.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/net/ethernet/stmicro/stmmac/norm_desc.c b/drivers/net/ethernet/stmicro/stmmac/norm_desc.c
> index c4b613564f87..74c9b7b1fe8f 100644
> --- a/drivers/net/ethernet/stmicro/stmmac/norm_desc.c
> +++ b/drivers/net/ethernet/stmicro/stmmac/norm_desc.c
> @@ -258,7 +258,7 @@ static void ndesc_display_ring(void *head, unsigned int size, bool rx,
>  		pr_info("%03d [%pad]: 0x%x 0x%x 0x%x 0x%x",
>  			i, &dma_addr,
>  			(unsigned int)x, (unsigned int)(x >> 32),
> -			p->des2, p->des3);
> +			le32_to_cpu(p->des2), le32_to_cpu(p->des3));
>  		p++;
>  	}
>  	pr_info("\n");


^ permalink raw reply

* RE: [REGRESSION 6.12.90 -> 6.12.94] vsock/virtio: large AF_VSOCK transfers reset under backpressure
From: Brien Oberstein @ 2026-06-22 17:48 UTC (permalink / raw)
  To: 'Stefano Garzarella'; +Cc: netdev, regressions, stable
In-Reply-To: <ajkmjgGdJp9Dj6em@sgarzare-redhat>

Hi Stefano,

Confirmed -- the 16 MB buffer fixes it: with socat owning the VSOCK-LISTEN
and SO_VM_SOCKETS_BUFFER_MAX_SIZE/SIZE at 16 MB, a 6.12.94 guest passed
21/21 large transfers (1.5 MB x12 through 8 MB); the same 1.5 MB payload
failed every time without it. So the per-socket workaround covers the
bridges whose listen I control, but not vsock services I can't
reconfigure, which stay broken on 6.12.94.

Agreed the old behaviour was buggy in its own right -- it was
over-allocating past the advertised buffer. The practical effect for me is
just that a config that worked on 6.12.90 no longer does on 6.12.94.

A question mainly for stable@: until the merging work lands, would an
interim be acceptable -- something that keeps ordinary small-packet
workloads under the limit without reopening the DoS? I don't have the
kernel-side expertise to judge what's safe there, but I'm glad to prepare
and test whatever interim you think is right, and to test the merging
patch when it's ready.

Thanks,
Brien

-----Original Message-----
From: Stefano Garzarella <sgarzare@redhat.com> 
Sent: Monday, June 22, 2026 8:22 AM
To: Brien Oberstein <brienpub@gmail.com>
Cc: netdev@vger.kernel.org; regressions@lists.linux.dev;
stable@vger.kernel.org
Subject: Re: [REGRESSION 6.12.90 -> 6.12.94] vsock/virtio: large AF_VSOCK
transfers reset under backpressure

On Mon, Jun 22, 2026 at 07:55:30AM -0400, Brien Oberstein wrote:
>Hi Stefano,
>
>Thanks, that matches what I'm seeing: large transfers reset mid-stream
>instead of the sender being throttled (reliable above ~1.5 MB, fine below
>~90 KB).
>
>The bind for me: it's not just this mail bridge -- I use AF_VSOCK for a few
>host/guest services, some of which open their own sockets, so the per-socket
>buffer workaround can't cover them all. That leaves pinning 6.12.90 (losing
>the DoS fix and further kernel updates) as the only blanket option.

Okay, but in that case did it work?

>
>A few quick questions:
>
>1. Is a -stable backport of the merging fix likely, and roughly when?

We don't have a fix yet.

>2. Could a smaller interim land in -stable sooner (e.g. more default
>   headroom) without reopening the DoS?

What we've merged so far is the best we can do for now, but anyone who 
wants to help improve the situation is welcome to submit patches.

>3. Will the fix guarantee backpressure for any packet size, or just widen
>   the margin?

It should fix STREAM sockets for any packet size.
SEQPACKET/DGRAM is a bit different since we need to keep boundaries, so 
it will come later if needed.

>
>Happy to test any patch

Thanks, I'll ask you to test.

>I have a solid reproducer and can turn it around
>in a day. I'll also file this as a tracked regression so it's not lost.

Unfortunately, it's always been partially broken, using more memory than 
specified, so I don't know if this is actually a full regression, but I 
understand.

Thanks,
Stefano

^ permalink raw reply

* Re: [PATCH iproute2-next] "ip help" wrong output, exit code.
From: Dmitri Seletski @ 2026-06-22 17:47 UTC (permalink / raw)
  To: David Laight, Stephen Hemminger; +Cc: netdev
In-Reply-To: <20260622174454.576b3580@pumpkin>

Hello David,


Based on change introduced:

Two samples of "ip help" with demonstration of exit code and standard 
output are below.

This is in line with what I expect.


dimkosPC~/compiled/iproute2-next #if ./ip/ip help a >>/dev/null  ; then 
echo help triggered  ; else echo error code triggered  ;fi  #this 
redirects standard output  to /dev/null, so text missing is not error,
but standard text
help triggered

dimkosPC~/compiled/iproute2-next #if ./ip/ip help   ; then echo help 
triggered  ; else echo error code triggered  ;fi
Usage: ip [ OPTIONS ] OBJECT { COMMAND | help }
       ip [ -force ] -batch filename
where  OBJECT := { address | addrlabel | fou | help | ila | ioam | l2tp 
| link |
                   macsec | maddress | monitor | mptcp | mroute | mrule |
                   neighbor | neighbour | netconf | netns | nexthop | 
ntable |
                   ntbl | route | rule | sr | stats | tap | tcpmetrics |
                   token | tunnel | tuntap | vrf | xfrm }
       OPTIONS := { -V[ersion] | -s[tatistics] | -d[etails] | -r[esolve] |
                    -h[uman-readable] | -iec | -j[son] | -p[retty] |
                    -f[amily] { inet | inet6 | mpls | bridge | link } |
                    -4 | -6 | -M | -B | -0 |
                    -l[oops] { maximum-addr-flush-attempts } | -echo | 
-br[ief] |
                    -o[neline] | -t[imestamp] | -ts[hort] | -b[atch] 
[filename] |
                    -rc[vbuf] [size] | -n[etns] name | -N[umeric] | -a[ll] |
                    -c[olor]}
help triggered

Two samples of command that is broken on purpose.

dimkosPC~/compiled/iproute2-next #if ./ip/ip idontexist   ; then echo 
help triggered  ; else echo error code triggered  ;fi
Object "idontexist" is unknown, try "ip help".
error code triggered

dimkosPC~/compiled/iproute2-next #if ./ip/ip idontexist  >>/dev/null  ; 
then echo help triggered  ; else echo error code triggered  ;fi  #this 
redirects standard output  to /dev/null, so text missing is not error, 
but standard text
Object "idontexist" is unknown, try "ip help".
error code triggered

This works as expected as per my understanding.


Not everything is fixed, but a chunk of things fixed is better than none of it.

for example:

if ip  add help    ; then echo help triggered  ; else echo error code 
triggered  ;fi  #this redirects standard output  to /dev/null, so text 
missing is not error, but standard text
Usage: ip address {add|change|replace} IFADDR dev IFNAME [ LIFETIME ]
                                                      [ CONFFLAG-LIST ]
       ip address del IFADDR dev IFNAME [mngtmpaddr]
       ip address {save|flush} [ dev IFNAME ] [ scope SCOPE-ID ] [ to 
PREFIX ]
                            [ FLAG-LIST ] [ label LABEL ] [ { up | down } ]
       ip address [ show [ dev IFNAME ] [ scope SCOPE-ID ] [ master DEVICE ]
                         [ nomaster ]
                         [ type TYPE ] [ to PREFIX ] [ FLAG-LIST ]
                         [ label LABEL ] [ { up | down } ] [ vrf NAME ]
                         [ proto ADDRPROTO ] ]
       ip address {showdump|restore}
IFADDR := PREFIX | ADDR peer PREFIX
          [ broadcast ADDR ] [ anycast ADDR ]
          [ label IFNAME ] [ scope SCOPE-ID ] [ metric METRIC ]
          [ proto ADDRPROTO ]
SCOPE-ID := [ host | link | global | NUMBER ]
FLAG-LIST := [ FLAG-LIST ] FLAG
FLAG  := [ permanent | dynamic | secondary | primary |
           [-]tentative | [-]deprecated | [-]dadfailed | temporary |
           CONFFLAG-LIST ]
CONFFLAG-LIST := [ CONFFLAG-LIST ] CONFFLAG
CONFFLAG  := [ home | nodad | mngtmpaddr | noprefixroute | autojoin ]
LIFETIME := [ valid_lft LFT ] [ preferred_lft LFT ]
LFT := forever | SECONDS
ADDRPROTO := [ NAME | NUMBER ]
TYPE := { amt | bareudp | bond | bond_slave | bridge | bridge_slave |
          dsa | dummy | erspan | geneve | gre | gretap | gtp | hsr |
          ifb | ip6erspan | ip6gre | ip6gretap | ip6tnl |
          ipip | ipoib | ipvlan | ipvtap |
          macsec | macvlan | macvtap | netdevsim |
          netkit | nlmon | pfcp | rmnet | sit | team | team_slave |
          vcan | veth | vlan | vrf | vti | vxcan | vxlan | wwan |
          xfrm | virt_wifi }
error code triggered

This is still problematic.


But so far code leaves "ip help" command/argument in better shape than 
it found it in.


I may try to improve things more, but let's submit what we already have
"better", please.

Kind Regards

Dmitri Seletski


On 6/22/26 17:44, David Laight wrote:
> On Mon, 22 Jun 2026 07:57:00 -0700
> Stephen Hemminger <stephen@networkplumber.org> wrote:
>
>> On Sun, 21 Jun 2026 22:48:59 +0100
>> Dmitri Seletski <drjoms@gmail.com> wrote:
>>
>>>  From 0805e07105cd15c5b94271a4706e50e3c65dbde5 Mon Sep 17 00:00:00 2001
>>> From: Dmitri Seletski <drjoms@gmail.com>
>>> Date: Sun, 21 Jun 2026 22:12:43 +0100
>>> Subject: [PATCH iproute2-next]  "ip help" wrong output, exit code.
>>>
>>> Changed output of "ip help" from standard error to standard output. And
>>> Exit is now 0 instead of -1. "ip help|grep bridge" - now gives bridge
>>> syntax instead of flooding user with everything from "ip help".
>>> ---
>>> ip/ip.c | 4 ++--
>>> 1 file changed, 2 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/ip/ip.c b/ip/ip.c
>>> index e4b71bde..4627b61c 100644
>>> --- a/ip/ip.c
>>> +++ b/ip/ip.c
>>> @@ -56,7 +56,7 @@ static void usage(void) __attribute__((noreturn));
>>>
>>> static void usage(void)
>>> {
>>> -fprintf(stderr,
>>> +fprintf(stdout,
>>> "Usage: ip [ OPTIONS ] OBJECT { COMMAND | help }\n"
>>> "       ip [ -force ] -batch filename\n"
>>> "where  OBJECT := { address | addrlabel | fou | help | ila | ioam | l2tp
>>> | link |\n"
>>> @@ -72,7 +72,7 @@ static void usage(void)
>>> "                    -o[neline] | -t[imestamp] | -ts[hort] | -b[atch]
>>> [filename] |\n"
>>> "                    -rc[vbuf] [size] | -n[etns] name | -N[umeric] |
>>> -a[ll] |\n"
>>> "                    -c[olor]}\n");
>>> -exit(-1);
>>> +exit(0);
>>> }
>> Your mailer damages white space.
>>
> The output also needs to depend on whether there is a 'usage' error or
> if 'help' is requested.
> The code is correct for the former - except it should do exit(1).
>
> 	David
>
>

^ permalink raw reply

* Re: [PATCH 05/23] powerpc/powermac: fix OF node refcount
From: Bartosz Golaszewski @ 2026-06-22 17:43 UTC (permalink / raw)
  To: Madhavan Srinivasan
  Cc: brgl, linux-kernel, netdev, linux-arm-msm, linux-sound,
	driver-core, devicetree, linuxppc-dev, linux-i2c, iommu, linux-pm,
	imx, linux-arm-kernel, intel-xe, dri-devel, linux-usb, linux-mips,
	platform-driver-x86, Bartosz Golaszewski, stable, Lee Jones,
	Mark Brown, Thierry Reding, Sebastian Hesselbarth, Andrew Lunn,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Srinivas Kandagatla, Greg Kroah-Hartman, Vinod Koul,
	Rafael J. Wysocki, Danilo Krummrich, Rob Herring, Saravana Kannan,
	Michael Ellerman, Nicholas Piggin, Christophe Leroy (CS GROUP),
	Andi Shyti, Andy Shevchenko, Joerg Roedel, Will Deacon,
	Robin Murphy, Doug Berger, Florian Fainelli,
	Broadcom internal kernel review list, Ulf Hansson, Frank Li,
	Sascha Hauer, Pengutronix Kernel Team, Fabio Estevam,
	Matthew Brost, Thomas Hellström, Rodrigo Vivi, David Airlie,
	Simona Vetter, Peter Chen, Paul Cercueil, Bin Liu, Philipp Zabel,
	Maximilian Luz, Hans de Goede, Ilpo Järvinen,
	Krzysztof Kozlowski, Benjamin Herrenschmidt
In-Reply-To: <20260521-pdev-fwnode-ref-v1-5-88c324a1b8d2@oss.qualcomm.com>

On Thu, 21 May 2026 10:36:28 +0200, Bartosz Golaszewski
<bartosz.golaszewski@oss.qualcomm.com> said:
> Platform devices created with platform_device_alloc() call
> platform_device_release() when the last reference to the device's
> kobject is dropped. This function calls of_node_put() unconditionally.
> This works fine for devices created with platform_device_register_full()
> but users of the split approach (platform_device_alloc() +
> platform_device_add()) must bump the reference of the of_node they
> assign manually. Add the missing call to of_node_get().
>
> Cc: stable@vger.kernel.org
> Fixes: 81e5d8646ff6 ("i2c/powermac: Register i2c devices from device-tree")
> Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
> ---
>  arch/powerpc/platforms/powermac/low_i2c.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/platforms/powermac/low_i2c.c b/arch/powerpc/platforms/powermac/low_i2c.c
> index da72a30ab8657e6dc7e6f3437af612155783d8f9..973f58771d9636605ed5d3e91b45008543b584d3 100644
> --- a/arch/powerpc/platforms/powermac/low_i2c.c
> +++ b/arch/powerpc/platforms/powermac/low_i2c.c
> @@ -1471,7 +1471,7 @@ static int __init pmac_i2c_create_platform_devices(void)
>  		if (bus->platform_dev == NULL)
>  			return -ENOMEM;
>  		bus->platform_dev->dev.platform_data = bus;
> -		bus->platform_dev->dev.of_node = bus->busnode;
> +		bus->platform_dev->dev.of_node = of_node_get(bus->busnode);
>  		platform_device_add(bus->platform_dev);
>  	}
>
>
> --
> 2.47.3
>
>

Madhavan, can you please pick this up and send it upstream as a fix?
Not having to carry it with the rest of the series will make things easier
for the next release.

Thanks,
Bartosz

^ permalink raw reply

* Re: [PATCH net 01/14] netfilter: flowtable: fix offloaded ct timeout never being extended
From: patchwork-bot+netdevbpf @ 2026-06-22 17:40 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: netfilter-devel, davem, netdev, kuba, pabeni, edumazet, fw, horms
In-Reply-To: <20260620222738.112506-2-pablo@netfilter.org>

Hello:

This series was applied to netdev/net.git (main)
by Pablo Neira Ayuso <pablo@netfilter.org>:

On Sun, 21 Jun 2026 00:27:25 +0200 you wrote:
> From: Adrian Bente <adibente@gmail.com>
> 
> OpenWrt has recently migrated many platforms to kernel 6.18. On the
> MediaTek platform, which supports hardware network offloading, WiFi
> connections accelerated via the WED path were observed to drop after
> roughly 300 seconds.
> 
> [...]

Here is the summary with links:
  - [net,01/14] netfilter: flowtable: fix offloaded ct timeout never being extended
    https://git.kernel.org/netdev/net/c/53b3e60edb67
  - [net,02/14] netfilter: nf_queue: pin bridge device while NFQUEUE holds fake dst
    https://git.kernel.org/netdev/net/c/c9c9b37f8c55
  - [net,03/14] netfilter: xt_cluster: reject template conntracks in hash match
    https://git.kernel.org/netdev/net/c/5feba91006ec
  - [net,04/14] netfilter: flowtable: fix and simplify IP6IP6 tunnel handling
    https://git.kernel.org/netdev/net/c/f4c2d8668d85
  - [net,05/14] netfilter: ipset: Don't use test_bit() in lockless RCU readers in hash types
    https://git.kernel.org/netdev/net/c/e4b4984e28c1
  - [net,06/14] netfilter: ipset: Don't use test_bit() in lockless RCU readers in bitmap types
    https://git.kernel.org/netdev/net/c/1171192ac9af
  - [net,07/14] netfilter: ipset: fix order of kfree_rcu() and rcu_assign_pointer()
    https://git.kernel.org/netdev/net/c/3ca9982a8882
  - [net,08/14] netfilter: ipset: make sure gc is properly stopped
    https://git.kernel.org/netdev/net/c/4a597a87e2e2
  - [net,09/14] netfilter: nft_payload: reject offsets exceeding 65535 bytes
    https://git.kernel.org/netdev/net/c/213be32f46a2
  - [net,10/14] netfilter: nft_meta_bridge: add validate callback for get operations
    https://git.kernel.org/netdev/net/c/bff1c8b49a9c
  - [net,11/14] netfilter: nft_flow_offload: zero device address for non-ether case
    https://git.kernel.org/netdev/net/c/e409c23c2d06
  - [net,12/14] netfilter: nf_reject: skip iphdr options when looking for icmp header
    https://git.kernel.org/netdev/net/c/af8d6ae09c0a
  - [net,13/14] netfilter: nf_conntrack_expect: use conntrack GC to reap expectations
    https://git.kernel.org/netdev/net/c/b8b09dc2bf35
  - [net,14/14] netfilter: nft_meta_bridge: fix NFT_META_BRI_IIFPVID stack leak
    https://git.kernel.org/netdev/net/c/27dd2997746d

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply

* Re: [PATCH iproute2] ip: return correct status from help command
From: Rose @ 2026-06-22 17:18 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev
In-Reply-To: <20260622081637.172a6bb8@phoenix.local>

Hi Stephen,

Thanks for the feedback. I'd love to finish this up. I can apply the
same fix to the other commands in iproute2 sometime after work, around
17:30 UTC today.

Thanks again,
Rose

On Mon, Jun 22, 2026 at 10:16 AM Stephen Hemminger
<stephen@networkplumber.org> wrote:
>
> On Sun, 21 Jun 2026 18:03:11 +0000
> Rose Wright <rosesophiewright@gmail.com> wrote:
>
> > Currently, "ip help" or "ip -help" always returns an error code because usage() is used as a fall through on "ip" and defaults to stderr with -1.
> >
> > This is a minor bug that breaks "ip help | grep" and other scripts that rely on standard exit codes. The fix is to pass the status code as a parameter into usage() and change stderr to stdout when needed.
> >
> > Signed-off-by: Rose Wright <rosesophiewright@gmail.com>
> > ---
>
> This is the closest of the three submissions, but there are way more commands in iproute2
> than just ip. Need to address all the commands. Looks like perfect trivial job for AI
> coding tools. I am looking into it now.
>

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox