Netdev List
 help / color / mirror / Atom feed
* [PATCH net 04/14] tcp: annotate data-races around tp->snd_ssthresh
From: Eric Dumazet @ 2026-04-16 20:03 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, Neal Cardwell, Kuniyuki Iwashima, netdev,
	eric.dumazet, Eric Dumazet
In-Reply-To: <20260416200319.3608680-1-edumazet@google.com>

tcp_get_timestamping_opt_stats() intentionally runs lockless, we must
add READ_ONCE() and WRITE_ONCE() annotations to keep KCSAN happy.

Fixes: 7156d194a077 ("tcp: add snd_ssthresh stat in SCM_TIMESTAMPING_OPT_STATS")
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/core/filter.c       | 2 +-
 net/ipv4/tcp.c          | 4 ++--
 net/ipv4/tcp_bbr.c      | 6 +++---
 net/ipv4/tcp_bic.c      | 2 +-
 net/ipv4/tcp_cdg.c      | 4 ++--
 net/ipv4/tcp_cubic.c    | 6 +++---
 net/ipv4/tcp_dctcp.c    | 2 +-
 net/ipv4/tcp_input.c    | 8 ++++----
 net/ipv4/tcp_metrics.c  | 4 ++--
 net/ipv4/tcp_nv.c       | 4 ++--
 net/ipv4/tcp_output.c   | 4 ++--
 net/ipv4/tcp_vegas.c    | 9 +++++----
 net/ipv4/tcp_westwood.c | 4 ++--
 net/ipv4/tcp_yeah.c     | 3 ++-
 14 files changed, 32 insertions(+), 30 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index fcfcb72663ca3798bb33d33275d18fc73071c8d4..3b5609fb96de5e92880c6170a7fcf54da4612818 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5396,7 +5396,7 @@ static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname,
 		if (val <= 0)
 			return -EINVAL;
 		tp->snd_cwnd_clamp = val;
-		tp->snd_ssthresh = val;
+		WRITE_ONCE(tp->snd_ssthresh, val);
 		break;
 	case TCP_BPF_DELACK_MAX:
 		timeout = usecs_to_jiffies(val);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 24ba80d244b1fb69102b587b568cebe7b78dff9d..802a9ea05211f8eab30b6f937a459a270476974d 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3425,7 +3425,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	icsk->icsk_rto = TCP_TIMEOUT_INIT;
 	WRITE_ONCE(icsk->icsk_rto_min, TCP_RTO_MIN);
 	WRITE_ONCE(icsk->icsk_delack_max, TCP_DELACK_MAX);
-	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+	WRITE_ONCE(tp->snd_ssthresh, TCP_INFINITE_SSTHRESH);
 	tcp_snd_cwnd_set(tp, TCP_INIT_CWND);
 	tp->snd_cwnd_cnt = 0;
 	tp->is_cwnd_limited = 0;
@@ -4452,7 +4452,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
 	nla_put_u8(stats, TCP_NLA_RECUR_RETRANS,
 		   READ_ONCE(inet_csk(sk)->icsk_retransmits));
 	nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, data_race(!!tp->rate_app_limited));
-	nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh);
+	nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, READ_ONCE(tp->snd_ssthresh));
 	nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered);
 	nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce);
 
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 1ddc20a399b07054f8175b5f6459f8ae6dbf34bb..aec7805b1d37634783491649da1fa01602061781 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -897,8 +897,8 @@ static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
 
 	if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
 		bbr->mode = BBR_DRAIN;	/* drain queue we created */
-		tcp_sk(sk)->snd_ssthresh =
-				bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT);
+		WRITE_ONCE(tcp_sk(sk)->snd_ssthresh,
+			   bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT));
 	}	/* fall through to check if in-flight is already small: */
 	if (bbr->mode == BBR_DRAIN &&
 	    bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <=
@@ -1043,7 +1043,7 @@ __bpf_kfunc static void bbr_init(struct sock *sk)
 	struct bbr *bbr = inet_csk_ca(sk);
 
 	bbr->prior_cwnd = 0;
-	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+	WRITE_ONCE(tp->snd_ssthresh, TCP_INFINITE_SSTHRESH);
 	bbr->rtt_cnt = 0;
 	bbr->next_rtt_delivered = tp->delivered;
 	bbr->prev_ca_state = TCP_CA_Open;
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 58358bf92e1b8ac43c07789dac9f6031fa2e03dd..65444ff142413aa7ea6151f1cb3cef7d14f253eb 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -74,7 +74,7 @@ static void bictcp_init(struct sock *sk)
 	bictcp_reset(ca);
 
 	if (initial_ssthresh)
-		tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
+		WRITE_ONCE(tcp_sk(sk)->snd_ssthresh, initial_ssthresh);
 }
 
 /*
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
index ceabfd690a296739323a795da5e1fc7453229b3f..0812c390aee5643ee39209f47e8ae901045dc498 100644
--- a/net/ipv4/tcp_cdg.c
+++ b/net/ipv4/tcp_cdg.c
@@ -162,7 +162,7 @@ static void tcp_cdg_hystart_update(struct sock *sk)
 				NET_ADD_STATS(sock_net(sk),
 					      LINUX_MIB_TCPHYSTARTTRAINCWND,
 					      tcp_snd_cwnd(tp));
-				tp->snd_ssthresh = tcp_snd_cwnd(tp);
+				WRITE_ONCE(tp->snd_ssthresh, tcp_snd_cwnd(tp));
 				return;
 			}
 		}
@@ -181,7 +181,7 @@ static void tcp_cdg_hystart_update(struct sock *sk)
 				NET_ADD_STATS(sock_net(sk),
 					      LINUX_MIB_TCPHYSTARTDELAYCWND,
 					      tcp_snd_cwnd(tp));
-				tp->snd_ssthresh = tcp_snd_cwnd(tp);
+				WRITE_ONCE(tp->snd_ssthresh, tcp_snd_cwnd(tp));
 			}
 		}
 	}
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index ab78b5ae8d0e3d13a39bd1adf1e105b84f806b63..119bf8cbb007c22993367f0f1452a2db4ed9d92b 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -136,7 +136,7 @@ __bpf_kfunc static void cubictcp_init(struct sock *sk)
 		bictcp_hystart_reset(sk);
 
 	if (!hystart && initial_ssthresh)
-		tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
+		WRITE_ONCE(tcp_sk(sk)->snd_ssthresh, initial_ssthresh);
 }
 
 __bpf_kfunc static void cubictcp_cwnd_event_tx_start(struct sock *sk)
@@ -420,7 +420,7 @@ static void hystart_update(struct sock *sk, u32 delay)
 				NET_ADD_STATS(sock_net(sk),
 					      LINUX_MIB_TCPHYSTARTTRAINCWND,
 					      tcp_snd_cwnd(tp));
-				tp->snd_ssthresh = tcp_snd_cwnd(tp);
+				WRITE_ONCE(tp->snd_ssthresh, tcp_snd_cwnd(tp));
 			}
 		}
 	}
@@ -440,7 +440,7 @@ static void hystart_update(struct sock *sk, u32 delay)
 				NET_ADD_STATS(sock_net(sk),
 					      LINUX_MIB_TCPHYSTARTDELAYCWND,
 					      tcp_snd_cwnd(tp));
-				tp->snd_ssthresh = tcp_snd_cwnd(tp);
+				WRITE_ONCE(tp->snd_ssthresh, tcp_snd_cwnd(tp));
 			}
 		}
 	}
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index 96c99999e09dde9de9c337e4d6c692f517467c7b..274e628e7cf8621fd955528c8353001e773efb21 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -177,7 +177,7 @@ static void dctcp_react_to_loss(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	ca->loss_cwnd = tcp_snd_cwnd(tp);
-	tp->snd_ssthresh = max(tcp_snd_cwnd(tp) >> 1U, 2U);
+	WRITE_ONCE(tp->snd_ssthresh, max(tcp_snd_cwnd(tp) >> 1U, 2U));
 }
 
 __bpf_kfunc static void dctcp_state(struct sock *sk, u8 new_state)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 6bb6bf049a35ac91fd53e3e66691f64fc4c93648..c6361447535f0a2b72eccb6fede4618471e38ae5 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2567,7 +2567,7 @@ void tcp_enter_loss(struct sock *sk)
 	    (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
 		tp->prior_ssthresh = tcp_current_ssthresh(sk);
 		tp->prior_cwnd = tcp_snd_cwnd(tp);
-		tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+		WRITE_ONCE(tp->snd_ssthresh, icsk->icsk_ca_ops->ssthresh(sk));
 		tcp_ca_event(sk, CA_EVENT_LOSS);
 		tcp_init_undo(tp);
 	}
@@ -2860,7 +2860,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
 		tcp_snd_cwnd_set(tp, icsk->icsk_ca_ops->undo_cwnd(sk));
 
 		if (tp->prior_ssthresh > tp->snd_ssthresh) {
-			tp->snd_ssthresh = tp->prior_ssthresh;
+			WRITE_ONCE(tp->snd_ssthresh, tp->prior_ssthresh);
 			tcp_ecn_withdraw_cwr(tp);
 		}
 	}
@@ -2978,7 +2978,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk)
 	tp->prior_cwnd = tcp_snd_cwnd(tp);
 	tp->prr_delivered = 0;
 	tp->prr_out = 0;
-	tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
+	WRITE_ONCE(tp->snd_ssthresh, inet_csk(sk)->icsk_ca_ops->ssthresh(sk));
 	tcp_ecn_queue_cwr(tp);
 }
 
@@ -3120,7 +3120,7 @@ static void tcp_non_congestion_loss_retransmit(struct sock *sk)
 
 	if (icsk->icsk_ca_state != TCP_CA_Loss) {
 		tp->high_seq = tp->snd_nxt;
-		tp->snd_ssthresh = tcp_current_ssthresh(sk);
+		WRITE_ONCE(tp->snd_ssthresh, tcp_current_ssthresh(sk));
 		tp->prior_ssthresh = 0;
 		tp->undo_marker = 0;
 		tcp_set_ca_state(sk, TCP_CA_Loss);
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 7a9d6d9006f651e91054d3369b47758a6c35253b..dc0c081fc1f33f735a38aaae8a7b4ab3d633b148 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -490,9 +490,9 @@ void tcp_init_metrics(struct sock *sk)
 	val = READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) ?
 	      0 : tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
 	if (val) {
-		tp->snd_ssthresh = val;
+		WRITE_ONCE(tp->snd_ssthresh, val);
 		if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
-			tp->snd_ssthresh = tp->snd_cwnd_clamp;
+			WRITE_ONCE(tp->snd_ssthresh, tp->snd_cwnd_clamp);
 	}
 	val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
 	if (val && tp->reordering != val)
diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c
index a60662f4bdf92cba1d92a3eedd7c607d1537d7f2..f345897a68dfcfe0f620ba50ff88f08b1c23e43f 100644
--- a/net/ipv4/tcp_nv.c
+++ b/net/ipv4/tcp_nv.c
@@ -396,8 +396,8 @@ static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample)
 
 			/* We have enough data to determine we are congested */
 			ca->nv_allow_cwnd_growth = 0;
-			tp->snd_ssthresh =
-				(nv_ssthresh_factor * max_win) >> 3;
+			WRITE_ONCE(tp->snd_ssthresh,
+				   (nv_ssthresh_factor * max_win) >> 3);
 			if (tcp_snd_cwnd(tp) - max_win > 2) {
 				/* gap > 2, we do exponential cwnd decrease */
 				int dec;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index d8e8bba2d03a3be5e7a9ebac16e39f4a29ae6037..2663505a0dd7a0eae69f6a91250b4a0b6f357f7d 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -171,7 +171,7 @@ void tcp_cwnd_restart(struct sock *sk, s32 delta)
 
 	tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
 
-	tp->snd_ssthresh = tcp_current_ssthresh(sk);
+	WRITE_ONCE(tp->snd_ssthresh, tcp_current_ssthresh(sk));
 	restart_cwnd = min(restart_cwnd, cwnd);
 
 	while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
@@ -2143,7 +2143,7 @@ static void tcp_cwnd_application_limited(struct sock *sk)
 		u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
 		u32 win_used = max(tp->snd_cwnd_used, init_win);
 		if (win_used < tcp_snd_cwnd(tp)) {
-			tp->snd_ssthresh = tcp_current_ssthresh(sk);
+			WRITE_ONCE(tp->snd_ssthresh, tcp_current_ssthresh(sk));
 			tcp_snd_cwnd_set(tp, (tcp_snd_cwnd(tp) + win_used) >> 1);
 		}
 		tp->snd_cwnd_used = 0;
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 950a66966059e89fc108a31c106805a0f76fc6f0..574453af6bc03c95e7dee69bff7dde2b63fe65c4 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -245,7 +245,8 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 				 */
 				tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp),
 							 (u32)target_cwnd + 1));
-				tp->snd_ssthresh = tcp_vegas_ssthresh(tp);
+				WRITE_ONCE(tp->snd_ssthresh,
+					   tcp_vegas_ssthresh(tp));
 
 			} else if (tcp_in_slow_start(tp)) {
 				/* Slow start.  */
@@ -261,8 +262,8 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 					 * we slow down.
 					 */
 					tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - 1);
-					tp->snd_ssthresh
-						= tcp_vegas_ssthresh(tp);
+					WRITE_ONCE(tp->snd_ssthresh,
+						   tcp_vegas_ssthresh(tp));
 				} else if (diff < alpha) {
 					/* We don't have enough extra packets
 					 * in the network, so speed up.
@@ -280,7 +281,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 			else if (tcp_snd_cwnd(tp) > tp->snd_cwnd_clamp)
 				tcp_snd_cwnd_set(tp, tp->snd_cwnd_clamp);
 
-			tp->snd_ssthresh = tcp_current_ssthresh(sk);
+			WRITE_ONCE(tp->snd_ssthresh, tcp_current_ssthresh(sk));
 		}
 
 		/* Wipe the slate clean for the next RTT. */
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index c6e97141eef2591c5ab50f4058a5377e0855313f..b5a42adfd6ca1fd93a91b99d7aa16bb4e64a9b3e 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -244,11 +244,11 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
 
 	switch (event) {
 	case CA_EVENT_COMPLETE_CWR:
-		tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
+		WRITE_ONCE(tp->snd_ssthresh, tcp_westwood_bw_rttmin(sk));
 		tcp_snd_cwnd_set(tp, tp->snd_ssthresh);
 		break;
 	case CA_EVENT_LOSS:
-		tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
+		WRITE_ONCE(tp->snd_ssthresh, tcp_westwood_bw_rttmin(sk));
 		/* Update RTT_min when next ack arrives */
 		w->reset_rtt_min = 1;
 		break;
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index b22b3dccd05efddfe11203578950eddecee14887..9e581154f18f11832fa832544217fc9072b4f452 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -147,7 +147,8 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 					tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp),
 								 yeah->reno_count));
 
-					tp->snd_ssthresh = tcp_snd_cwnd(tp);
+					WRITE_ONCE(tp->snd_ssthresh,
+						   tcp_snd_cwnd(tp));
 				}
 
 				if (yeah->reno_count <= 2)
-- 
2.54.0.rc1.513.gad8abe7a5a-goog


^ permalink raw reply related

* [PATCH net 03/14] tcp: add data-races annotations around tp->reordering, tp->snd_cwnd
From: Eric Dumazet @ 2026-04-16 20:03 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, Neal Cardwell, Kuniyuki Iwashima, netdev,
	eric.dumazet, Eric Dumazet
In-Reply-To: <20260416200319.3608680-1-edumazet@google.com>

tcp_get_timestamping_opt_stats() intentionally runs lockless, we must
add READ_ONCE(), WRITE_ONCE() data_race() annotations to keep KCSAN happy.

Fixes: bb7c19f96012 ("tcp: add related fields into SCM_TIMESTAMPING_OPT_STATS")
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/net/tcp.h      |  2 +-
 net/ipv4/tcp.c         |  8 ++++----
 net/ipv4/tcp_input.c   | 14 ++++++++------
 net/ipv4/tcp_metrics.c |  2 +-
 4 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 674af493882c802ebe03e0cac6e40b7c704aa0de..ecbadcb3a7446cb18c245e670ba49ff574dfaff7 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1513,7 +1513,7 @@ static inline u32 tcp_snd_cwnd(const struct tcp_sock *tp)
 static inline void tcp_snd_cwnd_set(struct tcp_sock *tp, u32 val)
 {
 	WARN_ON_ONCE((int)val <= 0);
-	tp->snd_cwnd = val;
+	WRITE_ONCE(tp->snd_cwnd, val);
 }
 
 static inline bool tcp_in_slow_start(const struct tcp_sock *tp)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e39e0734d958f39aa83a33f5c608ce3b94232fb1..24ba80d244b1fb69102b587b568cebe7b78dff9d 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4445,13 +4445,13 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
 	rate64 = tcp_compute_delivery_rate(tp);
 	nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD);
 
-	nla_put_u32(stats, TCP_NLA_SND_CWND, tcp_snd_cwnd(tp));
-	nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering);
-	nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp));
+	nla_put_u32(stats, TCP_NLA_SND_CWND, READ_ONCE(tp->snd_cwnd));
+	nla_put_u32(stats, TCP_NLA_REORDERING, READ_ONCE(tp->reordering));
+	nla_put_u32(stats, TCP_NLA_MIN_RTT, data_race(tcp_min_rtt(tp)));
 
 	nla_put_u8(stats, TCP_NLA_RECUR_RETRANS,
 		   READ_ONCE(inet_csk(sk)->icsk_retransmits));
-	nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
+	nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, data_race(!!tp->rate_app_limited));
 	nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh);
 	nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered);
 	nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 021f745747c59d8b9e200c5954af7807a4d08866..6bb6bf049a35ac91fd53e3e66691f64fc4c93648 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1293,8 +1293,9 @@ static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq,
 			 tp->sacked_out,
 			 tp->undo_marker ? tp->undo_retrans : 0);
 #endif
-		tp->reordering = min_t(u32, (metric + mss - 1) / mss,
-				       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering));
+		WRITE_ONCE(tp->reordering,
+			   min_t(u32, (metric + mss - 1) / mss,
+				 READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering)));
 	}
 
 	/* This exciting event is worth to be remembered. 8) */
@@ -2439,8 +2440,9 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend)
 	if (!tcp_limit_reno_sacked(tp))
 		return;
 
-	tp->reordering = min_t(u32, tp->packets_out + addend,
-			       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering));
+	WRITE_ONCE(tp->reordering,
+		   min_t(u32, tp->packets_out + addend,
+			 READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering)));
 	tp->reord_seen++;
 	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER);
 }
@@ -2579,8 +2581,8 @@ void tcp_enter_loss(struct sock *sk)
 	reordering = READ_ONCE(net->ipv4.sysctl_tcp_reordering);
 	if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
 	    tp->sacked_out >= reordering)
-		tp->reordering = min_t(unsigned int, tp->reordering,
-				       reordering);
+		WRITE_ONCE(tp->reordering,
+			   min_t(unsigned int, tp->reordering, reordering));
 
 	tcp_set_ca_state(sk, TCP_CA_Loss);
 	tp->high_seq = tp->snd_nxt;
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 06b1d5d3b6df7b8fa3fc631b8662160c8729a9df..7a9d6d9006f651e91054d3369b47758a6c35253b 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -496,7 +496,7 @@ void tcp_init_metrics(struct sock *sk)
 	}
 	val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
 	if (val && tp->reordering != val)
-		tp->reordering = val;
+		WRITE_ONCE(tp->reordering, val);
 
 	crtt = tcp_metric_get(tm, TCP_METRIC_RTT);
 	rcu_read_unlock();
-- 
2.54.0.rc1.513.gad8abe7a5a-goog


^ permalink raw reply related

* [PATCH net 05/14] tcp: annotate data-races around tp->delivered and tp->delivered_ce
From: Eric Dumazet @ 2026-04-16 20:03 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, Neal Cardwell, Kuniyuki Iwashima, netdev,
	eric.dumazet, Eric Dumazet
In-Reply-To: <20260416200319.3608680-1-edumazet@google.com>

tcp_get_timestamping_opt_stats() intentionally runs lockless, we must
add READ_ONCE() and WRITE_ONCE() annotations to keep KCSAN happy.

Fixes: feb5f2ec6464 ("tcp: export packets delivery info")
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/net/tcp_ecn.h | 2 +-
 net/ipv4/tcp.c        | 4 ++--
 net/ipv4/tcp_input.c  | 8 ++++----
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h
index e9a933641636e11902a84a595672cd56a551f305..865d5c5a7718dbc7d6db1963261889fd44625bdc 100644
--- a/include/net/tcp_ecn.h
+++ b/include/net/tcp_ecn.h
@@ -181,7 +181,7 @@ static inline void tcp_accecn_third_ack(struct sock *sk,
 		    tcp_accecn_validate_syn_feedback(sk, ace, sent_ect)) {
 			if ((tcp_accecn_extract_syn_ect(ace) == INET_ECN_CE) &&
 			    !tp->delivered_ce)
-				tp->delivered_ce++;
+				WRITE_ONCE(tp->delivered_ce, 1);
 		}
 		break;
 	}
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 802a9ea05211f8eab30b6f937a459a270476974d..0aabd02d44967dae3e569702f76037beb45e5de8 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4453,8 +4453,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
 		   READ_ONCE(inet_csk(sk)->icsk_retransmits));
 	nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, data_race(!!tp->rate_app_limited));
 	nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, READ_ONCE(tp->snd_ssthresh));
-	nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered);
-	nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce);
+	nla_put_u32(stats, TCP_NLA_DELIVERED, READ_ONCE(tp->delivered));
+	nla_put_u32(stats, TCP_NLA_DELIVERED_CE, READ_ONCE(tp->delivered_ce));
 
 	nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
 	nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c6361447535f0a2b72eccb6fede4618471e38ae5..63ff89210a72fbf5710279c41010d3f6e734e522 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -476,14 +476,14 @@ static bool tcp_accecn_process_option(struct tcp_sock *tp,
 
 static void tcp_count_delivered_ce(struct tcp_sock *tp, u32 ecn_count)
 {
-	tp->delivered_ce += ecn_count;
+	WRITE_ONCE(tp->delivered_ce, tp->delivered_ce + ecn_count);
 }
 
 /* Updates the delivered and delivered_ce counts */
 static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered,
 				bool ece_ack)
 {
-	tp->delivered += delivered;
+	WRITE_ONCE(tp->delivered, tp->delivered + delivered);
 	if (tcp_ecn_mode_rfc3168(tp) && ece_ack)
 		tcp_count_delivered_ce(tp, delivered);
 }
@@ -6779,7 +6779,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
 		/* SYN-data is counted as two separate packets in tcp_ack() */
 		if (tp->delivered > 1)
-			--tp->delivered;
+			WRITE_ONCE(tp->delivered, tp->delivered - 1);
 	}
 
 	tcp_fastopen_add_skb(sk, synack);
@@ -7212,7 +7212,7 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 	SKB_DR_SET(reason, NOT_SPECIFIED);
 	switch (sk->sk_state) {
 	case TCP_SYN_RECV:
-		tp->delivered++; /* SYN-ACK delivery isn't tracked in tcp_ack */
+		WRITE_ONCE(tp->delivered, tp->delivered + 1); /* SYN-ACK delivery isn't tracked in tcp_ack */
 		if (!tp->srtt_us)
 			tcp_synack_rtt_meas(sk, req);
 
-- 
2.54.0.rc1.513.gad8abe7a5a-goog


^ permalink raw reply related

* [PATCH net 06/14] tcp: add data-race annotations for TCP_NLA_SNDQ_SIZE
From: Eric Dumazet @ 2026-04-16 20:03 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, Neal Cardwell, Kuniyuki Iwashima, netdev,
	eric.dumazet, Eric Dumazet
In-Reply-To: <20260416200319.3608680-1-edumazet@google.com>

tcp_get_timestamping_opt_stats() intentionally runs lockless, we must
add READ_ONCE() and WRITE_ONCE() annotations to keep KCSAN happy.

Fixes: 87ecc95d81d9 ("tcp: add send queue size stat in SCM_TIMESTAMPING_OPT_STATS")
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/tcp.c        | 4 +++-
 net/ipv4/tcp_input.c  | 4 ++--
 net/ipv4/tcp_output.c | 2 +-
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0aabd02d44967dae3e569702f76037beb45e5de8..729936d13a5c6d9c39edc880636e01cf0973688e 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4456,7 +4456,9 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
 	nla_put_u32(stats, TCP_NLA_DELIVERED, READ_ONCE(tp->delivered));
 	nla_put_u32(stats, TCP_NLA_DELIVERED_CE, READ_ONCE(tp->delivered_ce));
 
-	nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
+	nla_put_u32(stats, TCP_NLA_SNDQ_SIZE,
+		    max_t(int, 0,
+			  READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_una)));
 	nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);
 
 	nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 63ff89210a72fbf5710279c41010d3f6e734e522..edb5013873e0c4a9ba2f8a81a43eff5fb6e7a47b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3912,7 +3912,7 @@ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
 	sock_owned_by_me((struct sock *)tp);
 	tp->bytes_acked += delta;
 	tcp_snd_sne_update(tp, ack);
-	tp->snd_una = ack;
+	WRITE_ONCE(tp->snd_una, ack);
 }
 
 static void tcp_rcv_sne_update(struct tcp_sock *tp, u32 seq)
@@ -7240,7 +7240,7 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 		if (sk->sk_socket)
 			sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
 
-		tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
+		WRITE_ONCE(tp->snd_una, TCP_SKB_CB(skb)->ack_seq);
 		tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
 		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 2663505a0dd7a0eae69f6a91250b4a0b6f357f7d..9f83c7e4acabc64f0a45e4879c326694a306b368 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -4153,7 +4153,7 @@ static void tcp_connect_init(struct sock *sk)
 	tp->snd_wnd = 0;
 	tcp_init_wl(tp, 0);
 	tcp_write_queue_purge(sk);
-	tp->snd_una = tp->write_seq;
+	WRITE_ONCE(tp->snd_una, tp->write_seq);
 	tp->snd_sml = tp->write_seq;
 	tp->snd_up = tp->write_seq;
 	WRITE_ONCE(tp->snd_nxt, tp->write_seq);
-- 
2.54.0.rc1.513.gad8abe7a5a-goog


^ permalink raw reply related

* [PATCH net 08/14] tcp: annotate data-races around tp->bytes_retrans
From: Eric Dumazet @ 2026-04-16 20:03 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, Neal Cardwell, Kuniyuki Iwashima, netdev,
	eric.dumazet, Eric Dumazet
In-Reply-To: <20260416200319.3608680-1-edumazet@google.com>

tcp_get_timestamping_opt_stats() intentionally runs lockless, we must
add READ_ONCE() and WRITE_ONCE() annotations to keep KCSAN happy.

Fixes: fb31c9b9f6c8 ("tcp: add data bytes retransmitted stats")
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/tcp.c        | 4 ++--
 net/ipv4/tcp_output.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f999b86851cdcc2a9b9ce379397e55293871c00a..8c84639dc54bb8d8aa6f3eeb2b7c1fea16ebfcb5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4463,8 +4463,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
 
 	nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, READ_ONCE(tp->bytes_sent),
 			  TCP_NLA_PAD);
-	nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans,
-			  TCP_NLA_PAD);
+	nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS,
+			  READ_ONCE(tp->bytes_retrans), TCP_NLA_PAD);
 	nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
 	nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
 	nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 87af4731df87ea5f310d39b1392cb995d4fa8f78..f9d8755705f762fe4da3064d2b1bfce4828ec0c1 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3645,7 +3645,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
 	WRITE_ONCE(tp->total_retrans, tp->total_retrans + segs);
-	tp->bytes_retrans += skb->len;
+	WRITE_ONCE(tp->bytes_retrans, tp->bytes_retrans + skb->len);
 
 	/* make sure skb->data is aligned on arches that require it
 	 * and check if ack-trimming & collapsing extended the headroom
-- 
2.54.0.rc1.513.gad8abe7a5a-goog


^ permalink raw reply related

* [PATCH net 07/14] tcp: annotate data-races around tp->bytes_sent
From: Eric Dumazet @ 2026-04-16 20:03 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, Neal Cardwell, Kuniyuki Iwashima, netdev,
	eric.dumazet, Eric Dumazet
In-Reply-To: <20260416200319.3608680-1-edumazet@google.com>

tcp_get_timestamping_opt_stats() intentionally runs lockless, we must
add READ_ONCE() and WRITE_ONCE() annotations to keep KCSAN happy.

Fixes: ba113c3aa79a ("tcp: add data bytes sent stats")
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/tcp.c        | 2 +-
 net/ipv4/tcp_output.c | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 729936d13a5c6d9c39edc880636e01cf0973688e..f999b86851cdcc2a9b9ce379397e55293871c00a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4461,7 +4461,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
 			  READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_una)));
 	nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);
 
-	nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent,
+	nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, READ_ONCE(tp->bytes_sent),
 			  TCP_NLA_PAD);
 	nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans,
 			  TCP_NLA_PAD);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 9f83c7e4acabc64f0a45e4879c326694a306b368..87af4731df87ea5f310d39b1392cb995d4fa8f78 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1690,7 +1690,8 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 		tcp_event_data_sent(tp, sk);
 		WRITE_ONCE(tp->data_segs_out,
 			   tp->data_segs_out + tcp_skb_pcount(skb));
-		tp->bytes_sent += skb->len - tcp_header_size;
+		WRITE_ONCE(tp->bytes_sent,
+			   tp->bytes_sent + skb->len - tcp_header_size);
 	}
 
 	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
-- 
2.54.0.rc1.513.gad8abe7a5a-goog


^ permalink raw reply related

* [PATCH net 09/14] tcp: annotate data-races around tp->dsack_dups
From: Eric Dumazet @ 2026-04-16 20:03 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, Neal Cardwell, Kuniyuki Iwashima, netdev,
	eric.dumazet, Eric Dumazet
In-Reply-To: <20260416200319.3608680-1-edumazet@google.com>

tcp_get_timestamping_opt_stats() intentionally runs lockless, we must
add READ_ONCE() and WRITE_ONCE() annotations to keep KCSAN happy.

Fixes: 7e10b6554ff2 ("tcp: add dsack blocks received stats")
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/tcp.c       | 2 +-
 net/ipv4/tcp_input.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8c84639dc54bb8d8aa6f3eeb2b7c1fea16ebfcb5..57c4dcc8bfe948e895f713cadaad20409a9b8f14 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4465,7 +4465,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
 			  TCP_NLA_PAD);
 	nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS,
 			  READ_ONCE(tp->bytes_retrans), TCP_NLA_PAD);
-	nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
+	nla_put_u32(stats, TCP_NLA_DSACK_DUPS, READ_ONCE(tp->dsack_dups));
 	nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
 	nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3);
 	nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index edb5013873e0c4a9ba2f8a81a43eff5fb6e7a47b..65b3ecc6be4b57f97a62d0e5737a96ddb16dd14d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1246,7 +1246,7 @@ static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq,
 	else if (tp->tlp_high_seq && tp->tlp_high_seq == end_seq)
 		state->flag |= FLAG_DSACK_TLP;
 
-	tp->dsack_dups += dup_segs;
+	WRITE_ONCE(tp->dsack_dups, tp->dsack_dups + dup_segs);
 	/* Skip the DSACK if dup segs weren't retransmitted by sender */
 	if (tp->dsack_dups > tp->total_retrans)
 		return 0;
-- 
2.54.0.rc1.513.gad8abe7a5a-goog


^ permalink raw reply related

* [PATCH net 10/14] tcp: annotate data-races around tp->reord_seen
From: Eric Dumazet @ 2026-04-16 20:03 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, Neal Cardwell, Kuniyuki Iwashima, netdev,
	eric.dumazet, Eric Dumazet
In-Reply-To: <20260416200319.3608680-1-edumazet@google.com>

tcp_get_timestamping_opt_stats() intentionally runs lockless, we must
add READ_ONCE() and WRITE_ONCE() annotations to keep KCSAN happy.

Fixes: 7ec65372ca53 ("tcp: add stat of data packet reordering events")
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/tcp.c       | 2 +-
 net/ipv4/tcp_input.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 57c4dcc8bfe948e895f713cadaad20409a9b8f14..39a4b06e36bbc63b8bc334c8f568c5a2046573b6 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4466,7 +4466,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
 	nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS,
 			  READ_ONCE(tp->bytes_retrans), TCP_NLA_PAD);
 	nla_put_u32(stats, TCP_NLA_DSACK_DUPS, READ_ONCE(tp->dsack_dups));
-	nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
+	nla_put_u32(stats, TCP_NLA_REORD_SEEN, READ_ONCE(tp->reord_seen));
 	nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3);
 	nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash);
 	nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 65b3ecc6be4b57f97a62d0e5737a96ddb16dd14d..896a5a5a6b1a9a678e5321dd802554ab343f7835 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1299,7 +1299,7 @@ static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq,
 	}
 
 	/* This exciting event is worth to be remembered. 8) */
-	tp->reord_seen++;
+	WRITE_ONCE(tp->reord_seen, tp->reord_seen + 1);
 	NET_INC_STATS(sock_net(sk),
 		      ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER);
 }
@@ -2443,7 +2443,7 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend)
 	WRITE_ONCE(tp->reordering,
 		   min_t(u32, tp->packets_out + addend,
 			 READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering)));
-	tp->reord_seen++;
+	WRITE_ONCE(tp->reord_seen, tp->reord_seen + 1);
 	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER);
 }
 
-- 
2.54.0.rc1.513.gad8abe7a5a-goog


^ permalink raw reply related

* [PATCH net 12/14] tcp: annotate data-races around tp->timeout_rehash
From: Eric Dumazet @ 2026-04-16 20:03 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, Neal Cardwell, Kuniyuki Iwashima, netdev,
	eric.dumazet, Eric Dumazet
In-Reply-To: <20260416200319.3608680-1-edumazet@google.com>

tcp_get_timestamping_opt_stats() intentionally runs lockless, we must
add READ_ONCE() and WRITE_ONCE() annotations to keep KCSAN happy.

Fixes: 32efcc06d2a1 ("tcp: export count for rehash attempts")
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/tcp.c       | 3 ++-
 net/ipv4/tcp_timer.c | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 541bd0d2d8c4ed24934fd047dd46e532b30021be..192e95b71ce9868980a809184be83398d8740427 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4469,7 +4469,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
 	nla_put_u32(stats, TCP_NLA_DSACK_DUPS, READ_ONCE(tp->dsack_dups));
 	nla_put_u32(stats, TCP_NLA_REORD_SEEN, READ_ONCE(tp->reord_seen));
 	nla_put_u32(stats, TCP_NLA_SRTT, READ_ONCE(tp->srtt_us) >> 3);
-	nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash);
+	nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH,
+		    READ_ONCE(tp->timeout_rehash));
 	nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT,
 		    max_t(int, 0, tp->write_seq - tp->snd_nxt));
 	nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns,
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index ea99988795e7e87412eef2768c4471e90c3e873e..8d791a954cd6ced52380226f205c66224b8f8bbd 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -297,7 +297,7 @@ static int tcp_write_timeout(struct sock *sk)
 	}
 
 	if (sk_rethink_txhash(sk)) {
-		tp->timeout_rehash++;
+		WRITE_ONCE(tp->timeout_rehash, tp->timeout_rehash + 1);
 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTREHASH);
 	}
 
-- 
2.54.0.rc1.513.gad8abe7a5a-goog


^ permalink raw reply related

* [PATCH net 11/14] tcp: annotate data-races around tp->srtt_us
From: Eric Dumazet @ 2026-04-16 20:03 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, Neal Cardwell, Kuniyuki Iwashima, netdev,
	eric.dumazet, Eric Dumazet
In-Reply-To: <20260416200319.3608680-1-edumazet@google.com>

tcp_get_timestamping_opt_stats() intentionally runs lockless, we must
add READ_ONCE() and WRITE_ONCE() annotations to keep KCSAN happy.

Fixes: e8bd8fca6773 ("tcp: add SRTT to SCM_TIMESTAMPING_OPT_STATS")
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/tcp.c       | 5 +++--
 net/ipv4/tcp_input.c | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 39a4b06e36bbc63b8bc334c8f568c5a2046573b6..541bd0d2d8c4ed24934fd047dd46e532b30021be 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3623,7 +3623,8 @@ static void tcp_enable_tx_delay(struct sock *sk, int val)
 	if (delta && sk->sk_state == TCP_ESTABLISHED) {
 		s64 srtt = (s64)tp->srtt_us + delta;
 
-		tp->srtt_us = clamp_t(s64, srtt, 1, ~0U);
+		WRITE_ONCE(tp->srtt_us,
+			   clamp_t(s64, srtt, 1, ~0U));
 
 		/* Note: does not deal with non zero icsk_backoff */
 		tcp_set_rto(sk);
@@ -4467,7 +4468,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
 			  READ_ONCE(tp->bytes_retrans), TCP_NLA_PAD);
 	nla_put_u32(stats, TCP_NLA_DSACK_DUPS, READ_ONCE(tp->dsack_dups));
 	nla_put_u32(stats, TCP_NLA_REORD_SEEN, READ_ONCE(tp->reord_seen));
-	nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3);
+	nla_put_u32(stats, TCP_NLA_SRTT, READ_ONCE(tp->srtt_us) >> 3);
 	nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash);
 	nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT,
 		    max_t(int, 0, tp->write_seq - tp->snd_nxt));
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 896a5a5a6b1a9a678e5321dd802554ab343f7835..e04ae105893c2bf234e593b1529748283bb2176c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1132,7 +1132,7 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
 
 		tcp_bpf_rtt(sk, mrtt_us, srtt);
 	}
-	tp->srtt_us = max(1U, srtt);
+	WRITE_ONCE(tp->srtt_us, max(1U, srtt));
 }
 
 void tcp_update_pacing_rate(struct sock *sk)
-- 
2.54.0.rc1.513.gad8abe7a5a-goog


^ permalink raw reply related

* [PATCH net 13/14] tcp: annotate data-races around (tp->write_seq - tp->snd_nxt)
From: Eric Dumazet @ 2026-04-16 20:03 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, Neal Cardwell, Kuniyuki Iwashima, netdev,
	eric.dumazet, Eric Dumazet
In-Reply-To: <20260416200319.3608680-1-edumazet@google.com>

tcp_get_timestamping_opt_stats() intentionally runs lockless, we must
add READ_ONCE() annotations to keep KCSAN happy.

WRITE_ONCE() annotations are already present.

Fixes: e08ab0b377a1 ("tcp: add bytes not sent to SCM_TIMESTAMPING_OPT_STATS")
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/tcp.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 192e95b71ce9868980a809184be83398d8740427..68894c03f2622d1a08fd747ff4c5e32be8579d2c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4472,7 +4472,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
 	nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH,
 		    READ_ONCE(tp->timeout_rehash));
 	nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT,
-		    max_t(int, 0, tp->write_seq - tp->snd_nxt));
+		    max_t(int, 0,
+			  READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt)));
 	nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns,
 			  TCP_NLA_PAD);
 	if (ack_skb)
-- 
2.54.0.rc1.513.gad8abe7a5a-goog


^ permalink raw reply related

* [PATCH net 14/14] tcp: annotate data-races around tp->plb_rehash
From: Eric Dumazet @ 2026-04-16 20:03 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, Neal Cardwell, Kuniyuki Iwashima, netdev,
	eric.dumazet, Eric Dumazet
In-Reply-To: <20260416200319.3608680-1-edumazet@google.com>

tcp_get_timestamping_opt_stats() intentionally runs lockless, we must
add READ_ONCE() and WRITE_ONCE() annotations to keep KCSAN happy.

Fixes: 29c1c44646ae ("tcp: add u32 counter in tcp_sock and an SNMP counter for PLB")
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/tcp.c     | 3 ++-
 net/ipv4/tcp_plb.c | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 68894c03f2622d1a08fd747ff4c5e32be8579d2c..7fbf2fca5eb2c63763231d4d55b770b85a7cbdbf 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4480,7 +4480,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
 		nla_put_u8(stats, TCP_NLA_TTL,
 			   tcp_skb_ttl_or_hop_limit(ack_skb));
 
-	nla_put_u32(stats, TCP_NLA_REHASH, tp->plb_rehash + tp->timeout_rehash);
+	nla_put_u32(stats, TCP_NLA_REHASH,
+		    READ_ONCE(tp->plb_rehash) + READ_ONCE(tp->timeout_rehash));
 	return stats;
 }
 
diff --git a/net/ipv4/tcp_plb.c b/net/ipv4/tcp_plb.c
index 68ccdb9a54127632b6b764b5cbb18e1589ab1aa7..c11a0cd3f8feb004150a4056f5ca57f90d2cb2b8 100644
--- a/net/ipv4/tcp_plb.c
+++ b/net/ipv4/tcp_plb.c
@@ -80,7 +80,7 @@ void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb)
 
 	sk_rethink_txhash(sk);
 	plb->consec_cong_rounds = 0;
-	tcp_sk(sk)->plb_rehash++;
+	WRITE_ONCE(tcp_sk(sk)->plb_rehash, tcp_sk(sk)->plb_rehash + 1);
 	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPLBREHASH);
 }
 EXPORT_SYMBOL_GPL(tcp_plb_check_rehash);
-- 
2.54.0.rc1.513.gad8abe7a5a-goog


^ permalink raw reply related

* Re: [PATCH v3 1/4] rust: netlink: add raw netlink abstraction
From: Matthew Maurer @ 2026-04-16 20:06 UTC (permalink / raw)
  To: Alice Ryhl
  Cc: Miguel Ojeda, Boqun Feng, Gary Guo, Björn Roy Baron,
	Benno Lossin, Andreas Hindborg, Trevor Gross, Danilo Krummrich,
	Donald Hunter, Jakub Kicinski, David S. Miller, Eric Dumazet,
	Paolo Abeni, Simon Horman, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Christian Brauner,
	Carlos Llamas, linux-kernel, rust-for-linux, netdev
In-Reply-To: <20260415-binder-netlink-v3-1-84be9ba63ee2@google.com>

On Wed, Apr 15, 2026 at 2:44 AM Alice Ryhl <aliceryhl@google.com> wrote:
>
> This implements a safe and relatively simple API over the netlink API,
> that allows you to add different attributes to a netlink message and
> broadcast it. As the first user of this API only makes use of broadcast,
> only broadcast messages are supported here.
>
> This API is intended to be safe and to be easy to use in *generated*
> code. This is because netlink is generally used with yaml files that
> describe the underlying API, and the python generator outputs C code
> (or, soon, Rust code) that lets you use the API more easily. So for
> example, if there is a string field, the code generator will output a
> method that internall calls `put_string()` with the right attr type.
>
> Signed-off-by: Alice Ryhl <aliceryhl@google.com>
> ---
>  rust/bindings/bindings_helper.h |   3 +
>  rust/helpers/genetlink.c        |  46 ++++++
>  rust/helpers/helpers.c          |   1 +
>  rust/kernel/lib.rs              |   1 +
>  rust/kernel/netlink.rs          | 329 ++++++++++++++++++++++++++++++++++++++++
>  5 files changed, 380 insertions(+)
>
> diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h
> index 083cc44aa952..8abb626fce6c 100644
> --- a/rust/bindings/bindings_helper.h
> +++ b/rust/bindings/bindings_helper.h
> @@ -88,6 +88,8 @@
>  #include <linux/wait.h>
>  #include <linux/workqueue.h>
>  #include <linux/xarray.h>
> +#include <net/genetlink.h>
> +#include <net/netlink.h>
>  #include <trace/events/rust_sample.h>
>
>  /*
> @@ -105,6 +107,7 @@
>  const size_t RUST_CONST_HELPER_ARCH_SLAB_MINALIGN = ARCH_SLAB_MINALIGN;
>  const size_t RUST_CONST_HELPER_ARCH_KMALLOC_MINALIGN = ARCH_KMALLOC_MINALIGN;
>  const size_t RUST_CONST_HELPER_PAGE_SIZE = PAGE_SIZE;
> +const size_t RUST_CONST_HELPER_GENLMSG_DEFAULT_SIZE = GENLMSG_DEFAULT_SIZE;
>  const gfp_t RUST_CONST_HELPER_GFP_ATOMIC = GFP_ATOMIC;
>  const gfp_t RUST_CONST_HELPER_GFP_KERNEL = GFP_KERNEL;
>  const gfp_t RUST_CONST_HELPER_GFP_KERNEL_ACCOUNT = GFP_KERNEL_ACCOUNT;
> diff --git a/rust/helpers/genetlink.c b/rust/helpers/genetlink.c
> new file mode 100644
> index 000000000000..3530b69f6cf7
> --- /dev/null
> +++ b/rust/helpers/genetlink.c
> @@ -0,0 +1,46 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +/*
> + * Copyright (C) 2026 Google LLC.
> + */
> +
> +#include <net/genetlink.h>
> +
> +#ifdef CONFIG_NET
> +
> +__rust_helper struct sk_buff *rust_helper_genlmsg_new(size_t payload, gfp_t flags)
> +{
> +       return genlmsg_new(payload, flags);
> +}
> +
> +__rust_helper
> +int rust_helper_genlmsg_multicast(const struct genl_family *family,
> +                                 struct sk_buff *skb, u32 portid,
> +                                 unsigned int group, gfp_t flags)
> +{
> +       return genlmsg_multicast(family, skb, portid, group, flags);
> +}
> +
> +__rust_helper void rust_helper_genlmsg_cancel(struct sk_buff *skb, void *hdr)
> +{
> +       genlmsg_cancel(skb, hdr);
> +}
> +
> +__rust_helper void rust_helper_genlmsg_end(struct sk_buff *skb, void *hdr)
> +{
> +       genlmsg_end(skb, hdr);
> +}
> +
> +__rust_helper void rust_helper_nlmsg_free(struct sk_buff *skb)
> +{
> +       nlmsg_free(skb);
> +}
> +
> +__rust_helper
> +int rust_helper_genl_has_listeners(const struct genl_family *family,
> +                                  struct net *net, unsigned int group)
> +{
> +       return genl_has_listeners(family, net, group);
> +}
> +
> +#endif
> diff --git a/rust/helpers/helpers.c b/rust/helpers/helpers.c
> index a3c42e51f00a..0813185d8760 100644
> --- a/rust/helpers/helpers.c
> +++ b/rust/helpers/helpers.c
> @@ -32,6 +32,7 @@
>  #include "err.c"
>  #include "irq.c"
>  #include "fs.c"
> +#include "genetlink.c"
>  #include "io.c"
>  #include "jump_label.c"
>  #include "kunit.c"
> diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs
> index d93292d47420..f5ea0ae0b6b7 100644
> --- a/rust/kernel/lib.rs
> +++ b/rust/kernel/lib.rs
> @@ -122,6 +122,7 @@
>  pub mod module_param;
>  #[cfg(CONFIG_NET)]
>  pub mod net;
> +pub mod netlink;
>  pub mod num;
>  pub mod of;
>  #[cfg(CONFIG_PM_OPP)]
> diff --git a/rust/kernel/netlink.rs b/rust/kernel/netlink.rs
> new file mode 100644
> index 000000000000..21f959c95fdc
> --- /dev/null
> +++ b/rust/kernel/netlink.rs
> @@ -0,0 +1,329 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +// Copyright (C) 2026 Google LLC.
> +
> +//! Rust support for generic netlink.
> +//!
> +//! Currently only supports exposing multicast groups.
> +//!
> +//! C header: [`include/net/genetlink.h`](srctree/include/net/genetlink.h)
> +#![cfg(CONFIG_NET)]
> +
> +use kernel::{
> +    alloc::{self, AllocError},
> +    error::to_result,
> +    prelude::*,
> +    transmute::AsBytes,
> +    types::Opaque,
> +    ThisModule,
> +};
> +
> +use core::{
> +    mem::ManuallyDrop,
> +    ptr::NonNull, //
> +};
> +
> +/// The default netlink message size.
> +pub const GENLMSG_DEFAULT_SIZE: usize = bindings::GENLMSG_DEFAULT_SIZE;
> +
> +/// A wrapper around `struct sk_buff` for generic netlink messages.
> +///
> +/// This type is intended to be specific for buffers used with netlink only, and other usecases for
> +/// `struct sk_buff` are out-of-scope for this abstraction.
> +///
> +/// # Invariants
> +///
> +/// The pointer has ownership over a valid `sk_buff`.
> +pub struct NetlinkSkBuff {
> +    skb: NonNull<kernel::bindings::sk_buff>,
> +}
> +
> +impl NetlinkSkBuff {
> +    /// Creates a new `NetlinkSkBuff` with the given size.
> +    pub fn new(size: usize, flags: alloc::Flags) -> Result<NetlinkSkBuff, AllocError> {
> +        // SAFETY: `genlmsg_new` only requires its arguments to be valid integers.
> +        let skb = unsafe { bindings::genlmsg_new(size, flags.as_raw()) };
> +        let skb = NonNull::new(skb).ok_or(AllocError)?;
> +        Ok(NetlinkSkBuff { skb })
> +    }
> +
> +    /// Puts a generic netlink header into the `NetlinkSkBuff`.
> +    pub fn genlmsg_put(
> +        self,
> +        portid: u32,
> +        seq: u32,
> +        family: &'static Family,
> +        cmd: u8,
> +    ) -> Result<GenlMsg, AllocError> {
> +        let skb = self.skb.as_ptr();
> +        // SAFETY: The skb and family pointers are valid.
> +        let hdr = unsafe { bindings::genlmsg_put(skb, portid, seq, family.as_raw(), 0, cmd) };
> +        let hdr = NonNull::new(hdr).ok_or(AllocError)?;
> +        Ok(GenlMsg { skb: self, hdr })
> +    }
> +}
> +
> +impl Drop for NetlinkSkBuff {
> +    fn drop(&mut self) {
> +        // SAFETY: We have ownership over the `sk_buff`, so we may free it.
> +        unsafe { bindings::nlmsg_free(self.skb.as_ptr()) }
> +    }
> +}
> +
> +/// A generic netlink message being constructed.
> +///
> +/// # Invariants
> +///
> +/// `hdr` references the header in this netlink message.
> +pub struct GenlMsg {
> +    skb: NetlinkSkBuff,
> +    hdr: NonNull<c_void>,
> +}
> +
> +impl GenlMsg {
> +    /// Puts an attribute into the message.
> +    #[inline]
> +    fn put<T>(&mut self, attrtype: c_int, value: &T) -> Result
> +    where
> +        T: ?Sized + AsBytes,
> +    {
> +        let skb = self.skb.skb.as_ptr();
> +        let len = size_of_val(value);
> +        let ptr = core::ptr::from_ref(value).cast::<c_void>();
> +        // SAFETY: `skb` is valid by `NetlinkSkBuff` type invariants, and the provided value is
> +        // readable and initialized for its `size_of` bytes.
> +        to_result(unsafe { bindings::nla_put(skb, attrtype, len as c_int, ptr) })
> +    }
> +
> +    /// Puts a `u32` attribute into the message.
> +    #[inline]
> +    pub fn put_u32(&mut self, attrtype: c_int, value: u32) -> Result {
> +        self.put(attrtype, &value)
> +    }
> +
> +    /// Puts a string attribute into the message.
> +    #[inline]
> +    pub fn put_string(&mut self, attrtype: c_int, value: &CStr) -> Result {
> +        self.put(attrtype, value.to_bytes_with_nul())
> +    }
> +
> +    /// Puts a flag attribute into the message.
> +    #[inline]
> +    pub fn put_flag(&mut self, attrtype: c_int) -> Result {
> +        let skb = self.skb.skb.as_ptr();
> +        // SAFETY: `skb` is valid by `NetlinkSkBuff` type invariants, and a null pointer is valid
> +        // when the length is zero.
> +        to_result(unsafe { bindings::nla_put(skb, attrtype, 0, core::ptr::null()) })
> +    }
> +
> +    /// Sends the generic netlink message as a multicast message.
> +    #[inline]
> +    pub fn multicast(
> +        self,
> +        family: &'static Family,
> +        portid: u32,
> +        group: u32,
> +        flags: alloc::Flags,
> +    ) -> Result {
> +        let me = ManuallyDrop::new(self);
> +        // SAFETY: The `skb` and `family` pointers are valid. We pass ownership of the `skb` to
> +        // `genlmsg_multicast` by not dropping `self`.

I think if genlmsg_multicast returns an error code we may need to drop
to avoid leaking. Specifically, there is at least this path:
1. Set group to a large number (that's an unconstrained public parameter)
2. We suppress drop
3. We call genlmsg_multicast
4. We call genlmsg_multicast_netns
4. We call genlmsg_multicast_netns_filtered, which does an inbounds
check for the `group`. If it is too large, it returns EINVAL without
consuming the SKB - include/net/genetlink.h:493
5. We leak the skb

However, at the same time, if we pass that check and descend into
`netlink_broadcast_filtered`, it will unconditionally consume the SKB,
and possibly return an error code in other situations.

I think this either means that we need to make the inbounds check for
groups in `genlmsg_multicast_netns_filtered` use `consume_skb(skb)`
before returning EINVAL, or we need to check the error code for EINVAL
and manually drop if we get it. The second one seems kind of janky
because `genlmsg_multicast` doesn't document that its free-behavior
differs for different error codes.

> +        unsafe {
> +            bindings::genlmsg_end(me.skb.skb.as_ptr(), me.hdr.as_ptr());
> +            to_result(bindings::genlmsg_multicast(
> +                family.as_raw(),
> +                me.skb.skb.as_ptr(),
> +                portid,
> +                group,
> +                flags.as_raw(),
> +            ))
> +        }
> +    }
> +}
> +impl Drop for GenlMsg {
> +    fn drop(&mut self) {
> +        // SAFETY: The `hdr` pointer references the header of this generic netlink message.
> +        unsafe { bindings::genlmsg_cancel(self.skb.skb.as_ptr(), self.hdr.as_ptr()) };
> +    }
> +}
> +
> +/// Flags for a generic netlink family.
> +struct FamilyFlags {
> +    /// Whether the family supports network namespaces.
> +    netnsok: bool,
> +    /// Whether the family supports parallel operations.
> +    parallel_ops: bool,
> +}
> +
> +impl FamilyFlags {
> +    /// Converts the flags to the bitfield representation used by `genl_family`.
> +    const fn into_bitfield(self) -> bindings::__BindgenBitfieldUnit<[u8; 1]> {
> +        // The below shifts are verified correct by test_family_flags_bitfield() below.

1. My understanding is that bit layout is implementation defined from C11:

"An implementation may allocate any addressable storage unit large
enough to hold a bitfield." (This one gets tested statically via
interaction with bindgen)
"The order of allocation of bit-fields within a unit (high-order to
low-order or low-order to high-order) is implementation-defined."
(this one gets checked by your KUnit test)

so we can't actually know that any manual bitpack/unpack will do what
we want reliably - another C compiler might do something different.

2. While this looks correct to me from an endianness perspective (in
terms of what compilers actually do rather than what the spec says),
have we run it on a big endian arch just to make sure?

> +        //
> +        // Although bindgen generates helpers to change bitfields based on the C headers, these
> +        // helpers unfortunately can't be used in const context. Since `Family` needs to be filled
> +        // out at build-time, we use this helper instead.
> +        let mut bits = 0;
> +        if self.netnsok {
> +            bits |= 1 << 0;
> +        }
> +        if self.parallel_ops {
> +            bits |= 1 << 1;
> +        }
> +        // SAFETY: This bitfield is represented as an u8.
> +        unsafe { core::mem::transmute::<u8, bindings::__BindgenBitfieldUnit<[u8; 1]>>(bits) }
> +    }
> +}
> +
> +/// A generic netlink family.
> +#[repr(transparent)]
> +pub struct Family {
> +    inner: Opaque<bindings::genl_family>,
> +}
> +
> +// SAFETY: The `Family` type is thread safe.
> +unsafe impl Sync for Family {}
> +
> +impl Family {
> +    /// Creates a new `Family` instance.
Might be worth calling out that this panics on bad input rather than
returning an error in docs? It might be fine because this isn't going
to be called dynamically, but it doesn't match the usual behavior for
other kernel functions.
> +    pub const fn const_new(
> +        module: &ThisModule,
If const_new for MulticastGroup can take a &CStr, why can't we take one here?
> +        name: &[u8],
> +        version: u32,
> +        mcgrps: &'static [MulticastGroup],
> +    ) -> Family {
> +        let n_mcgrps = mcgrps.len() as u8;
> +        if n_mcgrps as usize != mcgrps.len() {
> +            panic!("too many mcgrps");
> +        }
> +        let mut genl_family = bindings::genl_family {
> +            version,
> +            _bitfield_1: FamilyFlags {
> +                netnsok: true,
> +                parallel_ops: true,
> +            }
> +            .into_bitfield(),
> +            module: module.as_ptr(),
> +            mcgrps: mcgrps.as_ptr().cast(),
> +            n_mcgrps,
> +            ..pin_init::zeroed()
> +        };
> +        if CStr::from_bytes_with_nul(name).is_err() {
> +            panic!("genl_family name not nul-terminated");
> +        }
> +        if genl_family.name.len() < name.len() {
> +            panic!("genl_family name too long");
> +        }
> +        let mut i = 0;
> +        while i < name.len() {
> +            genl_family.name[i] = name[i];
> +            i += 1;
> +        }
> +        Family {
> +            inner: Opaque::new(genl_family),
> +        }
> +    }
> +
> +    /// Checks if there are any listeners for the given multicast group.
> +    pub fn has_listeners(&self, group: u32) -> bool {
> +        // SAFETY: The family and init_net pointers are valid.
> +        unsafe {
> +            bindings::genl_has_listeners(self.as_raw(), &raw mut bindings::init_net, group) != 0
> +        }
> +    }
> +
> +    /// Returns a raw pointer to the underlying `genl_family` structure.
> +    pub fn as_raw(&self) -> *mut bindings::genl_family {
> +        self.inner.get()
> +    }
> +}
> +
> +/// A generic netlink multicast group.
> +#[repr(transparent)]
> +pub struct MulticastGroup {
> +    // No Opaque because fully immutable
> +    group: bindings::genl_multicast_group,
> +}
> +
> +// SAFETY: Pure data so thread safe.
> +unsafe impl Sync for MulticastGroup {}
> +
> +impl MulticastGroup {
> +    /// Creates a new `MulticastGroup` instance.
Same as before - should the panic be documented?
> +    pub const fn const_new(name: &CStr) -> MulticastGroup {
> +        let mut group: bindings::genl_multicast_group = pin_init::zeroed();
> +
> +        let name = name.to_bytes_with_nul();
> +        if group.name.len() < name.len() {
> +            panic!("genl_multicast_group name too long");
> +        }
> +        let mut i = 0;
> +        while i < name.len() {
> +            group.name[i] = name[i];
> +            i += 1;
> +        }
> +
> +        MulticastGroup { group }
> +    }
> +}
> +
> +/// A registration of a generic netlink family.
> +///
> +/// This type represents the registration of a [`Family`]. When an instance of this type is
> +/// dropped, its respective generic netlink family will be unregistered from the system.
> +///
> +/// # Invariants
> +///
> +/// `self.family` always holds a valid reference to an initialized and registered [`Family`].
> +pub struct Registration {
> +    family: &'static Family,
> +}
> +
> +impl Family {
> +    /// Registers the generic netlink family with the kernel.
> +    pub fn register(&'static self) -> Result<Registration> {
> +        // SAFETY: `self.as_raw()` is a valid pointer to a `genl_family` struct.
> +        // The `genl_family` struct is static, so it will outlive the registration.
> +        to_result(unsafe { bindings::genl_register_family(self.as_raw()) })?;
> +        Ok(Registration { family: self })
> +    }
> +}
> +
> +impl Drop for Registration {
> +    fn drop(&mut self) {
> +        // SAFETY: `self.family.as_raw()` is a valid pointer to a registered `genl_family` struct.
> +        // The `Registration` struct ensures that `genl_unregister_family` is called exactly once
> +        // for this family when it goes out of scope.
> +        unsafe { bindings::genl_unregister_family(self.family.as_raw()) };
> +    }
> +}
> +
> +#[macros::kunit_tests(rust_netlink)]
> +mod tests {
> +    use super::*;
> +
> +    #[test]
> +    fn test_family_flags_bitfield() {
> +        for netnsok in [false, true] {
> +            for parallel_ops in [false, true] {
> +                let mut b_fam = bindings::genl_family {
> +                    ..Default::default()
> +                };
> +                b_fam.set_netnsok(if netnsok { 1 } else { 0 });
> +                b_fam.set_parallel_ops(if parallel_ops { 1 } else { 0 });
> +
> +                let c_bitfield = FamilyFlags {
> +                    netnsok,
> +                    parallel_ops,
> +                }
> +                .into_bitfield();
> +
> +                // SAFETY: The bit field is stored as u8.
> +                let b_val: u8 = unsafe { core::mem::transmute(b_fam._bitfield_1) };
> +                // SAFETY: The bit field is stored as u8.
> +                let c_val: u8 = unsafe { core::mem::transmute(c_bitfield) };
> +                assert_eq!(b_val, c_val);
> +            }
> +        }
> +    }
> +}
>
> --
> 2.54.0.rc0.605.g598a273b03-goog
>
>

^ permalink raw reply

* Re: [PATCH v2] Documentation: sysctl: document net core sysctls
From: Jay Vosburgh @ 2026-04-16 20:11 UTC (permalink / raw)
  To: Simon Horman
  Cc: Shubham Chakraborty, netdev, davem, edumazet, kuba, pabeni,
	kuniyu, corbet, skhan, linux-doc, linux-kernel
In-Reply-To: <20260413164707.GT469338@kernel.org>

Simon Horman <horms@kernel.org> wrote:

>On Thu, Apr 09, 2026 at 11:18:59PM +0530, Shubham Chakraborty wrote:

[...]

>>  netdev_budget_usecs
>>  ---------------------
>>  
>
>The lines above the following hunk are:
>
>netdev_budget_usecs
>---------------------
>
>Maximum number of microseconds in one NAPI polling cycle. Polling
>
>> @@ -297,12 +332,16 @@ Maximum number of microseconds in one NAPI polling cycle. Polling
>>  will exit when either netdev_budget_usecs have elapsed during the
>>  poll cycle or the number of packets processed reaches netdev_budget.
>>  
>> +Default: ``2 * USEC_PER_SEC / HZ`` (2000 when ``HZ`` is 1000)
>> +
>
>Well, that is awkward.
>
>Looking at git history, it seems that this sysctl was added by 7acf8a1e8a28
>("Replace 2 jiffies with sysctl netdev_budget_usecs to enable softirq
>tuning") in 2017. And at that time the unic was us, and the default was 2000 us.
>
>But that was changed by a fix for that commit, a4837980fd9f ("net: revert
>default NAPI poll timeout to 2 jiffies"), in 2020. As a side-effect of
>that commit, the default was changed to what you have documented above,
>and the unit changed to jiffies.
>
>So while what you have is correct it seems nonsensical to me for the unit
>to be jiffies. Because that's not a meaningful unit for users. And because
>the name of the sysctl ends in usecs.

	I don't think the units for netdev_budget_usecs are actually
jiffies, even after a4837980fd9f.  The default value, for example, is
2000 if HZ is 1000.  However, the granularity of the measurement is in
jiffies, via:

static __latent_entropy void net_rx_action(void)
{
        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
        unsigned long time_limit = jiffies +
                usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs));


	I'm not sure offhand if usecs_to_jiffies rounds up or down, but
the netdev_budget_usecs looks to be interpreted as usecs.

	-J

>But I'm unsure what to do about it. Since changing the unit this would
>represent (another) KABI break.
>
>* Add another knob that shadows this one (But what to call it?)
>* Simply remove this one (KAPI break)
>* Change the unit of this knob (KAPI break)
>
>If the code is left as is, then I think it should be documented that the
>unit is jiffies.
>
>...
>

---
	-Jay Vosburgh, jv@jvosburgh.net

^ permalink raw reply

* Re: [PATCH net-next 5/6] net: stmmac: move PHY handling out of __stmmac_open()/release()
From: Jakub Kicinski @ 2026-04-16 20:16 UTC (permalink / raw)
  To: Russell King (Oracle)
  Cc: Alexander Stein, Andrew Lunn, Heiner Kallweit, Alexandre Torgue,
	Andrew Lunn, David S. Miller, Eric Dumazet, linux-arm-kernel,
	linux-stm32, Maxime Coquelin, netdev, Paolo Abeni
In-Reply-To: <aeE8mpXy9FRHvN9q@shell.armlinux.org.uk>

On Thu, 16 Apr 2026 20:46:34 +0100 Russell King (Oracle) wrote:
> On Thu, Apr 16, 2026 at 09:08:26AM -0700, Jakub Kicinski wrote:
> > On Thu, 16 Apr 2026 14:47:57 +0100 Russell King (Oracle) wrote:  
> > > The next problem will be netdev's policy over reviews vs patches
> > > balance which I'm already in deficit, and I have *NO* *TIME*
> > > what so ever to review patches - let alone propose patches to
> > > fix people's problems.
> > > 
> > > So I'm going to say this plainly: if netdev wants to enforce that
> > > rule, then I won't be fixing people's problems.
> > 
> > Do you have a better proposal?
> > I'm under the same pressure of million stupid projects from my employer
> > as you are. Do y'all think that upstream maintainers have time given by
> > their employers to do the reviews? SMH.  
> 
> Are you really under the same pressure? I have one of my parents in
> hospital right now, and was in A&E yesterday afternoon through into
> the evening. I've been down at the hospital since 2pm today, only
> just come back to feed the other parent and head back down for what
> could be a long night. Then there's supposed to be an appointment
> that will take up to 3 hours tomorrow morning...
> 
> Yea, I'm sure you have the same pressures and worry from your
> employer - except my pressures are medical, looking after my parents.
> 
> Thank you for your lack of understanding.

Not my point. Sorry to hear about the issues you're facing.

I don't think making vague complaints about the development process
is going to make anything better.

^ permalink raw reply

* Re: [PATCH net 10/13] i40e: fix napi_enable/disable skipping ringless q_vectors
From: Jacob Keller @ 2026-04-16 20:46 UTC (permalink / raw)
  To: Przemek Kitszel, Andrew Lunn, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni
  Cc: netdev, Aleksandr Loktionov, stable, Sunitha Mekala,
	Maciej Fijalkowski
In-Reply-To: <6cc3c5b2-fb71-42a4-8d5b-57cd85de2f02@intel.com>

On 4/15/2026 9:20 PM, Przemek Kitszel wrote:
> On 4/15/26 07:48, Jacob Keller wrote:
>> From: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
>>
>> After ethtool -L reduces the queue count, i40e_napi_disable_all() sets
>> NAPI_STATE_SCHED on all q_vectors, then i40e_vsi_map_rings_to_vectors()
>> clears ring pointers on the excess ones.  i40e_napi_enable_all() skips
>> those with:
>>
>>     if (q_vector->rx.ring || q_vector->tx.ring)
>>         napi_enable(&q_vector->napi);
>>
>> leaving them on dev->napi_list with NAPI_STATE_SCHED permanently set.
>>
>> Writing to /sys/class/net/<iface>/threaded calls napi_stop_kthread()
>> on every entry in dev->napi_list.  The function loops on msleep(20)
>> waiting for NAPI_STATE_SCHED to clear -- which never happens for the
>> stale q_vectors.  The task hangs in D state forever; a concurrent write
>> deadlocks on dev->lock held by the first.
>>
>> Commit 13a8cd191a2b ("i40e: Do not enable NAPI on q_vectors that have no
>> rings") added the guard to prevent a divide-by-zero in i40e_napi_poll()
>> when epoll busy-poll iterated all device NAPIs (4.x era). Since
>> 7adc3d57fe2b ("net: Introduce preferred busy-polling"), from v5.11,
>> napi_busy_loop() polls by napi_id keyed to the socket, so ringless
>> q_vectors are never selected.  i40e_msix_clean_rings() also independently
>> avoids scheduling NAPI for them.  The guard is safe to remove.
>>
>> Add an early return in i40e_napi_poll() for num_ringpairs == 0 so the
>> function is self-defending against a NULL tx.ring dereference at the
>> WB_ON_ITR check, should the NAPI ever fire through an unexpected path.
>>
>> Reported-by: Jakub Kicinski <kuba@kernel.org>
>> Closes: https://lore.kernel.org/intel-wired-
>> lan/20260316133100.6054a11f@kernel.org/
> 
> Maciej developed a better fix for the problem, and he explicitly asked
> to not include this patch. Please drop it from this series.
> 
> Maciej's fix:
> https://lore.kernel.org/intel-wired-lan/20260414121405.631092-1-
> maciej.fijalkowski@intel.com/T/#u
> 
> ask for reject:
> https://lore.kernel.org/intel-wired-lan/
> PH0PR11MB75223C8A00C3183C5082A096A0252@PH0PR11MB7522.namprd11.prod.outlook.com/T/#mbac55f7219d7855a2e5d1527904b2da43ad080cb
> 

Ugh, sorry for failing to notice this when batching this series up :(

Thanks,
Jake

^ permalink raw reply

* Re: [PATCH net 10/13] i40e: fix napi_enable/disable skipping ringless q_vectors
From: Jacob Keller @ 2026-04-16 20:50 UTC (permalink / raw)
  To: Przemek Kitszel, Andrew Lunn, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni
  Cc: netdev, Aleksandr Loktionov, stable, Sunitha Mekala,
	Maciej Fijalkowski
In-Reply-To: <70776680-1b60-4898-b9cd-bcc48abaac76@intel.com>

On 4/16/2026 1:46 PM, Jacob Keller wrote:
> On 4/15/2026 9:20 PM, Przemek Kitszel wrote:
>> On 4/15/26 07:48, Jacob Keller wrote:
>>> From: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
>>>
>>> After ethtool -L reduces the queue count, i40e_napi_disable_all() sets
>>> NAPI_STATE_SCHED on all q_vectors, then i40e_vsi_map_rings_to_vectors()
>>> clears ring pointers on the excess ones.  i40e_napi_enable_all() skips
>>> those with:
>>>
>>>     if (q_vector->rx.ring || q_vector->tx.ring)
>>>         napi_enable(&q_vector->napi);
>>>
>>> leaving them on dev->napi_list with NAPI_STATE_SCHED permanently set.
>>>
>>> Writing to /sys/class/net/<iface>/threaded calls napi_stop_kthread()
>>> on every entry in dev->napi_list.  The function loops on msleep(20)
>>> waiting for NAPI_STATE_SCHED to clear -- which never happens for the
>>> stale q_vectors.  The task hangs in D state forever; a concurrent write
>>> deadlocks on dev->lock held by the first.
>>>
>>> Commit 13a8cd191a2b ("i40e: Do not enable NAPI on q_vectors that have no
>>> rings") added the guard to prevent a divide-by-zero in i40e_napi_poll()
>>> when epoll busy-poll iterated all device NAPIs (4.x era). Since
>>> 7adc3d57fe2b ("net: Introduce preferred busy-polling"), from v5.11,
>>> napi_busy_loop() polls by napi_id keyed to the socket, so ringless
>>> q_vectors are never selected.  i40e_msix_clean_rings() also independently
>>> avoids scheduling NAPI for them.  The guard is safe to remove.
>>>
>>> Add an early return in i40e_napi_poll() for num_ringpairs == 0 so the
>>> function is self-defending against a NULL tx.ring dereference at the
>>> WB_ON_ITR check, should the NAPI ever fire through an unexpected path.
>>>
>>> Reported-by: Jakub Kicinski <kuba@kernel.org>
>>> Closes: https://lore.kernel.org/intel-wired-
>>> lan/20260316133100.6054a11f@kernel.org/
>>
>> Maciej developed a better fix for the problem, and he explicitly asked
>> to not include this patch. Please drop it from this series.
>>
>> Maciej's fix:
>> https://lore.kernel.org/intel-wired-lan/20260414121405.631092-1-
>> maciej.fijalkowski@intel.com/T/#u
>>
>> ask for reject:
>> https://lore.kernel.org/intel-wired-lan/
>> PH0PR11MB75223C8A00C3183C5082A096A0252@PH0PR11MB7522.namprd11.prod.outlook.com/T/#mbac55f7219d7855a2e5d1527904b2da43ad080cb
>>
> 
> Ugh, sorry for failing to notice this when batching this series up :(
> 
> Thanks,
> Jake
> 

Jakub,

Can you discard this patch out of the series when applying? Or should I
go ahead and send a v2?

Thanks,
Jake

^ permalink raw reply

* Re: [PATCH v2 iwl-net] i40e: keep q_vectors array in sync with channel count changes
From: Jacob Keller @ 2026-04-16 20:51 UTC (permalink / raw)
  To: Maciej Fijalkowski, intel-wired-lan
  Cc: netdev, magnus.karlsson, kuba, pabeni, horms, przemyslaw.kitszel
In-Reply-To: <20260416114046.642171-1-maciej.fijalkowski@intel.com>

On 4/16/2026 4:40 AM, Maciej Fijalkowski wrote:
> For the main VSI, i40e_set_num_rings_in_vsi() always derives
> num_q_vectors from pf->num_lan_msix. At the same time, ethtool -L stores
> the user requested channel count in vsi->req_queue_pairs and the queue
> setup path uses that value for the effective number of queue pairs.
> 
> This leaves queue and vector counts out of sync after shrinking channel
> count via ethtool -L. The active queue configuration is reduced, but the
> VSI still keeps the full PF-sized q_vector topology.
> 
> That mismatch breaks reconfiguration flows which rely on vector/NAPI
> state matching the effective channel configuration. In particular,
> toggling /sys/class/net/<dev>/threaded after reducing the channel count
> can hang, and later channel-count changes can fail because VSI reinit
> does not rebuild q_vectors to match the new vector count.
> 
> Fix this by making the main VSI num_q_vectors follow the effective
> requested channel count, capped by the available MSI-X vectors. Update
> i40e_vsi_reinit_setup() to rebuild q_vectors during VSI reinit so the
> vector topology is refreshed together with the ring arrays when channel
> count changes.
> 
> Keep alloc_queue_pairs unchanged and based on pf->num_lan_qps so the VSI
> retains its full queue capacity.
> 
> Selftest napi_threaded.py was originally used when Jakub reported hang
> on /sys/class/net/<dev>/threaded toggle. In order to make it pass on
> i40e, use persistent NAPI configuration for q_vector NAPIs so NAPI
> identity and threaded settings survive q_vector reallocation across
> channel-count changes. This is achieved by using netif_napi_add_config()
> when configuring q_vectors.
> 
> $ export NETIF=ens259f1np1
> $ sudo -E env PATH="$PATH" ./tools/testing/selftests/drivers/net/napi_threaded.py
> TAP version 13
> 1..3
> ok 1 napi_threaded.napi_init
> ok 2 napi_threaded.change_num_queues
> ok 3 napi_threaded.enable_dev_threaded_disable_napi_threaded
> Totals: pass:3 fail:0 xfail:0 xpass:0 skip:0 error:0
> 
> Reported-by: Jakub Kicinski <kuba@kernel.org>
> Closes: https://lore.kernel.org/intel-wired-lan/20260316133100.6054a11f@kernel.org/
> Fixes: d2a69fefd756 ("i40e: Fix changing previously set num_queue_pairs for PFs")
> Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
> ---
> v2:
> - NULL vsi->tx_rings in i40e_vsi_alloc_arrays() (Sashiko)
> ---

Thanks Maciej,

I'll go ahead and replace the older version of the fix on dev-queue
today. Apologies for missing the exchange related to this and the
previous fix :(

Thanks,
Jake

^ permalink raw reply

* Re: [PATCH net] eth: fbnic: fix double-free of PCS on phylink creation failure
From: Andrew Lunn @ 2026-04-16 21:00 UTC (permalink / raw)
  To: Bobby Eshleman
  Cc: Alexander Duyck, Jakub Kicinski, kernel-team, Andrew Lunn,
	David S. Miller, Eric Dumazet, Paolo Abeni, Russell King, netdev,
	linux-kernel, Bobby Eshleman
In-Reply-To: <20260416-fbnic-pcs-fix-v1-1-ac4b6badeac0@meta.com>

On Thu, Apr 16, 2026 at 12:31:39PM -0700, Bobby Eshleman wrote:
> From: Bobby Eshleman <bobbyeshleman@meta.com>
> 
> fbnic_phylink_create() stores the newly allocated PCS in fbn->pcs before
> calling phylink_create(). When phylink_create() fails the error path
> calls xpcs_destroy_pcs(pcs) to release the PCS, but neglects to clear
> fbn->pcs.
> 
> The caller, fbnic_netdev_alloc(), responds to the failure by calling
> fbnic_netdev_free() which in turn calls fbnic_phylink_destroy().

Isn't the real problem here? fbnic_phylink_create() failed, and it
correctly did its cleanup. Because it failed, you should not be
calling fbnic_phylink_destroy(). You only call the _destroy() if the
previous _create() was successful.

    Andrew

---
pw-bot: cr

^ permalink raw reply

* Re: [PATCH net,v2 00/11] Netfilter/IPVS fixes for net
From: Florian Westphal @ 2026-04-16 21:16 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: netfilter-devel, davem, netdev, kuba, pabeni, edumazet, horms
In-Reply-To: <20260416131453.308611-1-pablo@netfilter.org>

Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> v2: Keep back patches that have lengthy feedback by AI, they might
>     need more work.

sashiko findings response:
  ↳ [2026-04-16] Pablo Neira Ayuso <pablo@netfilter.org>: [PATCH net 01/11] netfilter: arp_tables: fix IEEE1394 ARP payload parsing in arp_packet_match()

yes, arpt_mangle.c has same bug pattern, will follow up.

  ↳ [2026-04-16] Pablo Neira Ayuso <pablo@netfilter.org>: [PATCH net 02/11] netfilter: nfnetlink_osf: fix divide-by-zero in OSF_WSS_MODULO
  ↳ [2026-04-16] Pablo Neira Ayuso <pablo@netfilter.org>: [PATCH net 03/11] netfilter: nft_osf: restrict it to ipv4
  ↳ [2026-04-16] Pablo Neira Ayuso <pablo@netfilter.org>: [PATCH net 04/11] netfilter: nfnetlink_osf: fix null-ptr-deref in nf_osf_ttl

yes, osf has more issues, I asked Fernando to investigate. Brief glance
the reports are accurate but these are NOT new issues added by these 3
fixes.

 ↳ [2026-04-16] Pablo Neira Ayuso <pablo@netfilter.org>: [PATCH net 07/11] netfilter: nat: use kfree_rcu to release ops

shashiko wants /kfree/kfree_rcu/ in error unwind path and I think we
should just do it.  Its an error path so it makes no practical
difference.  Also, with upcoming -next patch to dump the nat
hooks too it would be required.

  ↳ [2026-04-16] Pablo Neira Ayuso <pablo@netfilter.org>: [PATCH net 11/11] netfilter: nf_tables: join hook list via splice_list_rcu() in commit phase

report is accurate BUT this issue is already known and not a regression
added here.

The fix for this bug was in v1 PR but it needs more work and will come
in a followup batch.

If you don't want to take this v2 because of above issues, please
consider at least applying

  ↳ [2026-04-16] Pablo Neira Ayuso <pablo@netfilter.org>: [PATCH net 08/11] ipvs: fix MTU check for GSO packets in tunnel mode
  ↳ [2026-04-16] Pablo Neira Ayuso <pablo@netfilter.org>: [PATCH net 09/11] netfilter: nf_tables: use list_del_rcu for netlink hooks
  ↳ [2026-04-16] Pablo Neira Ayuso <pablo@netfilter.org>: [PATCH net 10/11] rculist: add list_splice_rcu() for private lists
  ↳ [2026-04-16] Pablo Neira Ayuso <pablo@netfilter.org>: [PATCH net 05/11] netfilter: conntrack: remove sprintf usage
  ↳ [2026-04-16] Pablo Neira Ayuso <pablo@netfilter.org>: [PATCH net 06/11] netfilter: xtables: restrict several matches to inet family

manually.  nf:main always tracks net:main, applying them manually
doesn't cause issues.

I hope we get shashiko to also digest netfilter-devel;
otherwise this situation will persist forever or can
dissolve nf-devel and spam netdev@ directly :-|

^ permalink raw reply

* Re: [PATCH net 1/1] mptcp: hold subflow request owners when cloning reqsk
From: Matthieu Baerts @ 2026-04-16 21:18 UTC (permalink / raw)
  To: Kuniyuki Iwashima
  Cc: Ren Wei, netdev, mptcp, davem, edumazet, kuba, pabeni, horms,
	ncardwell, dsahern, martineau, geliang, daniel, kafai, yuantan098,
	yifanwucs, tomapufckgml, bird, caoruide123, enjou1224z
In-Reply-To: <CAAVpQUDzh-dGsQpBCZjN3rUsoDc2QjzWh-o5yVWoBWDQNXbjmQ@mail.gmail.com>

Hi Kuniyuki,

Thank you for your reply!

16 Apr 2026 20:48:58 Kuniyuki Iwashima <kuniyu@google.com>:

> On Thu, Apr 16, 2026 at 10:45 AM Matthieu Baerts <matttbe@kernel.org> wrote:
>>
>> Hi Ren,
>>
>> On 15/04/2026 11:31, Ren Wei wrote:
>>> From: Ruide Cao <caoruide123@gmail.com>
>>>
>>> TCP request migration clones pending request sockets with
>>> inet_reqsk_clone(). For MPTCP MP_JOIN requests this raw-copies
>>> subflow_req->msk, but the cloned request does not take a new reference.
>>>
>>> Both the original and the cloned request can later drop the same msk in
>>> subflow_req_destructor(), and a migrated request may keep a dangling msk
>>> pointer after the original owner has already been released.
>>>
>>> Add a request_sock clone callback and let MPTCP grab a reference for cloned
>>> subflow requests that carry an msk. This keeps ownership balanced across
>>> both successful migrations and failed clone/insert paths without changing
>>> other protocols.

(...)

>>> diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
>>> index e961936b6be7..140a9e96ad58 100644
>>> --- a/net/ipv4/inet_connection_sock.c
>>> +++ b/net/ipv4/inet_connection_sock.c
>>> @@ -954,6 +954,9 @@ static struct request_sock *inet_reqsk_clone(struct request_sock *req,
>>>       if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(nreq)->tfo_listener)
>>>               rcu_assign_pointer(tcp_sk(nreq->sk)->fastopen_rsk, nreq);
>>
>> (Maybe TCP with fastopen could be this other user to call
>> rcu_assign_pointer()? (net-next material))
>>
>>> +     if (req->rsk_ops->init_clone)
>>> +             req->rsk_ops->init_clone(req, nreq);
>
> I think a simple direct call is better.
>
> #ifdef CONFIG_MPTCP
>     if (tcp_rsk(req)->is_mptcp)
>         mptcp_reqsk_clone(nreq);
> #endif

Fine by me!

I guess it is needed to check the protocol, similar to what is fine with
TFO above:

  if (sk->sk_protocol == IPPROTO_TCP) {
      if TFO
          ...
      if MPTCP (+ifdef)
          ...
  }

Cheers,
Matt

^ permalink raw reply

* Re: [PATCH v3 1/4] rust: netlink: add raw netlink abstraction
From: Andrew Lunn @ 2026-04-16 21:19 UTC (permalink / raw)
  To: Matthew Maurer
  Cc: Alice Ryhl, Miguel Ojeda, Boqun Feng, Gary Guo,
	Björn Roy Baron, Benno Lossin, Andreas Hindborg,
	Trevor Gross, Danilo Krummrich, Donald Hunter, Jakub Kicinski,
	David S. Miller, Eric Dumazet, Paolo Abeni, Simon Horman,
	Greg Kroah-Hartman, Arve Hjønnevåg, Todd Kjos,
	Christian Brauner, Carlos Llamas, linux-kernel, rust-for-linux,
	netdev
In-Reply-To: <CAGSQo03b4tTQW=bXiRrmjvF8NympD_dcJX=2jwPdMmguXLfifQ@mail.gmail.com>

On Thu, Apr 16, 2026 at 01:06:42PM -0700, Matthew Maurer wrote:
> > +    /// Sends the generic netlink message as a multicast message.
> > +    #[inline]
> > +    pub fn multicast(
> > +        self,
> > +        family: &'static Family,
> > +        portid: u32,
> > +        group: u32,
> > +        flags: alloc::Flags,
> > +    ) -> Result {
> > +        let me = ManuallyDrop::new(self);
> > +        // SAFETY: The `skb` and `family` pointers are valid. We pass ownership of the `skb` to
> > +        // `genlmsg_multicast` by not dropping `self`.

Hi Matthew

Please trim when replying, to just the needed context.

> I think if genlmsg_multicast returns an error code we may need to drop
> to avoid leaking. Specifically, there is at least this path:
> 1. Set group to a large number (that's an unconstrained public parameter)
> 2. We suppress drop
> 3. We call genlmsg_multicast
> 4. We call genlmsg_multicast_netns
> 4. We call genlmsg_multicast_netns_filtered, which does an inbounds
> check for the `group`. If it is too large, it returns EINVAL without
> consuming the SKB - include/net/genetlink.h:493
> 5. We leak the skb
> 
> However, at the same time, if we pass that check and descend into
> `netlink_broadcast_filtered`, it will unconditionally consume the SKB,
> and possibly return an error code in other situations.

A quick grep of the code suggests very few callers of
genlmsg_multicast look at the return code.

drivers/scsi/pmcraid.c prints an error message, but does nothing with
the skb.

drivers/regulator/event.c returns the error code to its caller, which
discards is, and the skb is leaked.

net/ieee802154/netlink.c returns the error code up the call stack but
leaks the skb.

net/nfc/netlink.c returns the error code up the call stack but leaks
the skb.

So i would agree with you, freeing it on error somewhere within
genlmsg_multicast() would make sense.

	Andrew

^ permalink raw reply

* [PATCH net] virtio_net: sync rss_trailer.max_tx_vq on queue_pairs change via VQ_PAIRS_SET
From: Brett Creeley @ 2026-04-16 21:21 UTC (permalink / raw)
  To: mst, jasowang, andrew+netdev, davem, edumazet, kuba, pabeni,
	netdev
  Cc: brett.creeley

When netif_is_rxfh_configured() is true (i.e., the user has explicitly
configured the RSS indirection table), virtnet_set_queues() skips the
RSS update path and falls through to the VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET
command to change the number of queue pairs. However, it does not update
vi->rss_trailer.max_tx_vq to reflect the new queue_pairs value.

This causes a mismatch between vi->curr_queue_pairs and
vi->rss_trailer.max_tx_vq. Any subsequent RSS reconfiguration (e.g.,
via ethtool -X) calls virtnet_commit_rss_command(), which sends the
stale max_tx_vq to the device, silently reverting the queue count.

Reproduction:
1. User configured RSS
  ethtool -X eth0 equal 8
2. VQ_PAIRS_SET path; max_tx_vq stays 16
  ethtool -L eth0 combined 12
3. RSS commit uses max_tx_vq=16 instead of 12
  ethtool -X eth0 equal 4

Fix this by updating vi->rss_trailer.max_tx_vq after a successful
VQ_PAIRS_SET command when RSS is enabled, keeping it in sync with
curr_queue_pairs.

Fixes: 50bfcaedd78e ("virtio_net: Update rss when set queue")
Assisted-by: Claude: claude-opus-4.6
Signed-off-by: Brett Creeley <brett.creeley@amd.com>
---
 drivers/net/virtio_net.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index bfb566fecb92..f4adcfee7a80 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -3759,6 +3759,12 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
 			 queue_pairs);
 		return -EINVAL;
 	}
+
+	/* Keep max_tx_vq in sync so that a later RSS command does not
+	 * revert queue_pairs to a stale value.
+	 */
+	if (vi->has_rss)
+		vi->rss_trailer.max_tx_vq = cpu_to_le16(queue_pairs);
 succ:
 	vi->curr_queue_pairs = queue_pairs;
 	if (dev->flags & IFF_UP) {
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH v3 1/4] rust: netlink: add raw netlink abstraction
From: Matthew Maurer @ 2026-04-16 21:54 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: Alice Ryhl, Miguel Ojeda, Boqun Feng, Gary Guo,
	Björn Roy Baron, Benno Lossin, Andreas Hindborg,
	Trevor Gross, Danilo Krummrich, Donald Hunter, Jakub Kicinski,
	David S. Miller, Eric Dumazet, Paolo Abeni, Simon Horman,
	Greg Kroah-Hartman, Arve Hjønnevåg, Todd Kjos,
	Christian Brauner, Carlos Llamas, linux-kernel, rust-for-linux,
	netdev
In-Reply-To: <845b36ba-7b3a-41f2-acb2-b284f253e2ca@lunn.ch>

>
> So i would agree with you, freeing it on error somewhere within
> genlmsg_multicast() would make sense.
>
>         Andrew

If this should be fixed separately, then for the rust part:

Reviewed-by: Matthew Maurer <mmaurer@google.com>

with the caveat that documentation is added to the two `const_new`
functions to indicate that they are expected to panic on error, and
should only be used at const time.

^ permalink raw reply

* Re: [PATCH net] eth: fbnic: fix double-free of PCS on phylink creation failure
From: Bobby Eshleman @ 2026-04-16 21:54 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: Alexander Duyck, Jakub Kicinski, kernel-team, Andrew Lunn,
	David S. Miller, Eric Dumazet, Paolo Abeni, Russell King, netdev,
	linux-kernel, Bobby Eshleman
In-Reply-To: <1873b5ba-419f-4247-ab92-1b5e899458e8@lunn.ch>

On Thu, Apr 16, 2026 at 11:00:01PM +0200, Andrew Lunn wrote:
> On Thu, Apr 16, 2026 at 12:31:39PM -0700, Bobby Eshleman wrote:
> > From: Bobby Eshleman <bobbyeshleman@meta.com>
> > 
> > fbnic_phylink_create() stores the newly allocated PCS in fbn->pcs before
> > calling phylink_create(). When phylink_create() fails the error path
> > calls xpcs_destroy_pcs(pcs) to release the PCS, but neglects to clear
> > fbn->pcs.
> > 
> > The caller, fbnic_netdev_alloc(), responds to the failure by calling
> > fbnic_netdev_free() which in turn calls fbnic_phylink_destroy().
> 
> Isn't the real problem here? fbnic_phylink_create() failed, and it
> correctly did its cleanup. Because it failed, you should not be
> calling fbnic_phylink_destroy(). You only call the _destroy() if the
> previous _create() was successful.
> 
>     Andrew
> 
> ---
> pw-bot: cr

True, the real issue is the _create/_destroy call asymmetry.

How about something like this (untested)?:

diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c
index e3ca5fcfabef..2a6a73393732 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c
@@ -818,7 +818,8 @@ struct net_device *fbnic_netdev_alloc(struct fbnic_dev *fbd)
 	netif_tx_stop_all_queues(netdev);
 
 	if (fbnic_phylink_create(netdev)) {
-		fbnic_netdev_free(fbd);
+		free_netdev(netdev);
+		fbd->netdev = NULL;
 		return NULL;
 	}
 
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_phylink.c b/drivers/net/ethernet/meta/fbnic/fbnic_phylink.c
index 09c5225111be..50240e6c2ee9 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_phylink.c
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_phylink.c
@@ -237,6 +237,7 @@ int fbnic_phylink_create(struct net_device *netdev)
 		dev_err(netdev->dev.parent,
 			"Failed to create Phylink interface, err: %d\n", err);
 		xpcs_destroy_pcs(pcs);
+		fbn->pcs = NULL;
 		return err;
 	}
 


I think we still probably want to be NULL-ing the pcs, even if it won't
necessarily matter anymore.

Best,
Bobby

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox