Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next 08/11] netfilter: conntrack: check NULL when retrieving ct extension
From: Pablo Neira Ayuso @ 2026-06-14 11:46 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
In-Reply-To: <20260614114605.474783-1-pablo@netfilter.org>

nf_ct_ext_find() might return NULL if ct extension is not found.

Add also the null checks to:

- nfct_help()
- nfct_help_data()
- nfct_seqadj()
- nfct_nat()

This is defensive, for safety reasons.

nf_ct_ext_find() used to return NULL if the extension is stale for
unconfirmed conntracks if the genid validation fails.

Skip NULL check in nf_nat_inet_fn() given this is valid to be NULL
for non-initialized ct nat extensions.

While at it, fetch ct helper area in nf_ct_expect_related_report() only
once and pass it on to other ancilliary functions. Replace WARN_ON()
by WARN_ON_ONCE() in nf_ct_unlink_expect_report().

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_helper.h |  2 +
 net/ipv4/netfilter/nf_nat_h323.c            | 12 ++++++
 net/ipv4/netfilter/nf_nat_pptp.c            | 14 +++++--
 net/netfilter/nf_conntrack_broadcast.c      |  3 ++
 net/netfilter/nf_conntrack_expect.c         | 33 +++++++++--------
 net/netfilter/nf_conntrack_ftp.c            |  6 +++
 net/netfilter/nf_conntrack_h323_main.c      | 18 +++++++++
 net/netfilter/nf_conntrack_pptp.c           |  9 +++++
 net/netfilter/nf_conntrack_proto_gre.c      |  9 +++++
 net/netfilter/nf_conntrack_sane.c           |  3 ++
 net/netfilter/nf_conntrack_seqadj.c         | 17 ++++++---
 net/netfilter/nf_conntrack_sip.c            | 41 ++++++++++++++++++++-
 net/netfilter/nf_nat_sip.c                  | 12 ++++++
 net/netfilter/nfnetlink_cthelper.c          |  6 +++
 14 files changed, 158 insertions(+), 27 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h
index ed93a5a1adc8..93207de4f2c8 100644
--- a/include/net/netfilter/nf_conntrack_helper.h
+++ b/include/net/netfilter/nf_conntrack_helper.h
@@ -136,6 +136,8 @@ static inline void *nfct_help_data(const struct nf_conn *ct)
 	struct nf_conn_help *help;
 
 	help = nf_ct_ext_find(ct, NF_CT_EXT_HELPER);
+	if (!help)
+		return NULL;
 
 	return (void *)help->data;
 }
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index faee20af4856..19dad54ada09 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -100,6 +100,9 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
 	__be16 port;
 	union nf_inet_addr addr;
 
+	if (!info)
+		return -1;
+
 	for (i = 0; i < count; i++) {
 		if (get_h225_addr(ct, *data, &taddr[i], &addr, &port)) {
 			if (addr.ip == ct->tuplehash[dir].tuple.src.u3.ip &&
@@ -184,6 +187,9 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
 	int i;
 	u_int16_t nated_port;
 
+	if (!info)
+		return -1;
+
 	/* Set expectations for NAT */
 	rtp_exp->saved_proto.udp.port = rtp_exp->tuple.dst.u.udp.port;
 	rtp_exp->expectfn = nf_nat_follow_master;
@@ -325,6 +331,9 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
 	int dir = CTINFO2DIR(ctinfo);
 	u_int16_t nated_port = ntohs(port);
 
+	if (!info)
+		return -1;
+
 	/* Set expectations for NAT */
 	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
 	exp->expectfn = nf_nat_follow_master;
@@ -404,6 +413,9 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
 	u_int16_t nated_port = ntohs(port);
 	union nf_inet_addr addr;
 
+	if (!info)
+		return -1;
+
 	/* Set expectations for NAT */
 	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
 	exp->expectfn = ip_nat_q931_expect;
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index fab357cc8559..fed5249001a4 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -53,11 +53,13 @@ static void pptp_nat_expected(struct nf_conn *ct,
 	struct nf_conn_nat *nat;
 
 	nat = nf_ct_nat_ext_add(ct);
-	if (WARN_ON_ONCE(!nat))
+	if (!nat)
 		return;
 
 	nat_pptp_info = &nat->help.nat_pptp_info;
 	ct_pptp_info = nfct_help_data(master);
+	if (!ct_pptp_info)
+		return;
 
 	/* And here goes the grand finale of corrosion... */
 	if (exp->dir == IP_CT_DIR_ORIGINAL) {
@@ -132,11 +134,13 @@ pptp_outbound_pkt(struct sk_buff *skb,
 	__be16 new_callid;
 	unsigned int cid_off;
 
-	if (WARN_ON_ONCE(!nat))
+	if (!nat)
 		return NF_DROP;
 
 	nat_pptp_info = &nat->help.nat_pptp_info;
 	ct_pptp_info = nfct_help_data(ct);
+	if (!ct_pptp_info)
+		return NF_DROP;
 
 	new_callid = ct_pptp_info->pns_call_id;
 
@@ -204,11 +208,13 @@ pptp_exp_gre(struct nf_conntrack_expect *expect_orig,
 	struct nf_ct_pptp_master *ct_pptp_info;
 	struct nf_nat_pptp *nat_pptp_info;
 
-	if (WARN_ON_ONCE(!nat))
+	if (!nat)
 		return;
 
 	nat_pptp_info = &nat->help.nat_pptp_info;
 	ct_pptp_info = nfct_help_data(ct);
+	if (!ct_pptp_info)
+		return;
 
 	/* save original PAC call ID in nat_info */
 	nat_pptp_info->pac_call_id = ct_pptp_info->pac_call_id;
@@ -241,7 +247,7 @@ pptp_inbound_pkt(struct sk_buff *skb,
 	__be16 new_pcid;
 	unsigned int pcid_off;
 
-	if (WARN_ON_ONCE(!nat))
+	if (!nat)
 		return NF_DROP;
 
 	nat_pptp_info = &nat->help.nat_pptp_info;
diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c
index 75e53fde6b29..400119b6320e 100644
--- a/net/netfilter/nf_conntrack_broadcast.c
+++ b/net/netfilter/nf_conntrack_broadcast.c
@@ -29,6 +29,9 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb,
 	struct nf_conn_help *help = nfct_help(ct);
 	__be32 mask = 0;
 
+	if (!help)
+		goto out;
+
 	/* we're only interested in locally generated packets */
 	if (skb->sk == NULL || !net_eq(nf_ct_net(ct), sock_net(skb->sk)))
 		goto out;
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 8e943efbdf0a..5c9b17835c28 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -52,8 +52,7 @@ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
 	struct nf_conntrack_net *cnet;
 
 	lockdep_nfct_expect_lock_held();
-	WARN_ON(!master_help);
-	WARN_ON(timer_pending(&exp->timeout));
+	WARN_ON_ONCE(timer_pending(&exp->timeout));
 
 	hlist_del_rcu(&exp->hnode);
 
@@ -61,7 +60,8 @@ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
 	cnet->expect_count--;
 
 	hlist_del_rcu(&exp->lnode);
-	master_help->expecting[exp->class]--;
+	if (master_help)
+		master_help->expecting[exp->class]--;
 
 	nf_ct_expect_event_report(IPEXP_DESTROY, exp, portid, report);
 	nf_ct_expect_put(exp);
@@ -405,10 +405,10 @@ void nf_ct_expect_put(struct nf_conntrack_expect *exp)
 }
 EXPORT_SYMBOL_GPL(nf_ct_expect_put);
 
-static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
+static void nf_ct_expect_insert(struct nf_conntrack_expect *exp,
+				struct nf_conn_help *master_help)
 {
 	struct nf_conntrack_net *cnet;
-	struct nf_conn_help *master_help = nfct_help(exp->master);
 	struct nf_conntrack_helper *helper;
 	struct net *net = nf_ct_exp_net(exp);
 	unsigned int h = nf_ct_expect_dst_hash(net, &exp->tuple);
@@ -436,10 +436,9 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
 }
 
 /* Race with expectations being used means we could have none to find; OK. */
-static void evict_oldest_expect(struct nf_conn *master,
+static void evict_oldest_expect(struct nf_conn_help *master_help,
 				struct nf_conntrack_expect *new)
 {
-	struct nf_conn_help *master_help = nfct_help(master);
 	struct nf_conntrack_expect *exp, *last = NULL;
 
 	hlist_for_each_entry(exp, &master_help->expectations, lnode) {
@@ -452,13 +451,12 @@ static void evict_oldest_expect(struct nf_conn *master,
 }
 
 static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect,
+				       struct nf_conn_help *master_help,
 				       unsigned int flags)
 {
 	const struct nf_conntrack_expect_policy *p;
 	struct nf_conntrack_expect *i;
 	struct nf_conntrack_net *cnet;
-	struct nf_conn *master = expect->master;
-	struct nf_conn_help *master_help = nfct_help(master);
 	struct nf_conntrack_helper *helper;
 	struct net *net = nf_ct_exp_net(expect);
 	struct hlist_node *next;
@@ -467,10 +465,6 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect,
 
 	lockdep_nfct_expect_lock_held();
 
-	if (!master_help) {
-		ret = -ESHUTDOWN;
-		goto out;
-	}
 	h = nf_ct_expect_dst_hash(net, &expect->tuple);
 	hlist_for_each_entry_safe(i, next, &nf_ct_expect_hash[h], hnode) {
 		if (master_matches(i, expect, flags) &&
@@ -493,7 +487,7 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect,
 		p = &helper->expect_policy[expect->class];
 		if (p->max_expected &&
 		    master_help->expecting[expect->class] >= p->max_expected) {
-			evict_oldest_expect(master, expect);
+			evict_oldest_expect(master_help, expect);
 			if (master_help->expecting[expect->class]
 						>= p->max_expected) {
 				ret = -EMFILE;
@@ -514,14 +508,21 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect,
 int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,
 				u32 portid, int report, unsigned int flags)
 {
+	struct nf_conn_help *master_help;
 	int ret;
 
 	spin_lock_bh(&nf_conntrack_expect_lock);
-	ret = __nf_ct_expect_check(expect, flags);
+	master_help = nfct_help(expect->master);
+	if (!master_help) {
+		ret = -ESHUTDOWN;
+		goto out;
+	}
+
+	ret = __nf_ct_expect_check(expect, master_help, flags);
 	if (ret < 0)
 		goto out;
 
-	nf_ct_expect_insert(expect);
+	nf_ct_expect_insert(expect, master_help);
 
 	nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report);
 	spin_unlock_bh(&nf_conntrack_expect_lock);
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index c7777f37371a..0847f845613d 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -384,6 +384,9 @@ static int help(struct sk_buff *skb,
 	int found = 0, ends_in_nl;
 	nf_nat_ftp_hook_fn *nf_nat_ftp;
 
+	if (!ct_ftp_info)
+		return NF_DROP;
+
 	/* Until there's been traffic both ways, don't look in packets. */
 	if (ctinfo != IP_CT_ESTABLISHED &&
 	    ctinfo != IP_CT_ESTABLISHED_REPLY) {
@@ -545,6 +548,9 @@ static int nf_ct_ftp_from_nlattr(struct nlattr *attr, struct nf_conn *ct)
 {
 	struct nf_ct_ftp_master *ftp = nfct_help_data(ct);
 
+	if (!ftp)
+		return -ENOENT;
+
 	/* This conntrack has been injected from user-space, always pick up
 	 * sequence tracking. Otherwise, the first FTP command after the
 	 * failover breaks.
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index ebae9fdab897..7f189dceb3c4 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -76,6 +76,9 @@ static int get_tpkt_data(struct sk_buff *skb, unsigned int protoff,
 	int tpktlen;
 	int tpktoff;
 
+	if (!info)
+		return 0;
+
 	/* Get TCP header */
 	th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph);
 	if (th == NULL)
@@ -1191,6 +1194,9 @@ static int expect_q931(struct sk_buff *skb, struct nf_conn *ct,
 	union nf_inet_addr addr;
 	struct nf_conntrack_expect *exp;
 
+	if (!info)
+		return -1;
+
 	/* Look for the first related address */
 	for (i = 0; i < count; i++) {
 		if (get_h225_addr(ct, *data, &taddr[i], &addr, &port) &&
@@ -1307,6 +1313,9 @@ static int process_rrq(struct sk_buff *skb, struct nf_conn *ct,
 	const struct nfct_h323_nat_hooks *nathook;
 	int ret;
 
+	if (!info)
+		return -1;
+
 	pr_debug("nf_ct_ras: RRQ\n");
 
 	ret = expect_q931(skb, ct, ctinfo, protoff, data,
@@ -1345,6 +1354,9 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct,
 	int ret;
 	struct nf_conntrack_expect *exp;
 
+	if (!info)
+		return -1;
+
 	pr_debug("nf_ct_ras: RCF\n");
 
 	nathook = rcu_dereference(nfct_h323_nat_hook);
@@ -1395,6 +1407,9 @@ static int process_urq(struct sk_buff *skb, struct nf_conn *ct,
 	int dir = CTINFO2DIR(ctinfo);
 	int ret;
 
+	if (!info)
+		return -1;
+
 	pr_debug("nf_ct_ras: URQ\n");
 
 	nathook = rcu_dereference(nfct_h323_nat_hook);
@@ -1429,6 +1444,9 @@ static int process_arq(struct sk_buff *skb, struct nf_conn *ct,
 	__be16 port;
 	union nf_inet_addr addr;
 
+	if (!info)
+		return 0;
+
 	pr_debug("nf_ct_ras: ARQ\n");
 
 	nathook = rcu_dereference(nfct_h323_nat_hook);
diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c
index ed567a1cf7fd..776505a78e64 100644
--- a/net/netfilter/nf_conntrack_pptp.c
+++ b/net/netfilter/nf_conntrack_pptp.c
@@ -198,6 +198,9 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff,
 	u_int16_t msg;
 	__be16 cid = 0, pcid = 0;
 
+	if (!info)
+		return NF_DROP;
+
 	msg = ntohs(ctlh->messageType);
 	pr_debug("inbound control message %s\n", pptp_msg_name(msg));
 
@@ -325,6 +328,9 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff,
 	u_int16_t msg;
 	__be16 cid = 0, pcid = 0;
 
+	if (!info)
+		return NF_DROP;
+
 	msg = ntohs(ctlh->messageType);
 	pr_debug("outbound control message %s\n", pptp_msg_name(msg));
 
@@ -443,6 +449,9 @@ conntrack_pptp_help(struct sk_buff *skb, unsigned int protoff,
 	int ret;
 	u_int16_t msg;
 
+	if (!info)
+		return NF_DROP;
+
 #if IS_ENABLED(CONFIG_NF_NAT)
 	if (!nf_ct_is_confirmed(ct) && (ct->status & IPS_NAT_MASK)) {
 		struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT);
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index 473658259f1a..616ab1e2fc5e 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -131,6 +131,9 @@ bool nf_ct_gre_keymap_add(struct nf_conn *ct,
 	struct nf_ct_gre_keymap *km_orig, *km_repl;
 	bool ret = false;
 
+	if (!ct_pptp_info)
+		return false;
+
 	km_orig = kmalloc_obj(*km_orig, GFP_ATOMIC);
 	if (!km_orig)
 		return false;
@@ -187,6 +190,9 @@ void nf_ct_gre_keymap_destroy(struct nf_conn *ct)
 	struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct);
 	enum ip_conntrack_dir dir;
 
+	if (!ct_pptp_info)
+		return;
+
 	pr_debug("entering for ct %p\n", ct);
 
 	spin_lock_bh(&keymap_lock);
@@ -389,6 +395,9 @@ void gre_pptp_destroy_siblings(struct nf_conn *ct)
 	const struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct);
 	struct nf_conntrack_tuple t;
 
+	if (!ct_pptp_info)
+		return;
+
 	nf_ct_gre_keymap_destroy(ct);
 
 	/* try original (pns->pac) tuple */
diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c
index a7f7b07ba0c2..39085acf7a71 100644
--- a/net/netfilter/nf_conntrack_sane.c
+++ b/net/netfilter/nf_conntrack_sane.c
@@ -74,6 +74,9 @@ static int help(struct sk_buff *skb,
 		struct sane_reply_net_start repl;
 	} buf;
 
+	if (!ct_sane_info)
+		return NF_DROP;
+
 	/* Until there's been traffic both ways, don't look in packets. */
 	if (ctinfo != IP_CT_ESTABLISHED &&
 	    ctinfo != IP_CT_ESTABLISHED_REPLY)
diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c
index b7e99f34dfce..220216a4edc5 100644
--- a/net/netfilter/nf_conntrack_seqadj.c
+++ b/net/netfilter/nf_conntrack_seqadj.c
@@ -18,9 +18,12 @@ int nf_ct_seqadj_init(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
 		return 0;
 
 	spin_lock_bh(&ct->lock);
-	set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
-
 	seqadj = nfct_seqadj(ct);
+	if (!seqadj) {
+		spin_unlock_bh(&ct->lock);
+		return 0;
+	}
+	set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
 	this_way = &seqadj->seq[dir];
 	this_way->offset_before	 = off;
 	this_way->offset_after	 = off;
@@ -39,10 +42,8 @@ int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
 	if (off == 0)
 		return 0;
 
-	if (unlikely(!seqadj)) {
-		WARN_ONCE(1, "Missing nfct_seqadj_ext_add() setup call\n");
+	if (unlikely(!seqadj))
 		return 0;
-	}
 
 	set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
 
@@ -125,6 +126,9 @@ static unsigned int nf_ct_sack_adjust(struct sk_buff *skb,
 	struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
 	unsigned int dir, optoff, optend;
 
+	if (!seqadj)
+		return 0;
+
 	optoff = protoff + sizeof(struct tcphdr);
 	optend = protoff + tcph->doff * 4;
 
@@ -175,6 +179,9 @@ int nf_ct_seq_adjust(struct sk_buff *skb,
 	struct nf_ct_seqadj *this_way, *other_way;
 	int res = 1;
 
+	if (!seqadj)
+		return 0;
+
 	this_way  = &seqadj->seq[dir];
 	other_way = &seqadj->seq[!dir];
 
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 2c78a3e1dab5..c606d1f60b58 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -887,6 +887,9 @@ static int refresh_signalling_expectation(struct nf_conn *ct,
 	struct hlist_node *next;
 	int found = 0;
 
+	if (!help)
+		return 0;
+
 	spin_lock_bh(&nf_conntrack_expect_lock);
 	hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) {
 		if (exp->class != SIP_EXPECT_SIGNALLING ||
@@ -910,6 +913,9 @@ static void flush_expectations(struct nf_conn *ct, bool media)
 	struct nf_conntrack_expect *exp;
 	struct hlist_node *next;
 
+	if (!help)
+		return;
+
 	spin_lock_bh(&nf_conntrack_expect_lock);
 	hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) {
 		if ((exp->class != SIP_EXPECT_SIGNALLING) ^ media)
@@ -940,6 +946,11 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff,
 	u_int16_t base_port;
 	__be16 rtp_port, rtcp_port;
 	const struct nf_nat_sip_hooks *hooks;
+	struct nf_conn_help *help;
+
+	help = nfct_help(ct);
+	if (!help)
+		return NF_DROP;
 
 	saddr = NULL;
 	if (sip_direct_media) {
@@ -1002,7 +1013,7 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff,
 		exp = __nf_ct_expect_find(net, nf_ct_zone(ct), &tuple);
 
 		if (!exp || exp->master == ct ||
-		    exp->helper != nfct_help(ct)->helper ||
+		    exp->helper != help->helper ||
 		    exp->class != class)
 			break;
 #if IS_ENABLED(CONFIG_NF_NAT)
@@ -1227,6 +1238,9 @@ static int process_invite_response(struct sk_buff *skb, unsigned int protoff,
 	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
 	struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
 
+	if (!ct_sip_info)
+		return NF_DROP;
+
 	if ((code >= 100 && code <= 199) ||
 	    (code >= 200 && code <= 299))
 		return process_sdp(skb, protoff, dataoff, dptr, datalen, cseq);
@@ -1244,6 +1258,9 @@ static int process_update_response(struct sk_buff *skb, unsigned int protoff,
 	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
 	struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
 
+	if (!ct_sip_info)
+		return NF_DROP;
+
 	if ((code >= 100 && code <= 199) ||
 	    (code >= 200 && code <= 299))
 		return process_sdp(skb, protoff, dataoff, dptr, datalen, cseq);
@@ -1261,6 +1278,9 @@ static int process_prack_response(struct sk_buff *skb, unsigned int protoff,
 	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
 	struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
 
+	if (!ct_sip_info)
+		return NF_DROP;
+
 	if ((code >= 100 && code <= 199) ||
 	    (code >= 200 && code <= 299))
 		return process_sdp(skb, protoff, dataoff, dptr, datalen, cseq);
@@ -1279,6 +1299,9 @@ static int process_invite_request(struct sk_buff *skb, unsigned int protoff,
 	struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
 	unsigned int ret;
 
+	if (!ct_sip_info)
+		return NF_DROP;
+
 	flush_expectations(ct, true);
 	ret = process_sdp(skb, protoff, dataoff, dptr, datalen, cseq);
 	if (ret == NF_ACCEPT)
@@ -1316,11 +1339,15 @@ static int process_register_request(struct sk_buff *skb, unsigned int protoff,
 	union nf_inet_addr *saddr, daddr;
 	const struct nf_nat_sip_hooks *hooks;
 	struct nf_conntrack_helper *helper;
+	struct nf_conn_help *help;
 	__be16 port;
 	u8 proto;
 	unsigned int expires = 0;
 	int ret;
 
+	if (!ct_sip_info)
+		return NF_DROP;
+
 	/* Expected connections can not register again. */
 	if (ct->status & IPS_EXPECTED)
 		return NF_ACCEPT;
@@ -1366,7 +1393,11 @@ static int process_register_request(struct sk_buff *skb, unsigned int protoff,
 		goto store_cseq;
 	}
 
-	helper = rcu_dereference(nfct_help(ct)->helper);
+	help = nfct_help(ct);
+	if (!help)
+		return NF_DROP;
+
+	helper = rcu_dereference(help->helper);
 	if (!helper)
 		return NF_DROP;
 
@@ -1421,6 +1452,9 @@ static int process_register_response(struct sk_buff *skb, unsigned int protoff,
 	unsigned int expires = 0;
 	int in_contact = 0, ret;
 
+	if (!ct_sip_info)
+		return NF_DROP;
+
 	/* According to RFC 3261, "UAs MUST NOT send a new registration until
 	 * they have received a final response from the registrar for the
 	 * previous one or the previous REGISTER request has timed out".
@@ -1550,6 +1584,9 @@ static int process_sip_request(struct sk_buff *skb, unsigned int protoff,
 	union nf_inet_addr addr;
 	__be16 port;
 
+	if (!ct_sip_info)
+		return NF_DROP;
+
 	/* Many Cisco IP phones use a high source port for SIP requests, but
 	 * listen for the response on port 5060.  If we are the local
 	 * router for one of these phones, save the port number from the
diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c
index 9fbfc6bff0c2..7f29a6785327 100644
--- a/net/netfilter/nf_nat_sip.c
+++ b/net/netfilter/nf_nat_sip.c
@@ -106,6 +106,9 @@ static int map_addr(struct sk_buff *skb, unsigned int protoff,
 	union nf_inet_addr newaddr;
 	__be16 newport;
 
+	if (!ct_sip_info)
+		return 0;
+
 	if (nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, addr) &&
 	    ct->tuplehash[dir].tuple.src.u.udp.port == port) {
 		newaddr = ct->tuplehash[!dir].tuple.dst.u3;
@@ -158,6 +161,9 @@ static unsigned int nf_nat_sip(struct sk_buff *skb, unsigned int protoff,
 	__be16 port;
 	int request, in_header;
 
+	if (!ct_sip_info)
+		return NF_DROP;
+
 	/* Basic rules: requests and responses. */
 	if (strncasecmp(*dptr, "SIP/2.0", strlen("SIP/2.0")) != 0) {
 		if (ct_sip_parse_request(ct, *dptr, *datalen,
@@ -326,6 +332,9 @@ static void nf_nat_sip_expected(struct nf_conn *ct,
 	int range_set_for_snat = 0;
 	struct nf_nat_range2 range;
 
+	if (!help)
+		return;
+
 	/* This must be a fresh one. */
 	BUG_ON(ct->status & IPS_NAT_DONE_MASK);
 
@@ -390,6 +399,9 @@ static unsigned int nf_nat_sip_expect(struct sk_buff *skb, unsigned int protoff,
 	char buffer[INET6_ADDRSTRLEN + sizeof("[]:nnnnn")];
 	unsigned int buflen;
 
+	if (!ct_sip_info)
+		return NF_DROP;
+
 	/* Connection will come from reply */
 	if (nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3,
 			     &ct->tuplehash[!dir].tuple.dst.u3))
diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c
index 033ea90c4401..f1460b683d7a 100644
--- a/net/netfilter/nfnetlink_cthelper.c
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -101,6 +101,9 @@ nfnl_cthelper_from_nlattr(struct nlattr *attr, struct nf_conn *ct)
 	struct nf_conn_help *help = nfct_help(ct);
 	const struct nf_conntrack_helper *helper;
 
+	if (!help)
+		return -EINVAL;
+
 	if (attr == NULL)
 		return -EINVAL;
 
@@ -118,6 +121,9 @@ nfnl_cthelper_to_nlattr(struct sk_buff *skb, const struct nf_conn *ct)
 	const struct nf_conn_help *help = nfct_help(ct);
 	const struct nf_conntrack_helper *helper;
 
+	if (!help)
+		return 0;
+
 	helper = rcu_dereference(help->helper);
 	if (helper && helper->data_len &&
 	    nla_put(skb, CTA_HELP_INFO, helper->data_len, &help->data))
-- 
2.47.3


^ permalink raw reply related

* [PATCH net-next 09/11] netfilter: flowtable: bail out if forward path cannot be discovered
From: Pablo Neira Ayuso @ 2026-06-14 11:46 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
In-Reply-To: <20260614114605.474783-1-pablo@netfilter.org>

If forward path discovery fails for any reason or netdevice is not
registered for this flowtable, then bail out to classic forwarding path
rather than providing incomplete forwarding path.

Update the existing forward path parser functions to report an error
so the flow_offload expressions gives up on setting up the flowtable
entry.

Link: https://sashiko.dev/#/patchset/20260607094954.48892-15-pablo%40netfilter.org?part=14
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_flow_table_path.c | 81 +++++++++++++++++-------------
 1 file changed, 46 insertions(+), 35 deletions(-)

diff --git a/net/netfilter/nf_flow_table_path.c b/net/netfilter/nf_flow_table_path.c
index a3e6b82f2f8e..1e7e216b9f89 100644
--- a/net/netfilter/nf_flow_table_path.c
+++ b/net/netfilter/nf_flow_table_path.c
@@ -90,9 +90,9 @@ struct nft_forward_info {
 	enum flow_offload_xmit_type xmit_type;
 };
 
-static void nft_dev_path_info(const struct net_device_path_stack *stack,
-			      struct nft_forward_info *info,
-			      unsigned char *ha, struct nf_flowtable *flowtable)
+static int nft_dev_path_info(const struct net_device_path_stack *stack,
+			     struct nft_forward_info *info,
+			     unsigned char *ha, struct nf_flowtable *flowtable)
 {
 	const struct net_device_path *path;
 	int i;
@@ -120,19 +120,17 @@ static void nft_dev_path_info(const struct net_device_path_stack *stack,
 
 			/* DEV_PATH_VLAN, DEV_PATH_PPPOE and DEV_PATH_TUN */
 			if (path->type == DEV_PATH_TUN) {
-				if (info->num_tuns) {
-					info->indev = NULL;
-					break;
-				}
+				if (info->num_tuns)
+					return -1;
+
 				info->tun.src_v6 = path->tun.src_v6;
 				info->tun.dst_v6 = path->tun.dst_v6;
 				info->tun.l3_proto = path->tun.l3_proto;
 				info->num_tuns++;
 			} else {
-				if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) {
-					info->indev = NULL;
-					break;
-				}
+				if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX)
+					return -1;
+
 				info->encap[info->num_encaps].id =
 					path->encap.id;
 				info->encap[info->num_encaps].proto =
@@ -151,22 +149,23 @@ static void nft_dev_path_info(const struct net_device_path_stack *stack,
 
 			switch (path->bridge.vlan_mode) {
 			case DEV_PATH_BR_VLAN_UNTAG_HW:
+				if (info->num_encaps == 0)
+					return -1;
+
 				info->ingress_vlans |= BIT(info->num_encaps - 1);
 				break;
 			case DEV_PATH_BR_VLAN_TAG:
-				if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) {
-					info->indev = NULL;
-					break;
-				}
+				if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX)
+					return -1;
+
 				info->encap[info->num_encaps].id = path->bridge.vlan_id;
 				info->encap[info->num_encaps].proto = path->bridge.vlan_proto;
 				info->num_encaps++;
 				break;
 			case DEV_PATH_BR_VLAN_UNTAG:
-				if (info->num_encaps == 0) {
-					info->indev = NULL;
-					break;
-				}
+				if (info->num_encaps == 0)
+					return -1;
+
 				info->num_encaps--;
 				break;
 			case DEV_PATH_BR_VLAN_KEEP:
@@ -175,8 +174,7 @@ static void nft_dev_path_info(const struct net_device_path_stack *stack,
 			info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
 			break;
 		default:
-			info->indev = NULL;
-			break;
+			return -1;
 		}
 	}
 	info->outdev = info->indev;
@@ -184,6 +182,8 @@ static void nft_dev_path_info(const struct net_device_path_stack *stack,
 	if (nf_flowtable_hw_offload(flowtable) &&
 	    nft_is_valid_ether_device(info->indev))
 		info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
+
+	return 0;
 }
 
 static bool nft_flowtable_find_dev(const struct net_device *dev,
@@ -241,11 +241,11 @@ static int nft_flow_tunnel_update_route(const struct nft_pktinfo *pkt,
 	return 0;
 }
 
-static void nft_dev_forward_path(const struct nft_pktinfo *pkt,
-				 struct nf_flow_route *route,
-				 const struct nf_conn *ct,
-				 enum ip_conntrack_dir dir,
-				 struct nft_flowtable *ft)
+static int nft_dev_forward_path(const struct nft_pktinfo *pkt,
+				struct nf_flow_route *route,
+				const struct nf_conn *ct,
+				enum ip_conntrack_dir dir,
+				struct nft_flowtable *ft)
 {
 	const struct dst_entry *dst = route->tuple[dir].dst;
 	struct net_device_path_stack stack;
@@ -253,15 +253,16 @@ static void nft_dev_forward_path(const struct nft_pktinfo *pkt,
 	unsigned char ha[ETH_ALEN];
 	int i;
 
-	if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0)
-		nft_dev_path_info(&stack, &info, ha, &ft->data);
+	if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) < 0 ||
+	    nft_dev_path_info(&stack, &info, ha, &ft->data) < 0)
+		return -ENOENT;
+
+	if (!nft_flowtable_find_dev(info.indev, ft))
+		return -ENOENT;
 
 	if (info.outdev)
 		route->tuple[dir].out.ifindex = info.outdev->ifindex;
 
-	if (!info.indev || !nft_flowtable_find_dev(info.indev, ft))
-		return;
-
 	route->tuple[!dir].in.ifindex = info.indev->ifindex;
 	for (i = 0; i < info.num_encaps; i++) {
 		route->tuple[!dir].in.encap[i].id = info.encap[i].id;
@@ -285,6 +286,8 @@ static void nft_dev_forward_path(const struct nft_pktinfo *pkt,
 		route->tuple[dir].xmit_type = info.xmit_type;
 	}
 	route->tuple[dir].out.needs_gso_segment = info.needs_gso_segment;
+
+	return 0;
 }
 
 int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct,
@@ -329,11 +332,19 @@ int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct,
 	nft_default_forward_path(route, this_dst, dir);
 	nft_default_forward_path(route, other_dst, !dir);
 
-	if (route->tuple[dir].xmit_type	== FLOW_OFFLOAD_XMIT_NEIGH)
-		nft_dev_forward_path(pkt, route, ct, dir, ft);
-	if (route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH)
-		nft_dev_forward_path(pkt, route, ct, !dir, ft);
+	if (route->tuple[dir].xmit_type	== FLOW_OFFLOAD_XMIT_NEIGH &&
+	    nft_dev_forward_path(pkt, route, ct, dir, ft) < 0)
+		goto err_dst_release;
+
+	if (route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH &&
+	    nft_dev_forward_path(pkt, route, ct, !dir, ft) < 0)
+		goto err_dst_release;
 
 	return 0;
+
+err_dst_release:
+	dst_release(route->tuple[dir].dst);
+	dst_release(route->tuple[!dir].dst);
+	return -ENOENT;
 }
 EXPORT_SYMBOL_GPL(nft_flow_route);
-- 
2.47.3


^ permalink raw reply related

* [PATCH net-next 10/11] ipvs: fix doc syntax for conn_max sysctl
From: Pablo Neira Ayuso @ 2026-06-14 11:46 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
In-Reply-To: <20260614114605.474783-1-pablo@netfilter.org>

From: Julian Anastasov <ja@ssi.bg>

Fix the docutils error reported by kernel test robot
for the new conn_max sysctl:

Documentation/networking/ipvs-sysctl.rst:76: WARNING: Block quote ends
without a blank line; unexpected unindent. [docutils]
Documentation/networking/ipvs-sysctl.rst:76: ERROR: Unexpected section
title or transition.

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202606071851.Dc1H7hOO-lkp@intel.com/
Fixes: 4a15044a2b06 ("ipvs: add conn_max sysctl to limit connections")
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 Documentation/networking/ipvs-sysctl.rst | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/Documentation/networking/ipvs-sysctl.rst b/Documentation/networking/ipvs-sysctl.rst
index b6bac2612420..fe36f4fcd3a0 100644
--- a/Documentation/networking/ipvs-sysctl.rst
+++ b/Documentation/networking/ipvs-sysctl.rst
@@ -72,20 +72,29 @@ conn_max - INTEGER
 	Netfilter connection tracking) the connections can be
 	limited also by nf_conntrack_max.
 
-				soft limit	hard limit
-	=====================================================
-	init_net:
+	Limits for init_net:
+
+	======================= =============== =============
+	\			soft limit	hard limit
+	======================= =============== =============
 	create netns		platform	platform
 	priv admin		0 .. platform	0 .. platform
-	=====================================================
-	new netns:
+	======================= =============== =============
+
+	Limits for new netns:
+
+	======================= =============== =============
+	\			soft limit	hard limit
+	======================= =============== =============
 	create netns		init_net:soft	init_net:soft
 	priv admin		0 .. platform	0 .. platform
 	unpriv admin		0 .. hard	N/A
+	======================= =============== =============
 
 	Limits per platform:
-	1,073,741,824 (2^30 for 64-bit)
-	   16,777,216 (2^24 for 32-bit)
+
+	- 1,073,741,824 (2^30 for 64-bit)
+	- 16,777,216 (2^24 for 32-bit)
 
 	Possible values: 0 .. platform limit
 
-- 
2.47.3


^ permalink raw reply related

* [PATCH net-next 11/11] netfilter: nf_dup_netdev: add nf_dev_xmit_recursion*() helpers and use them
From: Pablo Neira Ayuso @ 2026-06-14 11:46 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
In-Reply-To: <20260614114605.474783-1-pablo@netfilter.org>

Update nft_dup and nft_fwd to use the nf_dev_xmit_recursion() helpers.
This patch also disables BH when transmitting the skb to address a
possible migration to different CPU leading to imbalanced decrementation
of the recursion counters.

This is modeled after Florian Westphal's dev_xmit_recursion*() API
available since commit 97cdcf37b57e ("net: place xmit recursion in
softnet data") according to its current state in the tree.

Fixes: 1d47b55b36d2 ("netfilter: nft_fwd_netdev: use recursion counter in neigh egress path")
Fixes: f37ad9127039 ("netfilter: nf_dup_netdev: Move the recursion counter struct netdev_xmit")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_dup_netdev.h | 34 +++++++++++++++++++++++----
 net/netfilter/nf_dup_netdev.c         | 15 ++++++------
 net/netfilter/nft_fwd_netdev.c        | 17 ++++++++------
 3 files changed, 47 insertions(+), 19 deletions(-)

diff --git a/include/net/netfilter/nf_dup_netdev.h b/include/net/netfilter/nf_dup_netdev.h
index 609bcf422a9b..f6b05bd80c3f 100644
--- a/include/net/netfilter/nf_dup_netdev.h
+++ b/include/net/netfilter/nf_dup_netdev.h
@@ -11,15 +11,39 @@ void nf_fwd_netdev_egress(const struct nft_pktinfo *pkt, int oif);
 
 #define NF_RECURSION_LIMIT	2
 
-static inline u8 *nf_get_nf_dup_skb_recursion(void)
-{
 #ifndef CONFIG_PREEMPT_RT
-	return this_cpu_ptr(&softnet_data.xmit.nf_dup_skb_recursion);
+static inline bool nf_dev_xmit_recursion(void)
+{
+	return unlikely(__this_cpu_read(softnet_data.xmit.nf_dup_skb_recursion) >
+			NF_RECURSION_LIMIT);
+}
+
+static inline void nf_dev_xmit_recursion_inc(void)
+{
+	__this_cpu_inc(softnet_data.xmit.nf_dup_skb_recursion);
+}
+
+static inline void nf_dev_xmit_recursion_dec(void)
+{
+	__this_cpu_dec(softnet_data.xmit.nf_dup_skb_recursion);
+}
 #else
-	return &current->net_xmit.nf_dup_skb_recursion;
-#endif
+static inline bool nf_dev_xmit_recursion(void)
+{
+	return unlikely(current->net_xmit.nf_dup_skb_recursion > NF_RECURSION_LIMIT);
+}
+
+static inline void nf_dev_xmit_recursion_inc(void)
+{
+	current->net_xmit.nf_dup_skb_recursion++;
 }
 
+static inline void nf_dev_xmit_recursion_dec(void)
+{
+	current->net_xmit.nf_dup_skb_recursion--;
+}
+#endif
+
 struct nft_offload_ctx;
 struct nft_flow_rule;
 
diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c
index 3b0a70e154cd..c189716e986a 100644
--- a/net/netfilter/nf_dup_netdev.c
+++ b/net/netfilter/nf_dup_netdev.c
@@ -16,11 +16,6 @@
 static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev,
 				enum nf_dev_hooks hook)
 {
-	u8 *nf_dup_skb_recursion = nf_get_nf_dup_skb_recursion();
-
-	if (*nf_dup_skb_recursion > NF_RECURSION_LIMIT)
-		goto err;
-
 	if (hook == NF_NETDEV_INGRESS && skb_mac_header_was_set(skb)) {
 		if (skb_cow_head(skb, skb->mac_len))
 			goto err;
@@ -30,9 +25,15 @@ static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev,
 
 	skb->dev = dev;
 	skb_clear_tstamp(skb);
-	(*nf_dup_skb_recursion)++;
+	local_bh_disable();
+	if (nf_dev_xmit_recursion()) {
+		local_bh_enable();
+		goto err;
+	}
+	nf_dev_xmit_recursion_inc();
 	dev_queue_xmit(skb);
-	(*nf_dup_skb_recursion)--;
+	nf_dev_xmit_recursion_dec();
+	local_bh_enable();
 	return;
 err:
 	kfree_skb(skb);
diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c
index b9e88d7cf308..a48c2f765bba 100644
--- a/net/netfilter/nft_fwd_netdev.c
+++ b/net/netfilter/nft_fwd_netdev.c
@@ -95,7 +95,6 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr,
 			      struct nft_regs *regs,
 			      const struct nft_pktinfo *pkt)
 {
-	u8 *nf_dup_skb_recursion = nf_get_nf_dup_skb_recursion();
 	struct nft_fwd_neigh *priv = nft_expr_priv(expr);
 	void *addr = &regs->data[priv->sreg_addr];
 	int oif = regs->data[priv->sreg_dev];
@@ -154,13 +153,15 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr,
 		goto out;
 	}
 
-	if (*nf_dup_skb_recursion > NF_RECURSION_LIMIT) {
+	dev = dev_get_by_index_rcu(nft_net(pkt), oif);
+	if (!dev) {
 		verdict = NF_DROP;
 		goto out;
 	}
 
-	dev = dev_get_by_index_rcu(nft_net(pkt), oif);
-	if (dev == NULL) {
+	local_bh_disable();
+	if (nf_dev_xmit_recursion()) {
+		local_bh_enable();
 		verdict = NF_DROP;
 		goto out;
 	}
@@ -169,16 +170,18 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr,
 	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
 		skb = skb_expand_head(skb, hh_len);
 		if (!skb) {
-			verdict = NF_STOLEN;
+			local_bh_enable();
 			goto out;
 		}
 	}
 
 	skb->dev = dev;
 	skb_clear_tstamp(skb);
-	(*nf_dup_skb_recursion)++;
+
+	nf_dev_xmit_recursion_inc();
 	neigh_xmit(neigh_table, dev, addr, skb);
-	(*nf_dup_skb_recursion)--;
+	nf_dev_xmit_recursion_dec();
+	local_bh_enable();
 out:
 	regs->verdict.code = verdict;
 }
-- 
2.47.3


^ permalink raw reply related

* Re: [PATCH iproute2-next 7/7] devlink: add scope filter to resource show
From: Or Har-Toov @ 2026-06-14 12:02 UTC (permalink / raw)
  To: David Ahern, Tariq Toukan, Stephen Hemminger, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Andrew Lunn, David S. Miller
  Cc: Donald Hunter, Simon Horman, Jiri Pirko, Jonathan Corbet,
	Shuah Khan, Saeed Mahameed, Leon Romanovsky, Mark Bloch,
	Shuah Khan, Matthieu Baerts (NGI0), Chuck Lever, Carolina Jubran,
	Moshe Shemesh, Shay Drori, Dragos Tatulea, Daniel Zahka,
	Shahar Shitrit, Jacob Keller, Cosmin Ratiu, Parav Pandit,
	Kees Cook, Adithya Jayachandran, Daniel Jurgens, netdev,
	linux-kernel, linux-doc, linux-rdma, linux-kselftest,
	Gal Pressman, Ido Schimmel, Jiri Pirko, Petr Machata
In-Reply-To: <943b4932-17f4-4a52-af92-b9485a0e8c7a@kernel.org>



On 11/06/2026 21:53, David Ahern wrote:
> 
> On 6/8/26 11:39 PM, Tariq Toukan wrote:
>> @@ -9010,13 +9029,29 @@ static int cmd_resource_show(struct dl *dl)
>>        uint16_t flags = NLM_F_REQUEST | NLM_F_ACK;
>>        struct nlmsghdr *nlh;
>>        struct resource_ctx resource_ctx = {};
>> +     struct dl_opts *opts = &dl->opts;
>>        int err;
>>
>> -     err = dl_argv_parse_with_selector(dl, &flags, DEVLINK_CMD_RESOURCE_DUMP,
>> -                                       DL_OPT_HANDLE | DL_OPT_HANDLEP,
>> -                                       0, 0, 0);
>> -     if (err)
>> -             return err;
>> +     if (dl_argv_match(dl, "scope")) {
>> +             const char *scopestr;
>> +
>> +             dl_arg_inc(dl);
>> +             err = dl_argv_str(dl, &scopestr);
>> +             if (err)
>> +                     return err;
>> +             err = resource_scope_get(scopestr, &opts->resource_scope_mask);
>> +             if (err)
>> +                     return err;
>> +             opts->present |= DL_OPT_RESOURCE_SCOPE;
> 
> Comment from Claude that seems legit:
> 
> Issue found: In cmd_resource_show, the scope path sets opts->present |=
> DL_OPT_RESOURCE_SCOPE without first clearing opts->present. In batch
> mode, dl->opts is shared across commands, and the non-scope path
> correctly resets opts->present via dl_argv_parse(). But the scope path
> bypasses dl_argv_parse(), so stale bits (e.g. DL_OPT_HANDLE from a
> previous dev show) remain. When dl_opts_put() runs, it writes the stale
> DEVLINK_ATTR_BUS_NAME/DEV_NAME attributes into the dump request,
> silently filtering to a single device instead of all devices. Fix: use =
> instead of |=
> 
> Are you ok with the suggested resolution?
> 

yes, thank you. let me know if I should resend.


^ permalink raw reply

* [PATCH net] net/smc: fix out-of-bounds read in smc_clcsock_data_ready()
From: Sechang Lim @ 2026-06-14 12:09 UTC (permalink / raw)
  To: D . Wythe, Dust Li, Sidraya Jayagond, Wenjia Zhang, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, David S . Miller
  Cc: Mahanta Jambigi, Tony Lu, Wen Gu, Simon Horman, Ursula Braun,
	Karsten Graul, Guvenc Gulce, netdev, linux-rdma, linux-s390, bpf,
	linux-kernel

smc_clcsock_data_ready() is installed on the listen socket and reads its
sk_user_data as an smc_sock. A passive-open child inherits this callback,
but sk_clone_lock() clears the child's sk_user_data because it is tagged
SK_USER_DATA_NOCOPY. smc_tcp_syn_recv_sock() restores the child's af_ops,
but the inherited sk_data_ready() is left in place until accept.

In that window the child is established. A cgroup sock_ops program can run
bpf_sock_hash_update() on it from tcp_init_transfer(); sk_psock_init()
stores a sk_psock in the NULL sk_user_data. The inherited callback then
reads sk_user_data via smc_clcsock_user_data(), which masks only
SK_USER_DATA_NOCOPY, mistakes the sk_psock for an smc_sock, and reads a
callback pointer past the end of the sk_psock:

  BUG: KASAN: slab-out-of-bounds in smc_clcsock_data_ready+0x84/0x200 net/smc/af_smc.c:2637
  Read of size 8 at addr ffff8880013b8674 by task syz.6.12484/67930
   <IRQ>
   smc_clcsock_data_ready+0x84/0x200 net/smc/af_smc.c:2637
   tcp_urg+0x24d/0x360 net/ipv4/tcp_input.c:6264
   tcp_rcv_state_process+0x280d/0x4940 net/ipv4/tcp_input.c:7336
   tcp_child_process+0x371/0xa50 net/ipv4/tcp_minisocks.c:1002
   tcp_v4_rcv+0x1eaa/0x2a00 net/ipv4/tcp_ipv4.c:2186
   ip_protocol_deliver_rcu+0x226/0x420 net/ipv4/ip_input.c:207
   ip_local_deliver_finish+0x35a/0x5f0 net/ipv4/ip_input.c:241
   __netif_receive_skb_one_core+0x1e5/0x210 net/core/dev.c:6216
   process_backlog+0x631/0x1470 net/core/dev.c:6682
   __napi_poll+0xb3/0x320 net/core/dev.c:7749
   net_rx_action+0x4fa/0xcb0 net/core/dev.c:7969
   handle_softirqs+0x236/0x800 kernel/softirq.c:622
   </IRQ>

  Allocated by task 67930:
   sk_psock_init+0x142/0x740 net/core/skmsg.c:766
   sock_map_link+0x646/0xdf0 net/core/sock_map.c:279
   sock_hash_update_common+0xd3/0x990 net/core/sock_map.c:1010
   bpf_sock_hash_update+0x114/0x170 net/core/sock_map.c:1229
   __cgroup_bpf_run_filter_sock_ops+0x74/0xa0 kernel/bpf/cgroup.c:1727
   tcp_init_transfer+0x1085/0x1100 net/ipv4/tcp_input.c:6693
   tcp_rcv_state_process+0x241e/0x4940 net/ipv4/tcp_input.c:7231
   tcp_child_process+0x371/0xa50 net/ipv4/tcp_minisocks.c:1002

Restore the inherited sk_data_ready() in smc_tcp_syn_recv_sock(), where the
child's sk_user_data is already cleared, rather than only at accept.

Fixes: a60a2b1e0af1 ("net/smc: reduce active tcp_listen workers")
Signed-off-by: Sechang Lim <rhkrqnwk98@gmail.com>
---
 net/smc/af_smc.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index b5db69073e20..152971e8ad17 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -156,6 +156,12 @@ static struct sock *smc_tcp_syn_recv_sock(const struct sock *sk,
 	if (child) {
 		rcu_assign_sk_user_data(child, NULL);
 
+		/*
+		 * the child inherited the listen-specific sk_data_ready();
+		 * restore it here, as sk_user_data may be reused before accept
+		 */
+		child->sk_data_ready = smc->clcsk_data_ready;
+
 		/* v4-mapped sockets don't inherit parent ops. Don't restore. */
 		if (inet_csk(child)->icsk_af_ops == inet_csk(sk)->icsk_af_ops)
 			inet_csk(child)->icsk_af_ops = smc->ori_af_ops;
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH iproute2-next 7/7] devlink: add scope filter to resource show
From: Or Har-Toov @ 2026-06-14 12:10 UTC (permalink / raw)
  To: David Ahern, Tariq Toukan, Stephen Hemminger, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Andrew Lunn, David S. Miller
  Cc: Donald Hunter, Simon Horman, Jiri Pirko, Jonathan Corbet,
	Shuah Khan, Saeed Mahameed, Leon Romanovsky, Mark Bloch,
	Shuah Khan, Matthieu Baerts (NGI0), Chuck Lever, Carolina Jubran,
	Moshe Shemesh, Shay Drori, Dragos Tatulea, Daniel Zahka,
	Shahar Shitrit, Jacob Keller, Cosmin Ratiu, Parav Pandit,
	Kees Cook, Adithya Jayachandran, Daniel Jurgens, netdev,
	linux-kernel, linux-doc, linux-rdma, linux-kselftest,
	Gal Pressman, Ido Schimmel, Jiri Pirko, Petr Machata
In-Reply-To: <943b4932-17f4-4a52-af92-b9485a0e8c7a@kernel.org>



On 11/06/2026 21:53, David Ahern wrote:
> 
> On 6/8/26 11:39 PM, Tariq Toukan wrote:
>> @@ -9010,13 +9029,29 @@ static int cmd_resource_show(struct dl *dl)
>>        uint16_t flags = NLM_F_REQUEST | NLM_F_ACK;
>>        struct nlmsghdr *nlh;
>>        struct resource_ctx resource_ctx = {};
>> +     struct dl_opts *opts = &dl->opts;
>>        int err;
>>
>> -     err = dl_argv_parse_with_selector(dl, &flags, DEVLINK_CMD_RESOURCE_DUMP,
>> -                                       DL_OPT_HANDLE | DL_OPT_HANDLEP,
>> -                                       0, 0, 0);
>> -     if (err)
>> -             return err;
>> +     if (dl_argv_match(dl, "scope")) {
>> +             const char *scopestr;
>> +
>> +             dl_arg_inc(dl);
>> +             err = dl_argv_str(dl, &scopestr);
>> +             if (err)
>> +                     return err;
>> +             err = resource_scope_get(scopestr, &opts->resource_scope_mask);
>> +             if (err)
>> +                     return err;
>> +             opts->present |= DL_OPT_RESOURCE_SCOPE;
> 
> Comment from Claude that seems legit:
> 
> Issue found: In cmd_resource_show, the scope path sets opts->present |=
> DL_OPT_RESOURCE_SCOPE without first clearing opts->present. In batch
> mode, dl->opts is shared across commands, and the non-scope path
> correctly resets opts->present via dl_argv_parse(). But the scope path
> bypasses dl_argv_parse(), so stale bits (e.g. DL_OPT_HANDLE from a
> previous dev show) remain. When dl_opts_put() runs, it writes the stale
> DEVLINK_ATTR_BUS_NAME/DEV_NAME attributes into the dump request,
> silently filtering to a single device instead of all devices. Fix: use =
> instead of |=
> 
> Are you ok with the suggested resolution?
> 

yes, thank you. let me know if I should resend.


^ permalink raw reply

* [PATCH net-next v2 0/2] net/sched: sch_fq_pie: add per-flow class statistics
From: Hemendra M. Naik @ 2026-06-14 12:49 UTC (permalink / raw)
  To: netdev
  Cc: davem, edumazet, kuba, pabeni, horms, jiri, jhs, shuah,
	linux-kernel, linux-kselftest, vishy0777, tahiliani,
	Hemendra M. Naik

 FQ-PIE runs an independent PIE controller per flow but exposes no
 per-flow statistics. This series wires up fq_pie_class_ops to expose
 per-flow AQM state (prob, delay, deficit, avg_dq_rate) 
 via 'tc -s class show', following a similar pattern as FQ-CoDel.
---
 Changelog:
 v2: Addressed ABI backward compatibility issue for tc_fq_pie_xstats.

 v1: https://lore.kernel.org/netdev/20260531125314.22492-1-hemendranaik@gmail.com/

---
Hemendra M. Naik (2):
  net/sched: sch_fq_pie: add per-flow statistics via class ops
  selftests: tc-testing: add fq_pie per-flow class stats test

 include/uapi/linux/pkt_sched.h                |  29 ++++-
 net/sched/sch_fq_pie.c                        | 118 +++++++++++++++++-
 tools/include/uapi/linux/pkt_sched.h          |   4 +-
 .../tc-testing/tc-tests/qdiscs/fq_pie.json    |  22 ++++
 4 files changed, 163 insertions(+), 10 deletions(-)

-- 
2.34.1


^ permalink raw reply

* [PATCH net-next v2 1/2]     net/sched: sch_fq_pie: add per-flow statistics via class ops
From: Hemendra M. Naik @ 2026-06-14 12:49 UTC (permalink / raw)
  To: netdev
  Cc: davem, edumazet, kuba, pabeni, horms, jiri, jhs, shuah,
	linux-kernel, linux-kselftest, vishy0777, tahiliani,
	Hemendra M. Naik
In-Reply-To: <20260614125000.6058-1-hemendranaik@gmail.com>

  FQ-PIE schedules independent PIE controllers per flow but exposes no
  per-flow AQM state. Without class-level statistics there is no way to
  observe the per-flow drop probability, queue delay, deficit or
  dequeue rate from userspace.

  Extend tc_fq_pie_xstats to support both qdisc and class-level
  extended statistics.

  - Add enum with QDISC and CLASS type discriminators.
  - Add struct tc_fq_pie_cl_stats for per-flow metrics (prob,
    delay, deficit, avg_dq_rate, dq_rate_estimating).
  - Add empty struct tc_fq_pie_xqd_stats placeholder.

  Wire up fq_pie_class_ops (.walk, .dump, .dump_stats) so that
  'tc -s class show' against an fq_pie qdisc reports per-flow state:

    prob               per-flow PIE drop probability
    delay              per-flow queue sojourn time (microseconds)
    deficit            remaining DRR byte credits (signed integer)
    avg_dq_rate        dequeue rate estimate in bytes/second
                        (dq_rate_estimator mode only)
    dq_rate_estimating flag indicating active delay estimation mode

  Fix the 'delay' field comment in struct tc_pie_xstats from "in ms" to
  "in microseconds" to match the kernel's
  PSCHED_TICKS2NS / NSEC_PER_USEC conversion.

  Also correct the avg_dq_rate comment in tc_pie_xstats from
  "bits/pie_time" to "bytes/second" to match the actual kernel
  conversion (avg_dq_rate * PSCHED_TICKS_PER_SEC >> PIE_SCALE).

Signed-off-by: Hemendra M. Naik <hemendranaik@gmail.com>
Signed-off-by: Vishal Kamath <vishy0777@gmail.com>
Signed-off-by: Mohit P. Tahiliani <tahiliani@nitk.edu.in>
---
 Changelog:
  v2: Addressed ABI backward compatibility issue for tc_fq_pie_xstats.
  
  v1: https://lore.kernel.org/netdev/20260531125314.22492-2-hemendranaik@gmail.com

---
 include/uapi/linux/pkt_sched.h       |  29 ++++++-
 net/sched/sch_fq_pie.c               | 118 +++++++++++++++++++++++++--
 tools/include/uapi/linux/pkt_sched.h |   4 +-
 3 files changed, 141 insertions(+), 10 deletions(-)

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 66e8072f44df..fd4daad98abc 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -910,9 +910,9 @@ enum {
 
 struct tc_pie_xstats {
 	__u64 prob;			/* current probability */
-	__u32 delay;			/* current delay in ms */
+	__u32 delay;			/* current delay in microseconds */
 	__u32 avg_dq_rate;		/* current average dq_rate in
-					 * bits/pie_time
+					 * bytes/second
 					 */
 	__u32 dq_rate_estimating;	/* is avg_dq_rate being calculated? */
 	__u32 packets_in;		/* total number of packets enqueued */
@@ -943,6 +943,25 @@ enum {
 };
 #define TCA_FQ_PIE_MAX   (__TCA_FQ_PIE_MAX - 1)
 
+enum {
+	TCA_FQ_PIE_XSTATS_QDISC,
+	TCA_FQ_PIE_XSTATS_CLASS,
+};
+
+struct tc_fq_pie_cl_stats {
+	__u64 prob;			/* current probability */
+	__u32 delay;			/* current delay in microseconds */
+	__s32 deficit;		/* number of remaining byte credits */
+	__u32 avg_dq_rate;		/* current average dq_rate in
+					 * bytes/second
+					 */
+	__u32 dq_rate_estimating;	/* is avg_dq_rate being calculated? */
+};
+
+struct tc_fq_pie_xqd_stats {
+	/* placeholder for new qdisc-level stats */
+};
+
 struct tc_fq_pie_xstats {
 	__u32 packets_in;	/* total number of packets enqueued */
 	__u32 dropped;		/* packets dropped due to fq_pie_action */
@@ -953,6 +972,12 @@ struct tc_fq_pie_xstats {
 	__u32 new_flows_len;	/* count of flows in new list */
 	__u32 old_flows_len;	/* count of flows in old list */
 	__u32 memory_usage;	/* total memory across all queues */
+	__u32 type;
+	union {
+		struct tc_fq_pie_cl_stats class_stats;
+		struct tc_fq_pie_xqd_stats xqdisc_stats;
+	};
+
 };
 
 /* CBS */
diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c
index 7becbf5362b3..2778d3cda956 100644
--- a/net/sched/sch_fq_pie.c
+++ b/net/sched/sch_fq_pie.c
@@ -330,7 +330,7 @@ static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt,
 	/* tupdate is in jiffies */
 	if (tb[TCA_FQ_PIE_TUPDATE])
 		WRITE_ONCE(q->p_params.tupdate,
-			usecs_to_jiffies(nla_get_u32(tb[TCA_FQ_PIE_TUPDATE])));
+			   usecs_to_jiffies(nla_get_u32(tb[TCA_FQ_PIE_TUPDATE])));
 
 	if (tb[TCA_FQ_PIE_ALPHA])
 		WRITE_ONCE(q->p_params.alpha,
@@ -509,7 +509,9 @@ static int fq_pie_dump(struct Qdisc *sch, struct sk_buff *skb)
 static int fq_pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 {
 	struct fq_pie_sched_data *q = qdisc_priv(sch);
-	struct tc_fq_pie_xstats st = { 0 };
+	struct tc_fq_pie_xstats st = {
+		.type	= TCA_FQ_PIE_XSTATS_QDISC,
+	};
 	struct list_head *pos;
 
 	sch_tree_lock(sch);
@@ -517,10 +519,10 @@ static int fq_pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 	st.packets_in	= q->stats.packets_in;
 	st.overlimit	= q->stats.overlimit;
 	st.overmemory	= q->overmemory;
-	st.dropped	= q->stats.dropped;
-	st.ecn_mark	= q->stats.ecn_mark;
-	st.new_flow_count = q->new_flow_count;
-	st.memory_usage   = q->memory_usage;
+	st.dropped		= q->stats.dropped;
+	st.ecn_mark		= q->stats.ecn_mark;
+	st.new_flow_count	= q->new_flow_count;
+	st.memory_usage	= q->memory_usage;
 
 	list_for_each(pos, &q->new_flows)
 		st.new_flows_len++;
@@ -561,7 +563,111 @@ static void fq_pie_destroy(struct Qdisc *sch)
 	kvfree(q->flows);
 }
 
+static struct Qdisc *fq_pie_leaf(struct Qdisc *sch, unsigned long arg)
+{
+	return NULL;
+}
+
+static unsigned long fq_pie_find(struct Qdisc *sch, u32 classid)
+{
+	return 0;
+}
+
+static unsigned long fq_pie_bind(struct Qdisc *sch, unsigned long parent,
+				 u32 classid)
+{
+	return 0;
+}
+
+static void fq_pie_unbind(struct Qdisc *q, unsigned long cl)
+{
+}
+
+static struct tcf_block *fq_pie_tcf_block(struct Qdisc *sch, unsigned long cl,
+					  struct netlink_ext_ack *extack)
+{
+	struct fq_pie_sched_data *q = qdisc_priv(sch);
+
+	if (cl)
+		return NULL;
+	return q->block;
+}
+
+static int fq_pie_dump_class(struct Qdisc *sch, unsigned long cl,
+			     struct sk_buff *skb, struct tcmsg *tcm)
+{
+	tcm->tcm_handle |= TC_H_MIN(cl);
+	return 0;
+}
+
+static int fq_pie_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+				   struct gnet_dump *d)
+{
+	struct fq_pie_sched_data *q = qdisc_priv(sch);
+	struct gnet_stats_queue qs = { 0 };
+	struct tc_fq_pie_xstats xstats;
+	u32 idx = cl - 1;
+
+	if (idx < q->flows_cnt) {
+		const struct fq_pie_flow *flow = &q->flows[idx];
+
+		memset(&xstats, 0, sizeof(xstats));
+		xstats.type = TCA_FQ_PIE_XSTATS_CLASS;
+		xstats.class_stats.prob = READ_ONCE(flow->vars.prob) << BITS_PER_BYTE;
+		xstats.class_stats.delay =
+			((u32)PSCHED_TICKS2NS(READ_ONCE(flow->vars.qdelay))) /
+			NSEC_PER_USEC;
+		xstats.class_stats.deficit = READ_ONCE(flow->deficit);
+		xstats.class_stats.dq_rate_estimating =
+			READ_ONCE(q->p_params.dq_rate_estimator);
+
+		if (xstats.class_stats.dq_rate_estimating) {
+			xstats.class_stats.avg_dq_rate =
+				READ_ONCE(flow->vars.avg_dq_rate) *
+				(PSCHED_TICKS_PER_SEC) >> PIE_SCALE;
+		}
+
+		qs.qlen    = READ_ONCE(flow->qlen);
+		qs.backlog = READ_ONCE(flow->backlog);
+	}
+	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
+		return -1;
+	if (idx < q->flows_cnt)
+		return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
+	return 0;
+}
+
+static void fq_pie_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	struct fq_pie_sched_data *q = qdisc_priv(sch);
+	unsigned int i;
+
+	if (arg->stop)
+		return;
+
+	for (i = 0; i < q->flows_cnt; i++) {
+		if (list_empty(&q->flows[i].flowchain)) {
+			arg->count++;
+			continue;
+		}
+		if (!tc_qdisc_stats_dump(sch, i + 1, arg))
+			break;
+	}
+}
+
+static const struct Qdisc_class_ops fq_pie_class_ops = {
+	.leaf		=	fq_pie_leaf,
+	.find		=	fq_pie_find,
+	.tcf_block	=	fq_pie_tcf_block,
+	.bind_tcf	=	fq_pie_bind,
+	.unbind_tcf	=	fq_pie_unbind,
+	.dump		=	fq_pie_dump_class,
+	.dump_stats	=	fq_pie_dump_class_stats,
+	.walk		=	fq_pie_walk,
+};
+
 static struct Qdisc_ops fq_pie_qdisc_ops __read_mostly = {
+	.cl_ops		=	&fq_pie_class_ops,
 	.id		= "fq_pie",
 	.priv_size	= sizeof(struct fq_pie_sched_data),
 	.enqueue	= fq_pie_qdisc_enqueue,
diff --git a/tools/include/uapi/linux/pkt_sched.h b/tools/include/uapi/linux/pkt_sched.h
index 587481a19433..45ea10026742 100644
--- a/tools/include/uapi/linux/pkt_sched.h
+++ b/tools/include/uapi/linux/pkt_sched.h
@@ -847,8 +847,8 @@ enum {
 
 struct tc_pie_xstats {
 	__u32 prob;             /* current probability */
-	__u32 delay;            /* current delay in ms */
-	__u32 avg_dq_rate;      /* current average dq_rate in bits/pie_time */
+	__u32 delay;            /* current delay in micoseconds */
+	__u32 avg_dq_rate;      /* current average dq_rate in bytes/second */
 	__u32 packets_in;       /* total number of packets enqueued */
 	__u32 dropped;          /* packets dropped due to pie_action */
 	__u32 overlimit;        /* dropped due to lack of space in queue */
-- 
2.34.1


^ permalink raw reply related

* [PATCH net-next v2 2/2] selftests: tc-testing: add fq_pie per-flow class stats test
From: Hemendra M. Naik @ 2026-06-14 12:50 UTC (permalink / raw)
  To: netdev
  Cc: davem, edumazet, kuba, pabeni, horms, jiri, jhs, shuah,
	linux-kernel, linux-kselftest, vishy0777, tahiliani,
	Hemendra M. Naik
In-Reply-To: <20260614125000.6058-1-hemendranaik@gmail.com>

Add a tc-testing entry (id: 83c0) to verify the fq_pie class ops
wired up in the previous patch do not crash and integrate cleanly
with the tc class show path.

The test creates an fq_pie root qdisc on a dummy interface and runs
'tc -s class show'.

Signed-off-by: Hemendra M. Naik <hemendranaik@gmail.com>
Signed-off-by: Vishal Kamath <vishy0777@gmail.com>
Signed-off-by: Mohit P. Tahiliani <tahiliani@nitk.edu.in>
---
Changelog:
 v2: No changes from v1.

 v1: https://lore.kernel.org/netdev/20260531125314.22492-3-hemendranaik@gmail.com/
---
 .../tc-testing/tc-tests/qdiscs/fq_pie.json    | 22 +++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_pie.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_pie.json
index 229fe1bf4a90..88139f429430 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_pie.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_pie.json
@@ -40,5 +40,27 @@
         "matchPattern": "qdisc fq_pie 1: root refcnt [0-9]+ limit 1p",
         "matchCount": "1",
         "teardown": ["$TC qdisc del dev $DEV1 handle 1: root"]
+    },
+    {
+        "id": "83c0",
+        "name": "FQ-PIE class stats accessible via tc class show",
+        "category": [
+            "qdisc",
+            "fq_pie"
+        ],
+        "plugins": {
+            "requires": "nsPlugin"
+        },
+        "setup": [
+            "$TC qdisc add dev $DUMMY handle 1: root fq_pie"
+        ],
+        "cmdUnderTest": "$TC -s class show dev $DUMMY",
+        "expExitCode": "0",
+        "verifyCmd": "$TC -s class show dev $DUMMY",
+        "matchPattern": "class fq_pie",
+        "matchCount": "0",
+        "teardown": [
+            "$TC qdisc del dev $DUMMY handle 1: root"
+        ]
     }
 ]
-- 
2.34.1


^ permalink raw reply related

* [PATCH net] net: rds: check cmsg_len before reading rds_rdma_args in size pass
From: Michael Bommarito @ 2026-06-14 13:07 UTC (permalink / raw)
  To: Allison Henderson, David S . Miller, Jakub Kicinski, Paolo Abeni,
	Eric Dumazet
  Cc: Simon Horman, netdev, linux-rdma, rds-devel, linux-kernel

For RDS_CMSG_RDMA_ARGS, rds_rm_size() calls rds_rdma_extra_size() after
only CMSG_OK(), without checking that cmsg_len covers struct
rds_rdma_args. rds_rdma_extra_size() reads args->local_vec_addr and
args->nr_local, so a short control message reads past the copied control
buffer. The value bounds an allocation count, so this is an
out-of-bounds read in the kernel, not a leak to user space, and an
unprivileged AF_RDS socket can trigger it with one short cmsg.

The two later RDS_RDMA passes (rds_cmsg_rdma_args() and the rdma-bytes
loop in rds_sendmsg()) already reject cmsg_len < CMSG_LEN(sizeof(struct
rds_rdma_args)); only this size pass does not. Reject it the same way.

Reproduced under KASAN on QEMU via a KUnit driving the real
rds_rm_size(); the out-of-bounds read is gone after this change.

Fixes: ff87e97a9d70 ("RDS: make m_rdma_op a member of rds_message")
Cc: stable@vger.kernel.org
Assisted-by: Claude:claude-opus-4-8
Signed-off-by: Michael Bommarito <michael.bommarito@gmail.com>
---
A short RDS_CMSG_RDMA_ARGS placed at a page boundary makes
rds_rdma_extra_size() read the args fields past the allocation:

  BUG: KASAN: slab-out-of-bounds in rds_rdma_extra_size

an 8-byte read. On stock it faults; patched it returns -EINVAL with no
report. Controls (well-formed args; a short cmsg with args still in
bounds) drive the same pass cleanly on both trees.

No in-tree selftest exercises rds_rm_size(); I can send the KUnit suite
as a separate net-next patch if wanted, kept out so the fix is not held.

 net/rds/send.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/rds/send.c b/net/rds/send.c
index d8b14ff9d366b..6ca3192b1d8af 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -967,6 +967,8 @@ static int rds_rm_size(struct msghdr *msg, int num_sgs,

 		switch (cmsg->cmsg_type) {
 		case RDS_CMSG_RDMA_ARGS:
+			if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)))
+				return -EINVAL;
 			if (vct->indx >= vct->len) {
 				vct->len += vct->incr;
 				tmp_iov =

base-commit: 5200f5f493f79f14bbdc349e402a40dfb32f23c8
-- 
2.53.0

^ permalink raw reply related

* [PATCH iproute2-next v2] tc: fq_pie: add support for printing per-flow PIE statistics
From: Hemendra M. Naik @ 2026-06-14 13:07 UTC (permalink / raw)
  To: netdev; +Cc: jiri, jhs, linux-kernel, vishy0777, tahiliani, Hemendra M. Naik

'tc -s class show' against an fq_pie qdisc now prints:

 prob           drop probability for the flow
 delay          per-flow queue sojourn time (microseconds)
 deficit        remaining DRR byte credits (signed integer)
 avg_dq_rate    dequeue rate estimate in bytes/second
             	(dq_rate_estimator mode only)

avg_dq_rate is formatted using tc_print_rate(), which converts the
kernel's bytes/second value to a human-readable bits/second string
(e.g. '3906Kbit'), consistent with how other tc schedulers display
rate fields. Apply the same fix to tc/q_pie.c, where avg_dq_rate was
also printed as a raw integer without a unit.

Update the UAPI header to mirror tc_fq_pie_cl_stats from the kernel.
Fix the 'delay' field comment in struct tc_pie_xstats from "in ms" to
"in microseconds" to match the kernel's
PSCHED_TICKS2NS / NSEC_PER_USEC conversion.

Add a 'tc -s class show' example to tc-fq_pie(8) with dq_rate_estimator
enabled, showing all per-flow fields (prob, delay, deficit, avg_dq_rate)
across multiple flows. Update tc-pie(8) avg_dq_rate example from a raw
integer to a formatted bits/second string.

The corresponding kernel patch can be viewed here:
https://lore.kernel.org/netdev/20260614125000.6058-1-hemendranaik@gmail.com/

Signed-off-by: Hemendra M. Naik <hemendranaik@gmail.com>
Signed-off-by: Vishal Kamath <vishy0777@gmail.com>
Signed-off-by: Mohit P. Tahiliani <tahiliani@nitk.edu.in>

---
Changelog:
v2: Addressed ABI backward compatibility issue for tc_fq_pie_xstats

v1: https://lore.kernel.org/netdev/20260531131411.28213-1-hemendranaik@gmail.com/
---
 include/uapi/linux/pkt_sched.h | 29 +++++++++++++++--
 man/man8/tc-fq_pie.8           | 18 +++++++++++
 man/man8/tc-pie.8              |  2 +-
 tc/q_fq_pie.c                  | 57 +++++++++++++++++++++++-----------
 tc/q_pie.c                     |  4 +--
 5 files changed, 87 insertions(+), 23 deletions(-)

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index fb07a889..4927f7b8 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -910,9 +910,9 @@ enum {
 
 struct tc_pie_xstats {
 	__u64 prob;			/* current probability */
-	__u32 delay;			/* current delay in ms */
+	__u32 delay;			/* current delay in microseconds */
 	__u32 avg_dq_rate;		/* current average dq_rate in
-					 * bits/pie_time
+					 * bytes/second
 					 */
 	__u32 dq_rate_estimating;	/* is avg_dq_rate being calculated? */
 	__u32 packets_in;		/* total number of packets enqueued */
@@ -943,6 +943,25 @@ enum {
 };
 #define TCA_FQ_PIE_MAX   (__TCA_FQ_PIE_MAX - 1)
 
+enum {
+	TCA_FQ_PIE_XSTATS_QDISC,
+	TCA_FQ_PIE_XSTATS_CLASS,
+};
+
+struct tc_fq_pie_cl_stats {
+	__u64 prob;			/* current probability */
+	__u32 delay;			/* current delay in microseconds */
+	__s32 deficit;		/* number of remaining byte credits */
+	__u32 avg_dq_rate;		/* current average dq_rate in
+					 * bytes/second
+					 */
+	__u32 dq_rate_estimating;	/* is avg_dq_rate being calculated? */
+};
+
+struct tc_fq_pie_xqd_stats {
+	/* placeholder for new qdisc-level stats */
+};
+
 struct tc_fq_pie_xstats {
 	__u32 packets_in;	/* total number of packets enqueued */
 	__u32 dropped;		/* packets dropped due to fq_pie_action */
@@ -953,6 +972,12 @@ struct tc_fq_pie_xstats {
 	__u32 new_flows_len;	/* count of flows in new list */
 	__u32 old_flows_len;	/* count of flows in old list */
 	__u32 memory_usage;	/* total memory across all queues */
+	__u32 type;
+	union {
+		struct tc_fq_pie_cl_stats class_stats;
+		struct tc_fq_pie_xqd_stats xqdisc_stats;
+	};
+
 };
 
 /* CBS */
diff --git a/man/man8/tc-fq_pie.8 b/man/man8/tc-fq_pie.8
index 457a56bb..bf988f5f 100644
--- a/man/man8/tc-fq_pie.8
+++ b/man/man8/tc-fq_pie.8
@@ -153,6 +153,24 @@ dq_rate_estimator
   pkts_in 6082 overlimit 0 overmemory 0 dropped 4 ecn_mark 0
   new_flow_count 94 new_flows_len 0 old_flows_len 8 memory_used 1157632
 
+# tc qdisc add dev eth0 parent 100:1 handle 200: fq_pie target 2ms flows 3
+.br
+# tc -s class show dev eth0
+.br
+class fq_pie 200:2 parent 200:
+ Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
+ backlog 22800b 2p requeues 0
+  prob 0.57679 delay 2.38ms
+
+# tc qdisc add dev eth0 parent 100:1 handle 200: fq_pie target 2ms flows 3 dq_rate_estimator
+.br
+# tc -s class show dev eth0
+.br
+class fq_pie 200:2 parent 200:
+ Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
+ backlog 22800b 2p requeues 0
+  prob 0.57679 delay 2.38ms avg_dq_rate 10742Kbit
+
 .SH SEE ALSO
 .BR tc (8),
 .BR tc-pie (8),
diff --git a/man/man8/tc-pie.8 b/man/man8/tc-pie.8
index 5a8c7820..f3a09616 100644
--- a/man/man8/tc-pie.8
+++ b/man/man8/tc-pie.8
@@ -115,7 +115,7 @@ is turned off.
    qdisc pie 8036: dev eth0 root refcnt 2 limit 1000p target 15.0ms tupdate 16.0ms alpha 2 beta 20
     Sent 63947420 bytes 42414 pkt (dropped 41, overlimits 0 requeues 0)
     backlog 271006b 179p requeues 0
-     prob 0.000092 delay 22200us avg_dq_rate 12145996
+     prob 0.000092 delay 22200us avg_dq_rate 10742Kbit
      pkts_in 41 overlimit 343 dropped 0 maxq 50 ecn_mark 0
 
  # tc qdisc add dev eth0 root pie limit 100 target 20ms tupdate 30ms ecn
diff --git a/tc/q_fq_pie.c b/tc/q_fq_pie.c
index dc2710cd..678f045f 100644
--- a/tc/q_fq_pie.c
+++ b/tc/q_fq_pie.c
@@ -274,6 +274,8 @@ static int fq_pie_print_xstats(const struct qdisc_util *qu, FILE *f,
 {
 	struct tc_fq_pie_xstats _st = {}, *st;
 
+	SPRINT_BUF(b1);
+
 	if (xstats == NULL)
 		return 0;
 
@@ -283,25 +285,43 @@ static int fq_pie_print_xstats(const struct qdisc_util *qu, FILE *f,
 		st = &_st;
 	}
 
-	print_uint(PRINT_ANY, "pkts_in", "  pkts_in %u",
-		   st->packets_in);
-	print_uint(PRINT_ANY, "overlimit", " overlimit %u",
-		   st->overlimit);
-	print_uint(PRINT_ANY, "overmemory", " overmemory %u",
-		   st->overmemory);
-	print_uint(PRINT_ANY, "dropped", " dropped %u",
-		   st->dropped);
-	print_uint(PRINT_ANY, "ecn_mark", " ecn_mark %u",
-		   st->ecn_mark);
+	if (!st->type || st->type == TCA_FQ_PIE_XSTATS_QDISC) {
+		print_uint(PRINT_ANY, "pkts_in", "  pkts_in %u",
+			   st->packets_in);
+		print_uint(PRINT_ANY, "overlimit", " overlimit %u",
+			   st->overlimit);
+		print_uint(PRINT_ANY, "overmemory", " overmemory %u",
+			   st->overmemory);
+		print_uint(PRINT_ANY, "dropped", " dropped %u",
+			   st->dropped);
+		print_uint(PRINT_ANY, "ecn_mark", " ecn_mark %u",
+			   st->ecn_mark);
+		print_nl();
+		print_uint(PRINT_ANY, "new_flow_count", "  new_flow_count %u",
+			   st->new_flow_count);
+		print_uint(PRINT_ANY, "new_flows_len", " new_flows_len %u",
+			   st->new_flows_len);
+		print_uint(PRINT_ANY, "old_flows_len", " old_flows_len %u",
+			   st->old_flows_len);
+		print_uint(PRINT_ANY, "memory_used", " memory_used %u",
+			   st->memory_usage);
+	}
+
+	if (st->type == TCA_FQ_PIE_XSTATS_CLASS) {
+		print_float(PRINT_ANY, "prob", " prob %lg",
+			    (double)st->class_stats.prob / (double)UINT64_MAX);
+		print_uint(PRINT_JSON, "delay", NULL, st->class_stats.delay);
+		print_string(PRINT_FP, NULL, " delay %s",
+			     sprint_time(st->class_stats.delay, b1));
+		print_int(PRINT_ANY, "deficit", " deficit %d",
+			  st->class_stats.deficit);
+
+		if (st->class_stats.dq_rate_estimating) {
+			tc_print_rate(PRINT_ANY, "avg_dq_rate", " avg_dq_rate %s",
+				      st->class_stats.avg_dq_rate);
+		}
+	}
 	print_nl();
-	print_uint(PRINT_ANY, "new_flow_count", "  new_flow_count %u",
-		   st->new_flow_count);
-	print_uint(PRINT_ANY, "new_flows_len", " new_flows_len %u",
-		   st->new_flows_len);
-	print_uint(PRINT_ANY, "old_flows_len", " old_flows_len %u",
-		   st->old_flows_len);
-	print_uint(PRINT_ANY, "memory_used", " memory_used %u",
-		   st->memory_usage);
 
 	return 0;
 
diff --git a/tc/q_pie.c b/tc/q_pie.c
index 04c9aa61..abae1ced 100644
--- a/tc/q_pie.c
+++ b/tc/q_pie.c
@@ -220,8 +220,8 @@ static int pie_print_xstats(const struct qdisc_util *qu, FILE *f,
 	print_string(PRINT_FP, NULL, " delay %s", sprint_time(st->delay, b1));
 
 	if (st->dq_rate_estimating)
-		print_uint(PRINT_ANY, "avg_dq_rate", " avg_dq_rate %u",
-			   st->avg_dq_rate);
+		tc_print_rate(PRINT_ANY, "avg_dq_rate", " avg_dq_rate %s",
+			      st->avg_dq_rate);
 
 	print_nl();
 	print_uint(PRINT_ANY, "pkts_in", "  pkts_in %u", st->packets_in);
-- 
2.34.1


^ permalink raw reply related

* Re: [PATCH net-next v7 08/11] udp: Set length in UDP header to 0 for big GSO packets
From: Paolo Abeni @ 2026-06-14 13:19 UTC (permalink / raw)
  To: Alice Mikityanska, Daniel Borkmann, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Xin Long, Willem de Bruijn, Willem de Bruijn,
	David Ahern, Nikolay Aleksandrov
  Cc: Shuah Khan, Stanislav Fomichev, Andrew Lunn, Simon Horman,
	Florian Westphal, netdev, Alice Mikityanska
In-Reply-To: <20260611192955.604661-9-alice.kernel@fastmail.im>

On 6/11/26 9:29 PM, Alice Mikityanska wrote:
> diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c
> index dcff7fb16ff6..32525a051a6f 100644
> --- a/net/ipv6/ip6_udp_tunnel.c
> +++ b/net/ipv6/ip6_udp_tunnel.c
> @@ -93,7 +93,7 @@ void udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
>  	uh->dest = dst_port;
>  	uh->source = src_port;
>  
> -	udp_set_len_short(uh, skb->len);
> +	udp_set_len(uh, skb->len);

Both Sashikos noted the above breaks GSO csum, as the following
udp_set_csum() will use skb->len to compute the csum partial.

I think it would be useful to consolidate udp_set_len() and
udp_set_csum() in a single helper - say udp_set_csum_len():

void udp_set_csum_len(bool nocheck, struct sk_buff *skb,
		      __be32 saddr, __be32 daddr, int len)
{
	struct udphdr *uh = udp_hdr(skb);

	
	uh->len = htons(len);
	if (nocheck) {
		uh->check = 0;
	} else if (skb_is_gso(skb)) {
		uh->len = len < GRO_LEGACY_MAX_SIZE ? htons(len) : 0;
		uh->check = ~udp_v4_check(len, saddr, daddr, 0);
	} else if (skb->ip_summed == CHECKSUM_PARTIAL) {
		uh->check = 0;
		uh->check = udp_v4_check(len, saddr, daddr,
	// ...

so that csum and len are always consistent, the 'set' logic is symmetric
with udp_get_len(), and no duplicate checks are needed.

/P


^ permalink raw reply

* Re: [PATCH net-next v7 11/11] selftests: net: Add a test for BIG TCP in UDP tunnels
From: Paolo Abeni @ 2026-06-14 13:23 UTC (permalink / raw)
  To: Alice Mikityanska, Daniel Borkmann, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Xin Long, Willem de Bruijn, Willem de Bruijn,
	David Ahern, Nikolay Aleksandrov
  Cc: Shuah Khan, Stanislav Fomichev, Andrew Lunn, Simon Horman,
	Florian Westphal, netdev, Alice Mikityanska
In-Reply-To: <20260611192955.604661-12-alice.kernel@fastmail.im>

On 6/11/26 9:29 PM, Alice Mikityanska wrote:
> +trap cleanup EXIT
> +setup
> +for tunnel in vxlan geneve; do
> +	for tun_family in 4 6; do
> +		for traffic_family in 4 6; do
> +			setup_tunnel "$tunnel" "$tun_family" || exit "$?"
> +			do_test "$traffic_family" || exit "$?"
> +			cleanup_tunnel
> +		done
> +	done
> +done

Sashiko noted that the above does not validate the GRO/GSO csum code
path. It could be useful to add another variant, with veth csum offload
disabled.

/P


^ permalink raw reply

* Re: [PATCH net-next v7 02/11] geneve: Fix off-by-one comparing with GRO_LEGACY_MAX_SIZE
From: Paolo Abeni @ 2026-06-14 13:25 UTC (permalink / raw)
  To: Alice Mikityanska, Daniel Borkmann, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Xin Long, Willem de Bruijn, Willem de Bruijn,
	David Ahern, Nikolay Aleksandrov
  Cc: Shuah Khan, Stanislav Fomichev, Andrew Lunn, Simon Horman,
	Florian Westphal, netdev, Alice Mikityanska
In-Reply-To: <20260611192955.604661-3-alice.kernel@fastmail.im>

On 6/11/26 9:29 PM, Alice Mikityanska wrote:
> From: Alice Mikityanska <alice@isovalent.com>
> 
> GRO_LEGACY_MAX_SIZE = 65536; total_len being 65536 is too big to fit
> into a u16. As can be seen in skb_gro_receive, packets bigger or equal
> to gro_max_size (or GRO_LEGACY_MAX_SIZE) are dropped with -E2BIG. Apply
> the same boundary to geneve_post_decap_hint to avoid writing 65536 to a
> 16-bit iph->tot_len field with an overflow.
> 
> Fixes: fd0dd796576e ("geneve: use GRO hint option in the RX path")
> Signed-off-by: Alice Mikityanska <alice@isovalent.com>
> Reviewed-by: Willem de Bruijn <willemb@google.com>

This patch and the previous one are pretty much uncontroversial. Let me
apply them, so that the next iteration will be smaller.

/P


^ permalink raw reply

* [PATCH net-next v11 0/2] net: sfp: extend SMBus support
From: Jonas Jelonek @ 2026-06-14 13:34 UTC (permalink / raw)
  To: Russell King, Andrew Lunn, Heiner Kallweit, David S . Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Maxime Chevallier
  Cc: netdev, linux-kernel, Bjørn Mork, Simon Horman,
	Jonas Jelonek

Today, the SFP driver only drives I2C adapters that advertise full
I2C_FUNC_I2C, or SMBus-only adapters via single-byte transfers (with
hwmon disabled). Several SoCs ship I2C/SMBus-only controllers that
support more than just byte access -- e.g. word and I2C block -- and
have SFP cages wired to them. Today, those adapters either work
poorly or not at all.

This series teaches the SFP driver to use the larger SMBus access
modes when the adapter advertises them, and along the way starts
honoring i2c_adapter quirks on read/write length so adapters that
cap below the SFP block size are handled correctly. Patch 1 is a
small prep doing only the quirks handling; patch 2 extends the
SMBus path itself.

Capability matrix supported by patch 2:
  - BYTE only:                   single-byte access (unchanged).
  - BYTE + WORD:                 word for >=2-byte chunks, byte tail.
  - I2C_BLOCK present:           block as the universal transport.
  - WORD only (no BYTE/BLOCK):   accepted with WARN_ONCE; works for
                                 even-length transfers, odd-length
                                 transfers will error at xfer time.

Adapters with asymmetric R/W capabilities (e.g. only READ_I2C_BLOCK
without WRITE_I2C_BLOCK) remain functionally correct but use the
worse-supported direction's max for both directions, since
i2c_max_block_size is a single field. No mainline I2C driver was
seen advertising such asymmetry; per-direction sizes can be added
later if needed.

Note:
Sashiko keeps raising an issue about the write_len cap possibly
limiting reads too harsh and disabling hwmon functionality although
not needed. This is theoretically true but I've acknowledged this in
the past versions multiple times, being rather theoretical because no
in-tree drivers will trigger this.

---
v10 -> v11:
  - use smbus_data.block[0] instead of this_len to address issue
    flagged by Sashiko in patch 2 [7]
  - dropped Reviewed-by from Maxime on patch 2 due to changes
v10: https://lore.kernel.org/netdev/20260612142840.462664-1-jelonek.jonas@gmail.com/

v9 -> v10:
  - dropped accepted and merged precursor patch
  - resend as requested by Jakub
  - added Reviewed-By from Maxime to first patch
  - acknowledged potential issue flagged by Sashika, impact is
    theoretical, see [6]
v9: https://lore.kernel.org/netdev/20260528205242.971410-1-jelonek.jonas@gmail.com/

v8 -> v9:
  - added precursor patch to mitigate issue flagged by Jakub's AI [3]
  - issue flagged in [4] acknowledged in [5] but not fixed, due to no
    practical impact
  - added Reviewed-By from Maxime to patch 3 (was patch 2 before)
v8: https://lore.kernel.org/netdev/20260516135442.2234729-1-jelonek.jonas@gmail.com/

v7 -> v8:
  - avoid leaking uninitialized memory on short reads by
    zero-initializing i2c_smbus_data variable (Simon)
  - theoretical issue without practical impact raised at [1]
    not addressed but acknowledged at [2]
v7: https://lore.kernel.org/netdev/20260507093301.1144740-1-jelonek.jonas@gmail.com/

v6 -> v7:
  - use i2c_block_size instead of i2c_max_block_size (Maxime)
  - move WARN_ONCE into 'else if ()' (Maxime)
  - reword comments
  - included Maxime's Reviewed-by for patch 1
v6: https://lore.kernel.org/netdev/20260505200647.1125311-1-jelonek.jonas@gmail.com/

v5 -> v6:
  - Split adapter-quirks handling into a separate prep patch (1/2).
  - Use I2C_SMBUS_I2C_BLOCK_DATA in the block-write branch (was
    I2C_SMBUS_WORD_DATA), so block writes actually transfer this_len
    bytes (also flagged by Jakub's AI bot review).
  - In sfp_smbus_read/write, check i2c_smbus_xfer() return before
    copying smbus_data into the caller's buffer.
  - Use I2C_BLOCK as the universal transport when available (carries
    any length 1..32); drop the this_len > 2 guard on the block
    branches.
  - Broaden the SMBus gate to also accept BLOCK-only adapters
    (Russell).
  - Accept word-only adapters with WARN_ONCE rather than rejecting
    them (Andrew).
  - Add a short comment in sfp_i2c_configure() explaining the access
    hierarchy (Maxime).
  - Use the all-bits-set form via i2c_check_functionality() for the
    composite I2C_FUNC_SMBUS_* checks (Russell).
v5: https://lore.kernel.org/netdev/20260116113105.244592-1-jelonek.jonas@gmail.com/

v4 -> v5:
  - made a more general approach, also covering word access
v4: https://lore.kernel.org/netdev/20260109101321.2804-1-jelonek.jonas@gmail.com/

v3 -> v4:
  - fix formal issues
v3: https://lore.kernel.org/netdev/20260105161242.578487-1-jelonek.jonas@gmail.com/

v2 -> v3:
  - fix previous attempt of v2 to fix return value
v2: https://lore.kernel.org/netdev/20260105154653.575397-1-jelonek.jonas@gmail.com/

v1 -> v2:
  - return number of written bytes instead of zero
v1: https://lore.kernel.org/netdev/20251228213331.472887-1-jelonek.jonas@gmail.com/

[1] https://lore.kernel.org/netdev/20260510164726.1401317-1-horms@kernel.org/
[2] https://lore.kernel.org/netdev/5129a58d-8852-4395-85e1-8991934810b8@gmail.com/
[3] https://lore.kernel.org/netdev/20260520234208.565366-1-kuba@kernel.org/
[4] https://lore.kernel.org/netdev/20260520234204.565333-1-kuba@kernel.org/
[5] https://lore.kernel.org/netdev/4a1b13f4-9c68-4f4c-a676-fd61e2aeeab0@gmail.com/
[6] https://lore.kernel.org/netdev/ec27912c-a66e-4a81-8c8c-318d27ec62b5@gmail.com/
[7] https://sashiko.dev/#/patchset/20260612142840.462664-1-jelonek.jonas%40gmail.com

---
Jonas Jelonek (2):
  net: sfp: apply I2C adapter quirks to limit block size
  net: sfp: extend SMBus support

 drivers/net/phy/sfp.c | 149 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 120 insertions(+), 29 deletions(-)


base-commit: 383bad5ffeb8a84fcb4b87544429edb82aa5d223
-- 
2.51.0


^ permalink raw reply

* [PATCH net-next v11 1/2] net: sfp: apply I2C adapter quirks to limit block size
From: Jonas Jelonek @ 2026-06-14 13:34 UTC (permalink / raw)
  To: Russell King, Andrew Lunn, Heiner Kallweit, David S . Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Maxime Chevallier
  Cc: netdev, linux-kernel, Bjørn Mork, Simon Horman,
	Jonas Jelonek
In-Reply-To: <20260614133418.2068201-1-jelonek.jonas@gmail.com>

The SFP driver assumes all I2C adapters support reading and writing the
pre-defined block size SFP_EEPROM_BLOCK_SIZE of 16 bytes. This constant
was probably chosen based on good guesses and known limitations of a
range of I2C adapters and SFP modules.

However, I2C adapters may even support less and usually need to specify
this via I2C quirks. Theoretically, such an adapter may provide full
functionality but only support a read and write length of e.g. 8 bytes.
Currently, the SFP driver doesn't account for that.

Add handling for I2C quirks in SFP I2C configuration taking the fields
max_read_len and max_write_len in struct i2c_adapter_quirks into account
to further limit the maximum block size if needed.

Signed-off-by: Jonas Jelonek <jelonek.jonas@gmail.com>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
---
 drivers/net/phy/sfp.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c
index 18f2584dbe7b..3c232991faa1 100644
--- a/drivers/net/phy/sfp.c
+++ b/drivers/net/phy/sfp.c
@@ -810,21 +810,29 @@ static int sfp_smbus_byte_write(struct sfp *sfp, bool a2, u8 dev_addr,
 
 static int sfp_i2c_configure(struct sfp *sfp, struct i2c_adapter *i2c)
 {
+	size_t max_block_size;
+
 	sfp->i2c = i2c;
 
 	if (i2c_check_functionality(i2c, I2C_FUNC_I2C)) {
 		sfp->read = sfp_i2c_read;
 		sfp->write = sfp_i2c_write;
-		sfp->i2c_max_block_size = SFP_EEPROM_BLOCK_SIZE;
+		max_block_size = SFP_EEPROM_BLOCK_SIZE;
 	} else if (i2c_check_functionality(i2c, I2C_FUNC_SMBUS_BYTE_DATA)) {
 		sfp->read = sfp_smbus_byte_read;
 		sfp->write = sfp_smbus_byte_write;
-		sfp->i2c_max_block_size = 1;
+		max_block_size = 1;
 	} else {
 		sfp->i2c = NULL;
 		return -EINVAL;
 	}
 
+	if (i2c->quirks && i2c->quirks->max_read_len)
+		max_block_size = min(max_block_size, i2c->quirks->max_read_len);
+	if (i2c->quirks && i2c->quirks->max_write_len)
+		max_block_size = min(max_block_size, i2c->quirks->max_write_len);
+
+	sfp->i2c_max_block_size = max_block_size;
 	sfp->i2c_block_size = sfp->i2c_max_block_size;
 	return 0;
 }
-- 
2.51.0


^ permalink raw reply related

* [PATCH net-next v11 2/2] net: sfp: extend SMBus support
From: Jonas Jelonek @ 2026-06-14 13:34 UTC (permalink / raw)
  To: Russell King, Andrew Lunn, Heiner Kallweit, David S . Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Maxime Chevallier
  Cc: netdev, linux-kernel, Bjørn Mork, Simon Horman,
	Jonas Jelonek
In-Reply-To: <20260614133418.2068201-1-jelonek.jonas@gmail.com>

Commit 7662abf4db94 ("net: phy: sfp: Add support for SMBus module access")
added SMBus access for SFP modules, but limited it to single-byte
transfers. As a side effect, hwmon is disabled (16-bit reads cannot be
guaranteed atomic) and a warning is printed.

Many SMBus-only I2C controllers in the wild support more than just
byte access, and SFP cages are often wired to such controllers
rather than to a full-featured I2C controller -- e.g. the SMBus
controllers in the Realtek longan and mango SoCs, which advertise
word access and I2C block reads. Today, they cannot drive an SFP at
all without falling back to the byte-only path.

Extend sfp_smbus_read()/sfp_smbus_write() so that, in addition to
the existing byte access, they also use SMBus word access and SMBus
I2C block access whenever the adapter advertises them. Both
directions are handled in a single read and a single write helper
that pick the largest supported transfer per chunk and fall back as
needed.

I2C-block is preferred unconditionally when available: the protocol
carries any length 1..32, so it can serve every chunk -- including
the 1- and 2-byte tails -- without help from word or byte access.
Note that this requires I2C_FUNC_SMBUS_I2C_BLOCK, which reads a
caller-specified number of bytes. This deviates from the official
SMBus Block Read (length is supplied by the slave) but is widely
supported by Linux I2C controllers/drivers.

Capability matrix this implementation supports:

  - BYTE only:                  works (unchanged behaviour); 1-byte
                                xfers, hwmon disabled.
  - BYTE + WORD:                word for >=2-byte chunks, byte for
                                trailing odd byte.
  - I2C_BLOCK present (with or
    without BYTE/WORD):         block as the universal transport for
                                every chunk.
  - WORD only (no BYTE/BLOCK):  accepted with WARN_ONCE. Even-length
                                transfers work; odd-length transfers
                                (e.g. the 3-byte cotsworks fixup
                                write) hit the BYTE branch which the
                                adapter does not implement, so the
                                xfer returns an error and the
                                operation is aborted. No mainline
                                I2C driver was found to advertise
                                WORD without BYTE; the warning lets
                                us learn about it if it ever shows
                                up.

Adapters with asymmetric R/W capabilities (e.g. only READ_I2C_BLOCK
but not WRITE_I2C_BLOCK) remain functionally correct -- the
per-iteration fallback uses the direction-specific bits -- but the
shared i2c_max_block_size is sized by the all-bits-set check, so a
transfer in the better-supported direction is not upgraded. None of
the mainline I2C bus drivers surveyed during review advertise such
asymmetry; promoting i2c_max_block_size to per-direction sizes can
be revisited if needed.

Signed-off-by: Jonas Jelonek <jelonek.jonas@gmail.com>
---
 drivers/net/phy/sfp.c | 139 +++++++++++++++++++++++++++++++++---------
 1 file changed, 111 insertions(+), 28 deletions(-)

diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c
index 3c232991faa1..f8136c935808 100644
--- a/drivers/net/phy/sfp.c
+++ b/drivers/net/phy/sfp.c
@@ -14,6 +14,7 @@
 #include <linux/platform_device.h>
 #include <linux/rtnetlink.h>
 #include <linux/slab.h>
+#include <linux/unaligned.h>
 #include <linux/workqueue.h>
 
 #include "sfp.h"
@@ -759,50 +760,113 @@ static int sfp_i2c_write(struct sfp *sfp, bool a2, u8 dev_addr, void *buf,
 	return ret == ARRAY_SIZE(msgs) ? len : 0;
 }
 
-static int sfp_smbus_byte_read(struct sfp *sfp, bool a2, u8 dev_addr,
-			       void *buf, size_t len)
+static int sfp_smbus_read(struct sfp *sfp, bool a2, u8 dev_addr, void *buf,
+			  size_t len)
 {
-	union i2c_smbus_data smbus_data;
+	union i2c_smbus_data smbus_data = {0};
 	u8 bus_addr = a2 ? 0x51 : 0x50;
+	size_t this_len, transferred;
+	u32 functionality;
 	u8 *data = buf;
 	int ret;
 
-	while (len) {
-		ret = i2c_smbus_xfer(sfp->i2c, bus_addr, 0,
-				     I2C_SMBUS_READ, dev_addr,
-				     I2C_SMBUS_BYTE_DATA, &smbus_data);
-		if (ret < 0)
-			return ret;
+	functionality = i2c_get_functionality(sfp->i2c);
 
-		*data = smbus_data.byte;
+	while (len) {
+		this_len = min(len, sfp->i2c_block_size);
+
+		if (functionality & I2C_FUNC_SMBUS_READ_I2C_BLOCK) {
+			smbus_data.block[0] = this_len;
+			ret = i2c_smbus_xfer(sfp->i2c, bus_addr, 0,
+					     I2C_SMBUS_READ, dev_addr,
+					     I2C_SMBUS_I2C_BLOCK_DATA, &smbus_data);
+			if (ret < 0)
+				return ret;
+
+			transferred = min_t(size_t, smbus_data.block[0], this_len);
+			if (!transferred)
+				return -EIO;
+
+			memcpy(data, &smbus_data.block[1], transferred);
+		} else if (this_len >= 2 &&
+			   (functionality & I2C_FUNC_SMBUS_READ_WORD_DATA)) {
+			ret = i2c_smbus_xfer(sfp->i2c, bus_addr, 0,
+					     I2C_SMBUS_READ, dev_addr,
+					     I2C_SMBUS_WORD_DATA, &smbus_data);
+			if (ret < 0)
+				return ret;
+
+			put_unaligned_le16(smbus_data.word, data);
+			transferred = 2;
+		} else {
+			ret = i2c_smbus_xfer(sfp->i2c, bus_addr, 0,
+					     I2C_SMBUS_READ, dev_addr,
+					     I2C_SMBUS_BYTE_DATA, &smbus_data);
+			if (ret < 0)
+				return ret;
+
+			*data = smbus_data.byte;
+			transferred = 1;
+		}
 
-		len--;
-		data++;
-		dev_addr++;
+		data += transferred;
+		len -= transferred;
+		dev_addr += transferred;
 	}
 
 	return data - (u8 *)buf;
 }
 
-static int sfp_smbus_byte_write(struct sfp *sfp, bool a2, u8 dev_addr,
-				void *buf, size_t len)
+static int sfp_smbus_write(struct sfp *sfp, bool a2, u8 dev_addr, void *buf,
+			   size_t len)
 {
 	union i2c_smbus_data smbus_data;
 	u8 bus_addr = a2 ? 0x51 : 0x50;
+	size_t this_len, transferred;
+	u32 functionality;
 	u8 *data = buf;
 	int ret;
 
+	functionality = i2c_get_functionality(sfp->i2c);
+
 	while (len) {
-		smbus_data.byte = *data;
-		ret = i2c_smbus_xfer(sfp->i2c, bus_addr, 0,
-				     I2C_SMBUS_WRITE, dev_addr,
-				     I2C_SMBUS_BYTE_DATA, &smbus_data);
-		if (ret)
-			return ret;
+		this_len = min(len, sfp->i2c_block_size);
+
+		if (functionality & I2C_FUNC_SMBUS_WRITE_I2C_BLOCK) {
+			smbus_data.block[0] = this_len;
+			memcpy(&smbus_data.block[1], data, this_len);
+
+			ret = i2c_smbus_xfer(sfp->i2c, bus_addr, 0,
+					     I2C_SMBUS_WRITE, dev_addr,
+					     I2C_SMBUS_I2C_BLOCK_DATA, &smbus_data);
+			if (ret < 0)
+				return ret;
+
+			transferred = this_len;
+		} else if (this_len >= 2 &&
+			   (functionality & I2C_FUNC_SMBUS_WRITE_WORD_DATA)) {
+			smbus_data.word = get_unaligned_le16(data);
+			ret = i2c_smbus_xfer(sfp->i2c, bus_addr, 0,
+					     I2C_SMBUS_WRITE, dev_addr,
+					     I2C_SMBUS_WORD_DATA, &smbus_data);
+			if (ret < 0)
+				return ret;
+
+			transferred = 2;
+		} else {
+			smbus_data.byte = *data;
+			ret = i2c_smbus_xfer(sfp->i2c, bus_addr, 0,
+					     I2C_SMBUS_WRITE, dev_addr,
+					     I2C_SMBUS_BYTE_DATA, &smbus_data);
+			if (ret < 0)
+				return ret;
+
+			transferred = 1;
+		}
 
-		len--;
-		data++;
-		dev_addr++;
+		data += transferred;
+		len -= transferred;
+		dev_addr += transferred;
 	}
 
 	return data - (u8 *)buf;
@@ -818,10 +882,29 @@ static int sfp_i2c_configure(struct sfp *sfp, struct i2c_adapter *i2c)
 		sfp->read = sfp_i2c_read;
 		sfp->write = sfp_i2c_write;
 		max_block_size = SFP_EEPROM_BLOCK_SIZE;
-	} else if (i2c_check_functionality(i2c, I2C_FUNC_SMBUS_BYTE_DATA)) {
-		sfp->read = sfp_smbus_byte_read;
-		sfp->write = sfp_smbus_byte_write;
-		max_block_size = 1;
+	} else if (i2c_check_functionality(i2c, I2C_FUNC_SMBUS_BYTE_DATA) ||
+		   i2c_check_functionality(i2c, I2C_FUNC_SMBUS_I2C_BLOCK)) {
+		/* Either protocol alone covers any length: I2C-block carries
+		 * 1..32 bytes per xfer, byte iterates one byte at a time.
+		 */
+		sfp->read = sfp_smbus_read;
+		sfp->write = sfp_smbus_write;
+
+		if (i2c_check_functionality(i2c, I2C_FUNC_SMBUS_I2C_BLOCK))
+			max_block_size = SFP_EEPROM_BLOCK_SIZE;
+		else if (i2c_check_functionality(i2c, I2C_FUNC_SMBUS_WORD_DATA))
+			max_block_size = 2;
+		else
+			max_block_size = 1;
+	} else if (WARN_ONCE(i2c_check_functionality(i2c, I2C_FUNC_SMBUS_WORD_DATA),
+			     "SMBus word-only adapter; odd-length transfers will fail\n")) {
+		/* Word-only: even-length xfers work; odd-length xfers fall
+		 * to BYTE, which the adapter does not advertise and will
+		 * likely fail.
+		 */
+		sfp->read = sfp_smbus_read;
+		sfp->write = sfp_smbus_write;
+		max_block_size = 2;
 	} else {
 		sfp->i2c = NULL;
 		return -EINVAL;
-- 
2.51.0


^ permalink raw reply related

* Re: [PATCH net-next v7 00/11] BIG TCP for UDP tunnels
From: patchwork-bot+netdevbpf @ 2026-06-14 13:50 UTC (permalink / raw)
  To: Alice Mikityanska
  Cc: daniel, davem, edumazet, kuba, pabeni, lucien.xin,
	willemdebruijn.kernel, willemb, dsahern, razor, shuah, stfomichev,
	andrew+netdev, horms, fw, netdev, alice
In-Reply-To: <20260611192955.604661-1-alice.kernel@fastmail.im>

Hello:

This series was applied to netdev/net-next.git (main)
by Paolo Abeni <pabeni@redhat.com>:

On Thu, 11 Jun 2026 21:29:44 +0200 you wrote:
> From: Alice Mikityanska <alice@isovalent.com>
> 
> This series is a follow-up to "BIG TCP without HBH in IPv6", and it adds
> support for BIG TCP IPv4/IPv6 workloads in vxlan and geneve. Now that
> IPv6 BIG TCP doesn't require stripping the HBH in all various
> combinations in tunneled traffic, adding BIG TCP becomes feasible.
> 
> [...]

Here is the summary with links:
  - [net-next,v7,01/11] net/sched: act_csum: don't mangle UDP tunnel GSO packets
    https://git.kernel.org/netdev/net-next/c/9bcb30b389ec
  - [net-next,v7,02/11] geneve: Fix off-by-one comparing with GRO_LEGACY_MAX_SIZE
    https://git.kernel.org/netdev/net-next/c/2319688890d9
  - [net-next,v7,03/11] net: Use helpers to get/set UDP len tree-wide
    (no matching commit)
  - [net-next,v7,04/11] net: Enable BIG TCP with partial GSO
    (no matching commit)
  - [net-next,v7,05/11] udp: Support BIG TCP GSO packets where they can occur
    (no matching commit)
  - [net-next,v7,06/11] udp: Support gro_ipv4_max_size > 65536
    (no matching commit)
  - [net-next,v7,07/11] udp: Validate UDP length in udp_gro_receive
    (no matching commit)
  - [net-next,v7,08/11] udp: Set length in UDP header to 0 for big GSO packets
    (no matching commit)
  - [net-next,v7,09/11] vxlan: Enable BIG TCP packets
    (no matching commit)
  - [net-next,v7,10/11] geneve: Enable BIG TCP packets
    (no matching commit)
  - [net-next,v7,11/11] selftests: net: Add a test for BIG TCP in UDP tunnels
    (no matching commit)

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply

* Re: [PATCH net-next v13 1/2] tcp: rehash onto different local ECMP path on retransmit timeout
From: Paolo Abeni @ 2026-06-14 13:54 UTC (permalink / raw)
  To: Neil Spring, netdev
  Cc: edumazet, ncardwell, kuniyu, davem, kuba, dsahern, horms, shuah,
	linux-kselftest, bpf, martin.lau, daniel
In-Reply-To: <20260612010047.1377331-2-ntspring@meta.com>

On 6/12/26 3:00 AM, Neil Spring wrote:
> Currently sk_rethink_txhash() re-rolls the socket's txhash on RTO, PLB,
> and spurious-retransmission events, but the cached route is reused and
> the new hash is not propagated into the ECMP path selection logic.  Two
> changes are needed to make rehash select a different local ECMP path:
> 
> 1. Add __sk_dst_reset() alongside sk_rethink_txhash() in
>    tcp_write_timeout(), tcp_rcv_spurious_retrans(), and
>    tcp_plb_check_rehash() so the cached dst is invalidated and the
>    next transmit triggers a fresh route lookup.
> 
> 2. Set fl6->mp_hash from sk_txhash (or tcp_rsk(req)->txhash for
>    SYN/ACK retransmits and syncookies) in tcp_v6_connect(),
>    inet6_sk_rebuild_header(), inet6_csk_route_req(),
>    inet6_csk_route_socket(), tcp_v6_send_response(), and
>    cookie_v6_check() so fib6_select_path() picks a path based on the
>    new hash.
> 
> The mp_hash override only applies to fib_multipath_hash_policy 0 (the
> default L3 policy).  Its hash includes the flow label, but that is 0 by
> default -- np->flow_label is unset, and auto_flowlabels only computes
> the on-wire label later, per packet -- so flows to the same peer share
> one local path.  Keying the hash on sk_txhash makes the local path
> per-connection and lets a rehash re-select it.  Policies 1-3 are left
> unchanged.
> 
> The mp_hash assignment is factored into a small helper,
> ip6_ecmp_set_mp_hash(), shared by inet6_csk_route_req(),
> inet6_csk_route_socket(), tcp_v6_connect(), inet6_sk_rebuild_header(),
> tcp_v6_send_response(), and cookie_v6_check().  It applies
> (txhash >> 1) ?: 1 for policy 0 (the >> 1 keeps mp_hash in the 31-bit
> range; ?: 1 keeps it non-zero, since 0 would fall back to
> rt6_multipath_hash()).  inet6_csk_route_socket() calls it only for
> sk_protocol == IPPROTO_TCP so that non-TCP callers (e.g., L2TP via
> inet6_csk_xmit) fall through to rt6_multipath_hash() and retain their
> existing flow-key-based ECMP behavior.
> 
> tcp_v6_send_response() also sets mp_hash from the response txhash so
> that a control packet (a RST from the full socket, or an ACK from a
> time-wait socket) selects the same local ECMP nexthop as the
> connection's txhash rather than falling back to the flow hash.  The
> time-wait socket's tw_txhash is copied from sk_txhash when the
> connection enters TIME_WAIT, so it reflects any rehash that occurred.
> 
> Setting mp_hash explicitly is necessary because the default ECMP hash
> derives from fl6->flowlabel via np->flow_label, which is not updated
> from sk_txhash (REPFLOW is off by default).  ip6_make_flowlabel()
> cannot help either, as it runs after the route lookup.
> 
> As a consequence, for policy 0 the local ECMP path of an IPv6 TCP
> flow follows sk_txhash even when fl6->flowlabel is non-zero, e.g. a
> reflected (REPFLOW) or explicitly set (IPV6_FLOWLABEL_MGR) flow
> label.  This is intentional: only local path selection changes, so
> rehash can recover from a failed path; the on-wire flow label is
> unchanged.
> 
> sk_set_txhash() is moved before ip6_dst_lookup_flow() in
> tcp_v6_connect() so the initial ECMP path is selected by the same
> txhash that subsequent route rebuilds will use.  This avoids
> unintended path changes when the cached dst is naturally invalidated
> (e.g., by PMTU discovery or route changes).
> 
> The rehash sites (tcp_write_timeout(), tcp_plb_check_rehash(), and
> tcp_rcv_spurious_retrans()) call __sk_rethink_txhash_reset_dst(),
> which re-rolls the txhash and, when it changed, drops the cached dst
> so the next transmit re-runs route selection.  The dst reset is
> guarded by sk->sk_family == AF_INET6 since IPv4 ECMP does not
> currently use sk_txhash for path selection.  For IPv4-mapped IPv6
> sockets this produces a redundant dst reset on a cold path
> (RTO/PLB); the subsequent IPv4 route lookup returns the same result.
> The helper is deliberately separate from sk_rethink_txhash() itself:
> dst_negative_advice() calls sk_rethink_txhash() before its own dst op,
> so resetting the dst inside sk_rethink_txhash() would skip that op
> (e.g. rt6_remove_exception_rt()).
> 
> For syncookies, cookie_init_sequence() computes the cookie value
> before route_req() and sets txhash so the SYN-ACK selects the same
> ECMP path that cookie_v6_check() will use when the full socket is
> created.  cookie_tcp_reqsk_init() derives txhash from the cookie so
> the full socket's ECMP path matches the SYN-ACK.  Both the SYN-ACK
> assignment in tcp_conn_request() and the full-socket assignment in
> cookie_tcp_reqsk_init() are keyed on the packet family
> (skb->protocol == ETH_P_IPV6), not sk->sk_family: a dual-stack
> AF_INET6 listener also serves IPv4 connections, and the v4 cookie has
> mssind bits that would bias TX queue distribution if used as txhash.
> IPv4 connections retain net_tx_rndhash().
> 
> cookie_init_sequence() is split from the former version that also
> called tcp_synq_overflow() and incremented SYNCOOKIESSENT; those
> side effects are now in cookie_record_sent(), called after
> route_req() succeeds so they are not bumped when route_req() fails.
> cookie_record_sent() is guarded by CONFIG_SYN_COOKIES to
> match the guard on tcp_synq_overflow().  route_req() receives 0 as
> tw_isn for the syncookie path so that tcp_v6_init_req() still saves
> ireq->pktopts for REPFLOW flowlabel reflection and IPv6 cmsg
> options.  The ecn_ok clear for syncookies without timestamps stays
> after tcp_ecn_create_request() so it takes precedence.
> 
> Signed-off-by: Neil Spring <ntspring@meta.com>

This looks good to me, with a minor commit below.

> diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
> index df479277fb80..cc71d84df42b 100644
> --- a/net/ipv4/syncookies.c
> +++ b/net/ipv4/syncookies.c
> @@ -280,9 +280,18 @@ static int cookie_tcp_reqsk_init(struct sock *sk, struct sk_buff *skb,
>  	treq->snt_synack = 0;
>  	treq->snt_tsval_first = 0;
>  	treq->tfo_listener = false;
> -	treq->txhash = net_tx_rndhash();
>  	treq->rcv_isn = ntohl(th->seq) - 1;
>  	treq->snt_isn = ntohl(th->ack_seq) - 1;
> +	if (skb->protocol == htons(ETH_P_IPV6)) {
> +		/* Use the cookie as txhash so the ECMP path matches
> +		 * the SYN-ACK, where txhash was also set to the
> +		 * cookie.  The original request socket (and its
> +		 * txhash) was freed after sending the SYN-ACK.
> +		 */
> +		treq->txhash = treq->snt_isn;
> +	} else {
> +		treq->txhash = net_tx_rndhash();

I'm wondering if it would make sense always using snt_isn for txhash in
the syn cookie case, regardless of the IP protocol. Beyond reducing the
differences between ipv4 and ipv6 it will make the code a little simpler.

Not a blocker in any case.

Still I think this could deserve an explicit ack from Eric.

/P


^ permalink raw reply

* Re: [PATCH net v3] net: protect egress device access in the output path with rcu_read_lock
From: Paolo Abeni @ 2026-06-14 14:02 UTC (permalink / raw)
  To: Hyunwoo Kim, dsahern, idosch, davem, edumazet, kuba, horms,
	steffen.klassert, herbert, andrew+netdev, kuniyu, jlayton
  Cc: netdev
In-Reply-To: <aitI7sLX8SIC5RqQ@v4bel>

On 6/12/26 1:46 AM, Hyunwoo Kim wrote:
> The locally generated output path reads the egress network device from
> the route attached to the skb (skb_dst(skb)->dev, skb_dst_dev() or
> rt->dst.dev) and uses it as the 'out' device argument of an
> NF_HOOK()/nf_hook() call, or for direct field reads, without holding
> rcu_read_lock().
> 
> dst->dev is protected by RCU. When a device is unregistered its value is
> replaced with blackhole_netdev and the previous device is freed after an
> RCU grace period. A section that reads dst->dev and uses that pointer
> must therefore hold rcu_read_lock(). Otherwise a LOCAL_OUT / POST_ROUTING
> hook consumer (nft meta oif, selinux_ip_postroute_compat, etc.) or an
> early field read can reference a device that is no longer valid when the
> egress device is unregistered concurrently with transmission.
> 
> Rather than taking the lock in each dst->output leaf, take it once at the
> common ip_local_out() and ip6_local_out() level. This covers
> __ip_local_out() / __ip6_local_out() (the LOCAL_OUT hook) and
> dst_output(), and therefore ip_output(), ip_mc_output(), ip_mr_output(),
> xfrm4_output() and vrf_output(), as well as the IPv6 leaves ip6_output(),
> ip6_mr_output(), xfrm6_output() and vrf_output6(), in one place.
> 
> raw_send_hdrinc() and rawv6_send_hdrinc() do not go through
> ip_local_out() / ip6_local_out(); they run their own NF_HOOK() and also
> read the device (mtu, LL_RESERVED_SPACE(), needed_tailroom) before that
> hook, so they take their own rcu_read_lock(). The device fields are read
> under a short rcu_read_lock() at function entry that is dropped before
> the blocking sock_alloc_send_skb(); the NF_HOOK() itself runs under
> rcu_read_lock() (added in raw_send_hdrinc(), already present in
> rawv6_send_hdrinc()).
> 
> xfrm_output_resume() is left unchanged. It reads the device locklessly
> and is only reached either under the rcu_read_lock() held by its
> dst_output() caller (now including ip_local_out() / ip6_local_out() for
> locally generated traffic) or with softirqs disabled: the async crypto
> completion runs in BH-off context (cryptd and padata invoke it under
> local_bh_disable(); hardware crypto drivers complete it from softirq),
> and the IPTFS output runs from a HRTIMER_MODE_REL_SOFT timer. For the
> same reason __ip_local_out() and __ip6_local_out() keep the lockless
> skb_dst_dev() accessor: they are also reached from xfrm_output_resume()
> via ->local_out() in BH-off context, where the rcu_dereference() based
> accessor would trip CONFIG_PROVE_RCU.
> 
> Fixes: 4a6ce2b6f2ec ("net: introduce a new function dst_dev_put()")
> Signed-off-by: Hyunwoo Kim <imv4bel@gmail.com>
> ---
> Changes in v3:
> - Take the lock once at ip_local_out() / ip6_local_out() instead of in
>   each output leaf, and cover the IPv6 path as well (David Ahern, Ido
>   Schimmel).
> - Drop the xfrm_output_resume() change; its resumption paths already run
>   with BH off (Herbert Xu). Its synchronous entry is reached under the
>   rcu held by its dst_output() caller.
> - Also protect the early device reads (mtu, headroom) in
>   raw_send_hdrinc() / rawv6_send_hdrinc(), split so the lock is not held
>   across the blocking allocation.
> - The now redundant rcu_read_lock() nested in ip_output() / ip6_output()
>   will be removed in a follow-up.
> - v2: https://lore.kernel.org/all/ahveCu-zzFlpeVut@v4bel/
> 
> Changes in v2:
> - Changed to a net-wide patch that also fixes the issue in
>   raw_send_hdrinc(), xfrm_output_resume() and vrf_output().
> - v1: https://lore.kernel.org/all/ahqIg6vURwYI0LJ5@v4bel/
> ---
>  net/ipv4/ip_output.c   |  2 ++
>  net/ipv4/raw.c         | 19 ++++++++++++++-----
>  net/ipv6/output_core.c |  2 ++
>  net/ipv6/raw.c         | 16 ++++++++++++----
>  4 files changed, 30 insertions(+), 9 deletions(-)
> 
> diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
> index 5bcd73cbdb41c..26b51ef0763fa 100644
> --- a/net/ipv4/ip_output.c
> +++ b/net/ipv4/ip_output.c
> @@ -126,9 +126,11 @@ int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
>  {
>  	int err;
>  
> +	rcu_read_lock();
>  	err = __ip_local_out(net, sk, skb);
>  	if (likely(err == 1))
>  		err = dst_output(net, sk, skb);
> +	rcu_read_unlock();

IN RH we are observing small but measurable and reproducible regressions
due to increased number of rcu lock pair in the fast-path.
ip_local_out() is invoked by __ip_queue_xmit() already under the RCU lock.

I think it could be usedful either providing an _rcu less variant and
use the latter in __ip_queue_xmit(), or open-code ip_local_out() there.

AFAICS the ipv6 counter part is not in the local output fastpath.

/P


^ permalink raw reply

* Re: [PATCH v2 2/5] binder: Make shrinker rely solely on per-VMA lock
From: Carlos Llamas @ 2026-06-14 14:10 UTC (permalink / raw)
  To: Alice Ryhl
  Cc: Dave Hansen, Suren Baghdasaryan, Vlastimil Babka (SUSE),
	Dave Hansen, linux-kernel, Andrew Morton,
	Arve Hjønnevåg, Christian Brauner, David Ahern,
	David S. Miller, Greg Kroah-Hartman, Liam R. Howlett, linux-mm,
	Lorenzo Stoakes, netdev, Shakeel Butt, Todd Kjos
In-Reply-To: <aixi-DxMuc0MiGeO@google.com>

On Fri, Jun 12, 2026 at 07:50:16PM +0000, Alice Ryhl wrote:
> On Fri, Jun 12, 2026 at 11:47:59AM -0700, Dave Hansen wrote:
> > On 6/12/26 10:44, Suren Baghdasaryan wrote:
> > >> It's not impossible, but I do think it is irrelevant. Or at least that
> > >> the *VMA* is irrelevant in this case. binder_alloc_is_mapped()==false
> > >> means that the binder VMA is gone. It's not in the maple tree, and it's
> > >> not coming back. If a VMA is found, it's an impostor.
> > > Right, but before your change we were bailing out early. With your
> > > change we would be generating the traces and freeing the page. I think
> > > that's a functional change. Was that your intention?
> > 
> > Yeah, it was intentional.
> > 
> > I think the existing behavior is buggy. It also complicates the goal of
> > removing the mmap lock fallback. I've broken that behavior change out
> > into a separate patch. (attached here)
> 
> I think you can just:
> 
> 1. do a lock_vma_under_rcu().
> 2. if it fails, check binder_alloc_is_mapped().
> 3. if still mapped, return LRU_SKIP, otherwise behave like a failed
>    vma_lookup() does today under the mmap read lock.

Right! This is the same suggestion I sent.

...
Also, I would _prefer_ if the commit message was more accurate. The
mmap_lock fallback was there because of "compatibility", as per-vma
locking is technically behind CONFIG_PER_VMA_LOCK. This would be the
only part that IMO describes the actual reason for the change:

> Now that per-VMA locks are universally available, lock_vma_under_rcu()
> will not persistently fail. Rely on it alone and simplify the code.

Cheers,
--
Carlos Llamas

^ permalink raw reply

* [PATCH net-next] net: dsa: sja1105: fix lastused timestamp in flower stats
From: David Yang @ 2026-06-14 14:13 UTC (permalink / raw)
  To: netdev
  Cc: David Yang, Vladimir Oltean, Andrew Lunn, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, linux-kernel

flow_stats_update() takes an absolute timestamp for lastused, not delta.
Fix that.

Signed-off-by: David Yang <mmyangfl@gmail.com>
---
 drivers/net/dsa/sja1105/sja1105_vl.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/dsa/sja1105/sja1105_vl.c b/drivers/net/dsa/sja1105/sja1105_vl.c
index 0ae9cb5ea8d1..e6ba9d4f8d1e 100644
--- a/drivers/net/dsa/sja1105/sja1105_vl.c
+++ b/drivers/net/dsa/sja1105/sja1105_vl.c
@@ -791,8 +791,7 @@ int sja1105_vl_stats(struct sja1105_private *priv, int port,
 	pkts = timingerr + unreleased + lengtherr;
 
 	flow_stats_update(stats, 0, pkts - rule->vl.stats.pkts, 0,
-			  jiffies - rule->vl.stats.lastused,
-			  FLOW_ACTION_HW_STATS_IMMEDIATE);
+			  jiffies, FLOW_ACTION_HW_STATS_IMMEDIATE);
 
 	rule->vl.stats.pkts = pkts;
 	rule->vl.stats.lastused = jiffies;
-- 
2.53.0


^ permalink raw reply related

* [PATCH net] atm: br2684: reject short VC-MUX bridged frames
From: Yizhou Zhao @ 2026-06-14 15:27 UTC (permalink / raw)
  To: netdev
  Cc: Yizhou Zhao, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Simon Horman, Kees Cook, linux-kernel, Yuxiang Yang,
	Ao Wang, Xuewei Feng, Qi Li, Ke Xu, stable

br2684_push() validates the two-byte pad at the start of received
VC-MUX bridged frames with memcmp(), but does not first make sure that
those two bytes are present in the skb.

A short AAL5 PDU can reach this path after a BR2684 VCC is attached with
BR2684_ENCAPS_VC and bridged payload.  If skb->len is 0 or 1, the pad
comparison reads beyond the valid skb data.  When the bytes beyond
skb->len compare as zero, the code then continues toward eth_type_trans()
with the malformed frame.

Reject frames shorter than BR2684_PAD_LEN before checking the pad.  This
keeps the existing validation for valid VC-MUX bridged frames, which must
carry the two-byte pad before the Ethernet header.

Fixes: 7e903c2ae36e ("atm: [br2864] fix routed vcmux support")
Cc: stable@vger.kernel.org
Reported-by: Yizhou Zhao <zhaoyz24@mails.tsinghua.edu.cn>
Reported-by: Yuxiang Yang <yangyx22@mails.tsinghua.edu.cn>
Reported-by: Ao Wang <wangao@seu.edu.cn>
Reported-by: Xuewei Feng <fengxw06@126.com>
Reported-by: Qi Li <qli01@tsinghua.edu.cn>
Reported-by: Ke Xu <xuke@tsinghua.edu.cn>
Assisted-by: GLM:GLM-5.1
Signed-off-by: Yizhou Zhao <zhaoyz24@mails.tsinghua.edu.cn>
---
 net/atm/br2684.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/atm/br2684.c b/net/atm/br2684.c
index 6580d67c3456..07283c475a40 100644
--- a/net/atm/br2684.c
+++ b/net/atm/br2684.c
@@ -491,6 +491,8 @@ static void br2684_push(struct atm_vcc *atmvcc, struct sk_buff *skb)
 			skb->pkt_type = PACKET_HOST;
 		} else { /* p_bridged */
 			/* first 2 chars should be 0 */
+			if (skb->len < BR2684_PAD_LEN)
+				goto error;
 			if (memcmp(skb->data, pad, BR2684_PAD_LEN) != 0)
 				goto error;
 			skb_pull(skb, BR2684_PAD_LEN);

-- 
2.43.0

^ permalink raw reply related

* Re: [PATCH RESEND 1/6] sock: add sock_kzalloc helper
From: Thorsten Blum @ 2026-06-14 15:32 UTC (permalink / raw)
  To: Herbert Xu, David S. Miller, Eric Dumazet, Kuniyuki Iwashima,
	Paolo Abeni, Willem de Bruijn, Jakub Kicinski, Simon Horman
  Cc: linux-crypto, linux-kernel, netdev
In-Reply-To: <20260527082509.1133816-8-thorsten.blum@linux.dev>

On Wed, May 27, 2026 at 10:25:11AM +0200, Thorsten Blum wrote:
> Add sock_kzalloc() helper - the sock equivalent to kzalloc().
> 
> Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
> Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
> ---
> Patch 1/6 needs an Acked-by: from netdev maintainers for the series to
> go through Herbert's crypto tree:
> https://lore.kernel.org/lkml/ahVkZOxZtFes6Huf@gondor.apana.org.au/
> ---
>  include/net/sock.h | 5 +++++
>  1 file changed, 5 insertions(+)
> 
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 76bfd3e56d63..b521bd34ac9f 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -1913,6 +1913,11 @@ void sock_kfree_s(struct sock *sk, void *mem, int size);
>  void sock_kzfree_s(struct sock *sk, void *mem, int size);
>  void sk_send_sigurg(struct sock *sk);
>  
> +static inline void *sock_kzalloc(struct sock *sk, int size, gfp_t priority)
> +{
> +	return sock_kmalloc(sk, size, priority | __GFP_ZERO);
> +}
> +
>  static inline void sock_replace_proto(struct sock *sk, struct proto *proto)
>  {
>  	if (sk->sk_socket)

Gentle ping? Patch 1/6 still needs an ack from netdev maintainers.

Thanks,
Thorsten

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox