Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net v4] net: dsa: Fix skb ownership in taggers
From: Linus Walleij @ 2026-06-22 12:47 UTC (permalink / raw)
  To: Andrew Lunn, Vladimir Oltean, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman, Florian Fainelli,
	Jonas Gorski, Hauke Mehrtens, Kurt Kanzenbach, Woojung Huh,
	UNGLinuxDriver, Chester A. Unal, Daniel Golle, Matthias Brugger,
	AngeloGioacchino Del Regno, Wei Fang, Clark Wang,
	Clément Léger, George McCollister, David Yang
  Cc: netdev, Sashiko AI Review, Linus Walleij

The tag_8021q.c tagger calls vlan_insert_tag() in dsa_8021q_xmit().
vlan_insert_tag() will consume the skb with kfree_skb() on failure
and return NULL.

When ->xmit() then returns NULL as its error indication to
dsa_user_xmit(), the caller frees the same skb again, leading to a
double-free.

The idea of dsa_user_xmit() and dsa_switch_rcv() dropping the skb
they held before the call to ->xmit() and ->rcv() is conceptually
wrong: the pattern elsewhere in the networking code is that consumers
drop their skbs on failure.

Modify the ->xmit() and ->rcv() call sites to not drop the SKB if
the taggers return NULL from any of these calls. Move those drops into
the taggers so every callback error path that retains ownership consumes
the skb before returning NULL.

Keep the existing helper ownership rules: VLAN insertion helpers already
free on failure (this is the case in tag_8021q.c), while deferred
transmit paths either transfer the skb reference to worker context or
hold a worker reference with skb_get() and drop the caller's reference.

For SJA1105 meta RX, transfer the buffered stampable skb under the meta
lock and return NULL while the skb is waiting for its meta frame: the
skb is not dropped in this case.

NOTICE: Backporting patches to taggers (e.g. for stable kernels) after
this point cannot be mechanical or they will introduce double
kfree_skb().

Reported-by: Sashiko AI Review <sashiko-bot@kernel.org>
Closes: https://lore.kernel.org/r/20260610153952.1685895-1-kuba@kernel.org/
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Assisted-by: Codex:gpt-5-5
Acked-by: David Yang <mmyangfl@gmail.com> # yt921x
Acked-by: Kurt Kanzenbach <kurt@linutronix.de> # hellcreek
Reviewed-by: Wei Fang <wei.fang@nxp.com> # netc
Signed-off-by: Linus Walleij <linusw@kernel.org>
---
Changes in v4:
- Add a kfree_skb() on the else{} path of if (likely(skb->dev)) {} after
  skb->dev = dsa_conduit_find_user(dev, 0, port);
  Doing this explicitly rather than keeping the old code is more readable.
- Tag for net now that net-next is closed.
- Link to v3: https://patch.msgid.link/20260617-dsa-fix-free-skb-v3-1-cdd4e0778a39@kernel.org

Changes in v3:
- Simplify __skb_put_padto(skb, ETH_ZLEN, false) and
  skb_put_padto(skb, ETH_ZLEN) to eth_skb_pad().
- Pick up Wei's review tag.
- Link to v2: https://patch.msgid.link/20260616-dsa-fix-free-skb-v2-1-9dbda6a19e97@kernel.org

Changes in v2:
- In some instances __skb_pad() and __skb_put_padto() followed by a
  kfree_skb() could be simplified to just call skb_pad() and
  skb_put_padto() which will free the skb on failure.
- Use a label and goto for the kfree_skb(); return NULL; in
  the netc_rcv() callback in tag_netc.c as requested.
- Collect ACKs.
- Retag for net-next.
- Link to v1: https://patch.msgid.link/20260616-dsa-fix-free-skb-v1-1-fd30b35dcf66@kernel.org
---
 net/dsa/tag.c               |  7 ++++---
 net/dsa/tag_ar9331.c        | 10 ++++++++--
 net/dsa/tag_brcm.c          | 39 ++++++++++++++++++++++++---------------
 net/dsa/tag_dsa.c           | 15 ++++++++++++---
 net/dsa/tag_gswip.c         |  8 ++++++--
 net/dsa/tag_hellcreek.c     |  9 +++++++--
 net/dsa/tag_ksz.c           | 44 +++++++++++++++++++++++++++++++-------------
 net/dsa/tag_lan9303.c       |  2 ++
 net/dsa/tag_mtk.c           |  8 ++++++--
 net/dsa/tag_mxl-gsw1xx.c    |  3 +++
 net/dsa/tag_mxl862xx.c      |  3 +++
 net/dsa/tag_netc.c          | 18 ++++++++++--------
 net/dsa/tag_ocelot.c        |  4 +++-
 net/dsa/tag_ocelot_8021q.c  | 20 +++++++++++++-------
 net/dsa/tag_qca.c           | 14 +++++++++++---
 net/dsa/tag_rtl4_a.c        |  8 ++++++--
 net/dsa/tag_rtl8_4.c        | 24 ++++++++++++++++++------
 net/dsa/tag_rzn1_a5psw.c    |  8 ++++++--
 net/dsa/tag_sja1105.c       | 42 +++++++++++++++++++++++++++---------------
 net/dsa/tag_trailer.c       | 16 ++++++++++++----
 net/dsa/tag_vsc73xx_8021q.c |  1 +
 net/dsa/tag_xrs700x.c       | 12 +++++++++---
 net/dsa/tag_yt921x.c        |  7 ++++++-
 net/dsa/user.c              |  7 +++----
 24 files changed, 231 insertions(+), 98 deletions(-)

diff --git a/net/dsa/tag.c b/net/dsa/tag.c
index 79ad105902d9..107e93250b94 100644
--- a/net/dsa/tag.c
+++ b/net/dsa/tag.c
@@ -79,15 +79,16 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev,
 		if (likely(skb->dev)) {
 			dsa_default_offload_fwd_mark(skb);
 			nskb = skb;
+		} else {
+			/* Just drop the skb if we can't find the user */
+			kfree_skb(skb);
 		}
 	} else {
 		nskb = cpu_dp->rcv(skb, dev);
 	}
 
-	if (!nskb) {
-		kfree_skb(skb);
+	if (!nskb)
 		return 0;
-	}
 
 	skb = nskb;
 	skb_push(skb, ETH_HLEN);
diff --git a/net/dsa/tag_ar9331.c b/net/dsa/tag_ar9331.c
index cbb588ca73aa..2e2388143b02 100644
--- a/net/dsa/tag_ar9331.c
+++ b/net/dsa/tag_ar9331.c
@@ -51,8 +51,10 @@ static struct sk_buff *ar9331_tag_rcv(struct sk_buff *skb,
 	u8 ver, port;
 	u16 hdr;
 
-	if (unlikely(!pskb_may_pull(skb, AR9331_HDR_LEN)))
+	if (unlikely(!pskb_may_pull(skb, AR9331_HDR_LEN))) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	hdr = le16_to_cpu(*(__le16 *)skb_mac_header(skb));
 
@@ -60,12 +62,14 @@ static struct sk_buff *ar9331_tag_rcv(struct sk_buff *skb,
 	if (unlikely(ver != AR9331_HDR_VERSION)) {
 		netdev_warn_once(ndev, "%s:%i wrong header version 0x%2x\n",
 				 __func__, __LINE__, hdr);
+		kfree_skb(skb);
 		return NULL;
 	}
 
 	if (unlikely(hdr & AR9331_HDR_FROM_CPU)) {
 		netdev_warn_once(ndev, "%s:%i packet should not be from cpu 0x%2x\n",
 				 __func__, __LINE__, hdr);
+		kfree_skb(skb);
 		return NULL;
 	}
 
@@ -75,8 +79,10 @@ static struct sk_buff *ar9331_tag_rcv(struct sk_buff *skb,
 	port = FIELD_GET(AR9331_HDR_PORT_NUM_MASK, hdr);
 
 	skb->dev = dsa_conduit_find_user(ndev, 0, port);
-	if (!skb->dev)
+	if (!skb->dev) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	return skb;
 }
diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c
index cf9420439054..411e3b57d16a 100644
--- a/net/dsa/tag_brcm.c
+++ b/net/dsa/tag_brcm.c
@@ -102,9 +102,9 @@ static struct sk_buff *brcm_tag_xmit_ll(struct sk_buff *skb,
 	 * (including FCS and tag) because the length verification is done after
 	 * the Broadcom tag is stripped off the ingress packet.
 	 *
-	 * Let dsa_user_xmit() free the SKB
+	 * Free the SKB on error.
 	 */
-	if (__skb_put_padto(skb, ETH_ZLEN + BRCM_TAG_LEN, false))
+	if (skb_put_padto(skb, ETH_ZLEN + BRCM_TAG_LEN))
 		return NULL;
 
 	skb_push(skb, BRCM_TAG_LEN);
@@ -151,27 +151,35 @@ static struct sk_buff *brcm_tag_rcv_ll(struct sk_buff *skb,
 	int source_port;
 	u8 *brcm_tag;
 
-	if (unlikely(!pskb_may_pull(skb, BRCM_TAG_LEN)))
+	if (unlikely(!pskb_may_pull(skb, BRCM_TAG_LEN))) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	brcm_tag = skb->data - offset;
 
 	/* The opcode should never be different than 0b000 */
-	if (unlikely((brcm_tag[0] >> BRCM_OPCODE_SHIFT) & BRCM_OPCODE_MASK))
+	if (unlikely((brcm_tag[0] >> BRCM_OPCODE_SHIFT) & BRCM_OPCODE_MASK)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	/* We should never see a reserved reason code without knowing how to
 	 * handle it
 	 */
-	if (unlikely(brcm_tag[2] & BRCM_EG_RC_RSVD))
+	if (unlikely(brcm_tag[2] & BRCM_EG_RC_RSVD)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	/* Locate which port this is coming from */
 	source_port = brcm_tag[3] & BRCM_EG_PID_MASK;
 
 	skb->dev = dsa_conduit_find_user(dev, 0, source_port);
-	if (!skb->dev)
+	if (!skb->dev) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	/* Remove Broadcom tag and update checksum */
 	skb_pull_rcsum(skb, BRCM_TAG_LEN);
@@ -228,8 +236,10 @@ static struct sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb,
 	__be16 *proto;
 	u8 *brcm_tag;
 
-	if (unlikely(!pskb_may_pull(skb, BRCM_LEG_TAG_LEN + VLAN_HLEN)))
+	if (unlikely(!pskb_may_pull(skb, BRCM_LEG_TAG_LEN + VLAN_HLEN))) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	brcm_tag = dsa_etype_header_pos_rx(skb);
 	proto = (__be16 *)(brcm_tag + BRCM_LEG_TAG_LEN);
@@ -237,8 +247,10 @@ static struct sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb,
 	source_port = brcm_tag[5] & BRCM_LEG_PORT_ID;
 
 	skb->dev = dsa_conduit_find_user(dev, 0, source_port);
-	if (!skb->dev)
+	if (!skb->dev) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	/* The internal switch in BCM63XX SoCs always tags on egress on the CPU
 	 * port. We use VID 0 internally for untagged traffic, so strip the tag
@@ -273,10 +285,8 @@ static struct sk_buff *brcm_leg_tag_xmit(struct sk_buff *skb,
 	 * need to make sure that packets are at least 70 bytes
 	 * (including FCS and tag) because the length verification is done after
 	 * the Broadcom tag is stripped off the ingress packet.
-	 *
-	 * Let dsa_user_xmit() free the SKB
 	 */
-	if (__skb_put_padto(skb, ETH_ZLEN + BRCM_LEG_TAG_LEN, false))
+	if (skb_put_padto(skb, ETH_ZLEN + BRCM_LEG_TAG_LEN))
 		return NULL;
 
 	skb_push(skb, BRCM_LEG_TAG_LEN);
@@ -325,10 +335,8 @@ static struct sk_buff *brcm_leg_fcs_tag_xmit(struct sk_buff *skb,
 	 * need to make sure that packets are at least 70 bytes (including FCS
 	 * and tag) because the length verification is done after the Broadcom
 	 * tag is stripped off the ingress packet.
-	 *
-	 * Let dsa_user_xmit() free the SKB.
 	 */
-	if (__skb_put_padto(skb, ETH_ZLEN + BRCM_LEG_TAG_LEN, false))
+	if (skb_put_padto(skb, ETH_ZLEN + BRCM_LEG_TAG_LEN))
 		return NULL;
 
 	fcs_len = skb->len;
@@ -351,8 +359,9 @@ static struct sk_buff *brcm_leg_fcs_tag_xmit(struct sk_buff *skb,
 	brcm_tag[5] = dp->index & BRCM_LEG_PORT_ID;
 
 	/* Original FCS value */
-	if (__skb_pad(skb, ETH_FCS_LEN, false))
+	if (skb_pad(skb, ETH_FCS_LEN))
 		return NULL;
+
 	skb_put_data(skb, &fcs_val, ETH_FCS_LEN);
 
 	return skb;
diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c
index 2a2c4fb61a65..d5ffee35fbb5 100644
--- a/net/dsa/tag_dsa.c
+++ b/net/dsa/tag_dsa.c
@@ -224,6 +224,7 @@ static struct sk_buff *dsa_rcv_ll(struct sk_buff *skb, struct net_device *dev,
 			/* Remote management is not implemented yet,
 			 * drop.
 			 */
+			kfree_skb(skb);
 			return NULL;
 		case DSA_CODE_ARP_MIRROR:
 		case DSA_CODE_POLICY_MIRROR:
@@ -244,12 +245,14 @@ static struct sk_buff *dsa_rcv_ll(struct sk_buff *skb, struct net_device *dev,
 			/* Reserved code, this could be anything. Drop
 			 * seems like the safest option.
 			 */
+			kfree_skb(skb);
 			return NULL;
 		}
 
 		break;
 
 	default:
+		kfree_skb(skb);
 		return NULL;
 	}
 
@@ -271,8 +274,10 @@ static struct sk_buff *dsa_rcv_ll(struct sk_buff *skb, struct net_device *dev,
 						 source_port);
 	}
 
-	if (!skb->dev)
+	if (!skb->dev) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	/* When using LAG offload, skb->dev is not a DSA user interface,
 	 * so we cannot call dsa_default_offload_fwd_mark and we need to
@@ -335,8 +340,10 @@ static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev)
 
 static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev)
 {
-	if (unlikely(!pskb_may_pull(skb, DSA_HLEN)))
+	if (unlikely(!pskb_may_pull(skb, DSA_HLEN))) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	return dsa_rcv_ll(skb, dev, 0);
 }
@@ -375,8 +382,10 @@ static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev)
 
 static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev)
 {
-	if (unlikely(!pskb_may_pull(skb, EDSA_HLEN)))
+	if (unlikely(!pskb_may_pull(skb, EDSA_HLEN))) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	skb_pull_rcsum(skb, EDSA_HLEN - DSA_HLEN);
 
diff --git a/net/dsa/tag_gswip.c b/net/dsa/tag_gswip.c
index 5fa436121087..5c407d448c9f 100644
--- a/net/dsa/tag_gswip.c
+++ b/net/dsa/tag_gswip.c
@@ -80,16 +80,20 @@ static struct sk_buff *gswip_tag_rcv(struct sk_buff *skb,
 	int port;
 	u8 *gswip_tag;
 
-	if (unlikely(!pskb_may_pull(skb, GSWIP_RX_HEADER_LEN)))
+	if (unlikely(!pskb_may_pull(skb, GSWIP_RX_HEADER_LEN))) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	gswip_tag = skb->data - ETH_HLEN;
 
 	/* Get source port information */
 	port = (gswip_tag[7] & GSWIP_RX_SPPID_MASK) >> GSWIP_RX_SPPID_SHIFT;
 	skb->dev = dsa_conduit_find_user(dev, 0, port);
-	if (!skb->dev)
+	if (!skb->dev) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	/* remove GSWIP tag */
 	skb_pull_rcsum(skb, GSWIP_RX_HEADER_LEN);
diff --git a/net/dsa/tag_hellcreek.c b/net/dsa/tag_hellcreek.c
index 544ab15685a2..dd9f328f3182 100644
--- a/net/dsa/tag_hellcreek.c
+++ b/net/dsa/tag_hellcreek.c
@@ -27,8 +27,10 @@ static struct sk_buff *hellcreek_xmit(struct sk_buff *skb,
 	 * checksums after the switch strips the tag.
 	 */
 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
-	    skb_checksum_help(skb))
+	    skb_checksum_help(skb)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	/* Tag encoding */
 	tag  = skb_put(skb, HELLCREEK_TAG_LEN);
@@ -47,11 +49,14 @@ static struct sk_buff *hellcreek_rcv(struct sk_buff *skb,
 	skb->dev = dsa_conduit_find_user(dev, 0, port);
 	if (!skb->dev) {
 		netdev_warn_once(dev, "Failed to get source port: %d\n", port);
+		kfree_skb(skb);
 		return NULL;
 	}
 
-	if (pskb_trim_rcsum(skb, skb->len - HELLCREEK_TAG_LEN))
+	if (pskb_trim_rcsum(skb, skb->len - HELLCREEK_TAG_LEN)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	dsa_default_offload_fwd_mark(skb);
 
diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c
index d2475c3bbb7d..67fa89f102e0 100644
--- a/net/dsa/tag_ksz.c
+++ b/net/dsa/tag_ksz.c
@@ -88,11 +88,15 @@ static struct sk_buff *ksz_common_rcv(struct sk_buff *skb,
 				      unsigned int port, unsigned int len)
 {
 	skb->dev = dsa_conduit_find_user(dev, 0, port);
-	if (!skb->dev)
+	if (!skb->dev) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
-	if (pskb_trim_rcsum(skb, skb->len - len))
+	if (pskb_trim_rcsum(skb, skb->len - len)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	dsa_default_offload_fwd_mark(skb);
 
@@ -123,8 +127,10 @@ static struct sk_buff *ksz8795_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct ethhdr *hdr;
 	u8 *tag;
 
-	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
+	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	/* Tag encoding */
 	tag = skb_put(skb, KSZ_INGRESS_TAG_LEN);
@@ -141,8 +147,10 @@ static struct sk_buff *ksz8795_rcv(struct sk_buff *skb, struct net_device *dev)
 {
 	u8 *tag;
 
-	if (skb_linearize(skb))
+	if (skb_linearize(skb)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN;
 
@@ -255,22 +263,24 @@ static struct sk_buff *ksz_defer_xmit(struct dsa_port *dp, struct sk_buff *skb)
 	xmit_work_fn = tagger_data->xmit_work_fn;
 	xmit_worker = priv->xmit_worker;
 
-	if (!xmit_work_fn || !xmit_worker)
+	if (!xmit_work_fn || !xmit_worker) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	xmit_work = kzalloc_obj(*xmit_work, GFP_ATOMIC);
-	if (!xmit_work)
+	if (!xmit_work) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	kthread_init_work(&xmit_work->work, xmit_work_fn);
-	/* Increase refcount so the kfree_skb in dsa_user_xmit
-	 * won't really free the packet.
-	 */
 	xmit_work->dp = dp;
 	xmit_work->skb = skb_get(skb);
 
 	kthread_queue_work(xmit_worker, &xmit_work->work);
 
+	kfree_skb(skb);
 	return NULL;
 }
 
@@ -284,8 +294,10 @@ static struct sk_buff *ksz9477_xmit(struct sk_buff *skb,
 	__be16 *tag;
 	u16 val;
 
-	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
+	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	/* Tag encoding */
 	ksz_xmit_timestamp(dp, skb);
@@ -310,8 +322,10 @@ static struct sk_buff *ksz9477_rcv(struct sk_buff *skb, struct net_device *dev)
 	unsigned int port;
 	u8 *tag;
 
-	if (skb_linearize(skb))
+	if (skb_linearize(skb)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	/* Tag decoding */
 	tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN;
@@ -352,8 +366,10 @@ static struct sk_buff *ksz9893_xmit(struct sk_buff *skb,
 	struct ethhdr *hdr;
 	u8 *tag;
 
-	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
+	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	/* Tag encoding */
 	ksz_xmit_timestamp(dp, skb);
@@ -418,8 +434,10 @@ static struct sk_buff *lan937x_xmit(struct sk_buff *skb,
 	__be16 *tag;
 	u16 val;
 
-	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
+	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	ksz_xmit_timestamp(dp, skb);
 
diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c
index 258e5d7dc5ef..d1194696499a 100644
--- a/net/dsa/tag_lan9303.c
+++ b/net/dsa/tag_lan9303.c
@@ -85,6 +85,7 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev)
 	if (unlikely(!pskb_may_pull(skb, LAN9303_TAG_LEN))) {
 		dev_warn_ratelimited(&dev->dev,
 				     "Dropping packet, cannot pull\n");
+		kfree_skb(skb);
 		return NULL;
 	}
 
@@ -102,6 +103,7 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev)
 	skb->dev = dsa_conduit_find_user(dev, 0, source_port);
 	if (!skb->dev) {
 		dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid source port\n");
+		kfree_skb(skb);
 		return NULL;
 	}
 
diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c
index dea3eecaf093..c7dc7731675e 100644
--- a/net/dsa/tag_mtk.c
+++ b/net/dsa/tag_mtk.c
@@ -72,8 +72,10 @@ static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev)
 	int port;
 	__be16 *phdr;
 
-	if (unlikely(!pskb_may_pull(skb, MTK_HDR_LEN)))
+	if (unlikely(!pskb_may_pull(skb, MTK_HDR_LEN))) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	phdr = dsa_etype_header_pos_rx(skb);
 	hdr = ntohs(*phdr);
@@ -87,8 +89,10 @@ static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev)
 	port = (hdr & MTK_HDR_RECV_SOURCE_PORT_MASK);
 
 	skb->dev = dsa_conduit_find_user(dev, 0, port);
-	if (!skb->dev)
+	if (!skb->dev) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	dsa_default_offload_fwd_mark(skb);
 
diff --git a/net/dsa/tag_mxl-gsw1xx.c b/net/dsa/tag_mxl-gsw1xx.c
index 60f7c445e656..4b1b6ef94196 100644
--- a/net/dsa/tag_mxl-gsw1xx.c
+++ b/net/dsa/tag_mxl-gsw1xx.c
@@ -73,6 +73,7 @@ static struct sk_buff *gsw1xx_tag_rcv(struct sk_buff *skb,
 
 	if (unlikely(!pskb_may_pull(skb, GSW1XX_HEADER_LEN))) {
 		dev_warn_ratelimited(&dev->dev, "Dropping packet, cannot pull SKB\n");
+		kfree_skb(skb);
 		return NULL;
 	}
 
@@ -81,6 +82,7 @@ static struct sk_buff *gsw1xx_tag_rcv(struct sk_buff *skb,
 	if (unlikely(ntohs(gsw1xx_tag[0]) != ETH_P_MXLGSW)) {
 		dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid special tag\n");
 		dev_warn_ratelimited(&dev->dev, "Tag: %8ph\n", gsw1xx_tag);
+		kfree_skb(skb);
 		return NULL;
 	}
 
@@ -90,6 +92,7 @@ static struct sk_buff *gsw1xx_tag_rcv(struct sk_buff *skb,
 	if (!skb->dev) {
 		dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid source port\n");
 		dev_warn_ratelimited(&dev->dev, "Tag: %8ph\n", gsw1xx_tag);
+		kfree_skb(skb);
 		return NULL;
 	}
 
diff --git a/net/dsa/tag_mxl862xx.c b/net/dsa/tag_mxl862xx.c
index 8daefeb8d49d..87b80ddf0946 100644
--- a/net/dsa/tag_mxl862xx.c
+++ b/net/dsa/tag_mxl862xx.c
@@ -64,6 +64,7 @@ static struct sk_buff *mxl862_tag_rcv(struct sk_buff *skb,
 
 	if (unlikely(!pskb_may_pull(skb, MXL862_HEADER_LEN))) {
 		dev_warn_ratelimited(&dev->dev, "Cannot pull SKB, packet dropped\n");
+		kfree_skb(skb);
 		return NULL;
 	}
 
@@ -73,6 +74,7 @@ static struct sk_buff *mxl862_tag_rcv(struct sk_buff *skb,
 		dev_warn_ratelimited(&dev->dev,
 				     "Invalid special tag marker, packet dropped, tag: %8ph\n",
 				     mxl862_tag);
+		kfree_skb(skb);
 		return NULL;
 	}
 
@@ -83,6 +85,7 @@ static struct sk_buff *mxl862_tag_rcv(struct sk_buff *skb,
 		dev_warn_ratelimited(&dev->dev,
 				     "Invalid source port, packet dropped, tag: %8ph\n",
 				     mxl862_tag);
+		kfree_skb(skb);
 		return NULL;
 	}
 
diff --git a/net/dsa/tag_netc.c b/net/dsa/tag_netc.c
index ccedfe3a80b6..df72a61796ad 100644
--- a/net/dsa/tag_netc.c
+++ b/net/dsa/tag_netc.c
@@ -131,14 +131,13 @@ static struct sk_buff *netc_rcv(struct sk_buff *skb,
 	int type, subtype;
 
 	if (unlikely(!pskb_may_pull(skb, NETC_TAG_MAX_LEN)))
-		return NULL;
+		goto err_free_skb;
 
 	tag_cmn = dsa_etype_header_pos_rx(skb);
 	if (ntohs(tag_cmn->tpid) != ETH_P_NXP_NETC) {
 		dev_warn_ratelimited(&ndev->dev, "Unknown TPID 0x%04x\n",
 				     ntohs(tag_cmn->tpid));
-
-		return NULL;
+		goto err_free_skb;
 	}
 
 	if (tag_cmn->qos & NETC_TAG_QV)
@@ -149,14 +148,13 @@ static struct sk_buff *netc_rcv(struct sk_buff *skb,
 	if (!sw_id) {
 		dev_warn_ratelimited(&ndev->dev,
 				     "VEPA switch ID is not supported yet\n");
-
-		return NULL;
+		goto err_free_skb;
 	}
 
 	port = FIELD_GET(NETC_TAG_PORT, tag_cmn->switch_port);
 	skb->dev = dsa_conduit_find_user(ndev, sw_id, port);
 	if (!skb->dev)
-		return NULL;
+		goto err_free_skb;
 
 	type = FIELD_GET(NETC_TAG_TYPE, tag_cmn->type);
 	subtype = FIELD_GET(NETC_TAG_SUBTYPE, tag_cmn->type);
@@ -165,11 +163,11 @@ static struct sk_buff *netc_rcv(struct sk_buff *skb,
 	} else if (type == NETC_TAG_TO_HOST) {
 		/* Currently only subtype0 supported */
 		if (subtype != NETC_TAG_TH_SUBTYPE0)
-			return NULL;
+			goto err_free_skb;
 	} else {
 		dev_warn_ratelimited(&ndev->dev,
 				     "Unexpected  tag type %d\n", type);
-		return NULL;
+		goto err_free_skb;
 	}
 
 	/* Remove Switch tag from the frame */
@@ -178,6 +176,10 @@ static struct sk_buff *netc_rcv(struct sk_buff *skb,
 	dsa_strip_etype_header(skb, tag_len);
 
 	return skb;
+
+err_free_skb:
+	kfree_skb(skb);
+	return NULL;
 }
 
 static void netc_flow_dissect(const struct sk_buff *skb, __be16 *proto,
diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c
index 3405def79c2d..d208c7322cd6 100644
--- a/net/dsa/tag_ocelot.c
+++ b/net/dsa/tag_ocelot.c
@@ -107,14 +107,16 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb,
 	ocelot_xfh_get_rew_val(extraction, &rew_val);
 
 	skb->dev = dsa_conduit_find_user(netdev, 0, src_port);
-	if (!skb->dev)
+	if (!skb->dev) {
 		/* The switch will reflect back some frames sent through
 		 * sockets opened on the bare DSA conduit. These will come back
 		 * with src_port equal to the index of the CPU port, for which
 		 * there is no user registered. So don't print any error
 		 * message here (ignore and drop those frames).
 		 */
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	dsa_default_offload_fwd_mark(skb);
 	skb->priority = qos_class;
diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c
index e89d9254e90a..f50f1cd83f16 100644
--- a/net/dsa/tag_ocelot_8021q.c
+++ b/net/dsa/tag_ocelot_8021q.c
@@ -33,30 +33,34 @@ static struct sk_buff *ocelot_defer_xmit(struct dsa_port *dp,
 	xmit_work_fn = data->xmit_work_fn;
 	xmit_worker = priv->xmit_worker;
 
-	if (!xmit_work_fn || !xmit_worker)
+	if (!xmit_work_fn || !xmit_worker) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	/* PTP over IP packets need UDP checksumming. We may have inherited
 	 * NETIF_F_HW_CSUM from the DSA conduit, but these packets are not sent
 	 * through the DSA conduit, so calculate the checksum here.
 	 */
-	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
+	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	xmit_work = kzalloc_obj(*xmit_work, GFP_ATOMIC);
-	if (!xmit_work)
+	if (!xmit_work) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	/* Calls felix_port_deferred_xmit in felix.c */
 	kthread_init_work(&xmit_work->work, xmit_work_fn);
-	/* Increase refcount so the kfree_skb in dsa_user_xmit
-	 * won't really free the packet.
-	 */
 	xmit_work->dp = dp;
 	xmit_work->skb = skb_get(skb);
 
 	kthread_queue_work(xmit_worker, &xmit_work->work);
 
+	kfree_skb(skb);
 	return NULL;
 }
 
@@ -84,8 +88,10 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb,
 	dsa_8021q_rcv(skb, &src_port, &switch_id, NULL, NULL);
 
 	skb->dev = dsa_conduit_find_user(netdev, switch_id, src_port);
-	if (!skb->dev)
+	if (!skb->dev) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	dsa_default_offload_fwd_mark(skb);
 
diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c
index 9e3b429e8b36..510792fbfa92 100644
--- a/net/dsa/tag_qca.c
+++ b/net/dsa/tag_qca.c
@@ -46,16 +46,20 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev)
 
 	tagger_data = ds->tagger_data;
 
-	if (unlikely(!pskb_may_pull(skb, QCA_HDR_LEN)))
+	if (unlikely(!pskb_may_pull(skb, QCA_HDR_LEN))) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	phdr = dsa_etype_header_pos_rx(skb);
 	hdr = ntohs(*phdr);
 
 	/* Make sure the version is correct */
 	ver = FIELD_GET(QCA_HDR_RECV_VERSION, hdr);
-	if (unlikely(ver != QCA_HDR_VERSION))
+	if (unlikely(ver != QCA_HDR_VERSION)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	/* Get pk type */
 	pk_type = FIELD_GET(QCA_HDR_RECV_TYPE, hdr);
@@ -64,6 +68,7 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev)
 	if (pk_type == QCA_HDR_RECV_TYPE_RW_REG_ACK) {
 		if (likely(tagger_data->rw_reg_ack_handler))
 			tagger_data->rw_reg_ack_handler(ds, skb);
+		kfree_skb(skb);
 		return NULL;
 	}
 
@@ -71,6 +76,7 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev)
 	if (pk_type == QCA_HDR_RECV_TYPE_MIB) {
 		if (likely(tagger_data->mib_autocast_handler))
 			tagger_data->mib_autocast_handler(ds, skb);
+		kfree_skb(skb);
 		return NULL;
 	}
 
@@ -78,8 +84,10 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev)
 	port = FIELD_GET(QCA_HDR_RECV_SOURCE_PORT, hdr);
 
 	skb->dev = dsa_conduit_find_user(dev, 0, port);
-	if (!skb->dev)
+	if (!skb->dev) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	/* Remove QCA tag and recalculate checksum */
 	skb_pull_rcsum(skb, QCA_HDR_LEN);
diff --git a/net/dsa/tag_rtl4_a.c b/net/dsa/tag_rtl4_a.c
index 3cc63eacfa03..590ea3b921c9 100644
--- a/net/dsa/tag_rtl4_a.c
+++ b/net/dsa/tag_rtl4_a.c
@@ -41,7 +41,7 @@ static struct sk_buff *rtl4a_tag_xmit(struct sk_buff *skb,
 	u16 out;
 
 	/* Pad out to at least 60 bytes */
-	if (unlikely(__skb_put_padto(skb, ETH_ZLEN, false)))
+	if (unlikely(eth_skb_pad(skb)))
 		return NULL;
 
 	netdev_dbg(dev, "add realtek tag to package to port %d\n",
@@ -75,8 +75,10 @@ static struct sk_buff *rtl4a_tag_rcv(struct sk_buff *skb,
 	u8 prot;
 	u8 port;
 
-	if (unlikely(!pskb_may_pull(skb, RTL4_A_HDR_LEN)))
+	if (unlikely(!pskb_may_pull(skb, RTL4_A_HDR_LEN))) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	tag = dsa_etype_header_pos_rx(skb);
 	p = (__be16 *)tag;
@@ -92,6 +94,7 @@ static struct sk_buff *rtl4a_tag_rcv(struct sk_buff *skb,
 	prot = (protport >> RTL4_A_PROTOCOL_SHIFT) & 0x0f;
 	if (prot != RTL4_A_PROTOCOL_RTL8366RB) {
 		netdev_err(dev, "unknown realtek protocol 0x%01x\n", prot);
+		kfree_skb(skb);
 		return NULL;
 	}
 	port = protport & 0xff;
@@ -99,6 +102,7 @@ static struct sk_buff *rtl4a_tag_rcv(struct sk_buff *skb,
 	skb->dev = dsa_conduit_find_user(dev, 0, port);
 	if (!skb->dev) {
 		netdev_dbg(dev, "could not find user for port %d\n", port);
+		kfree_skb(skb);
 		return NULL;
 	}
 
diff --git a/net/dsa/tag_rtl8_4.c b/net/dsa/tag_rtl8_4.c
index 852c6b88079a..4da3beebef75 100644
--- a/net/dsa/tag_rtl8_4.c
+++ b/net/dsa/tag_rtl8_4.c
@@ -143,8 +143,10 @@ static struct sk_buff *rtl8_4t_tag_xmit(struct sk_buff *skb,
 	/* Calculate the checksum here if not done yet as trailing tags will
 	 * break either software or hardware based checksum
 	 */
-	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
+	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	rtl8_4_write_tag(skb, dev, skb_put(skb, RTL8_4_TAG_LEN));
 
@@ -201,11 +203,15 @@ static int rtl8_4_read_tag(struct sk_buff *skb, struct net_device *dev,
 static struct sk_buff *rtl8_4_tag_rcv(struct sk_buff *skb,
 				      struct net_device *dev)
 {
-	if (unlikely(!pskb_may_pull(skb, RTL8_4_TAG_LEN)))
+	if (unlikely(!pskb_may_pull(skb, RTL8_4_TAG_LEN))) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
-	if (unlikely(rtl8_4_read_tag(skb, dev, dsa_etype_header_pos_rx(skb))))
+	if (unlikely(rtl8_4_read_tag(skb, dev, dsa_etype_header_pos_rx(skb)))) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	/* Remove tag and recalculate checksum */
 	skb_pull_rcsum(skb, RTL8_4_TAG_LEN);
@@ -218,14 +224,20 @@ static struct sk_buff *rtl8_4_tag_rcv(struct sk_buff *skb,
 static struct sk_buff *rtl8_4t_tag_rcv(struct sk_buff *skb,
 				       struct net_device *dev)
 {
-	if (skb_linearize(skb))
+	if (skb_linearize(skb)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
-	if (unlikely(rtl8_4_read_tag(skb, dev, skb_tail_pointer(skb) - RTL8_4_TAG_LEN)))
+	if (unlikely(rtl8_4_read_tag(skb, dev, skb_tail_pointer(skb) - RTL8_4_TAG_LEN))) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
-	if (pskb_trim_rcsum(skb, skb->len - RTL8_4_TAG_LEN))
+	if (pskb_trim_rcsum(skb, skb->len - RTL8_4_TAG_LEN)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	return skb;
 }
diff --git a/net/dsa/tag_rzn1_a5psw.c b/net/dsa/tag_rzn1_a5psw.c
index 10994b3470f6..734910156dc3 100644
--- a/net/dsa/tag_rzn1_a5psw.c
+++ b/net/dsa/tag_rzn1_a5psw.c
@@ -48,7 +48,7 @@ static struct sk_buff *a5psw_tag_xmit(struct sk_buff *skb, struct net_device *de
 	 * least 60 bytes otherwise they will be discarded when they enter the
 	 * switch port logic.
 	 */
-	if (__skb_put_padto(skb, ETH_ZLEN, false))
+	if (eth_skb_pad(skb))
 		return NULL;
 
 	/* provide 'A5PSW_TAG_LEN' bytes additional space */
@@ -77,6 +77,7 @@ static struct sk_buff *a5psw_tag_rcv(struct sk_buff *skb,
 	if (unlikely(!pskb_may_pull(skb, A5PSW_TAG_LEN))) {
 		dev_warn_ratelimited(&dev->dev,
 				     "Dropping packet, cannot pull\n");
+		kfree_skb(skb);
 		return NULL;
 	}
 
@@ -84,14 +85,17 @@ static struct sk_buff *a5psw_tag_rcv(struct sk_buff *skb,
 
 	if (tag->ctrl_tag != htons(ETH_P_DSA_A5PSW)) {
 		dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid TAG marker\n");
+		kfree_skb(skb);
 		return NULL;
 	}
 
 	port = FIELD_GET(A5PSW_CTRL_DATA_PORT, ntohs(tag->ctrl_data));
 
 	skb->dev = dsa_conduit_find_user(dev, 0, port);
-	if (!skb->dev)
+	if (!skb->dev) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	skb_pull_rcsum(skb, A5PSW_TAG_LEN);
 	dsa_strip_etype_header(skb, A5PSW_TAG_LEN);
diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c
index de6d4ce8668b..bfe1f746f55b 100644
--- a/net/dsa/tag_sja1105.c
+++ b/net/dsa/tag_sja1105.c
@@ -149,19 +149,20 @@ static struct sk_buff *sja1105_defer_xmit(struct dsa_port *dp,
 	xmit_work_fn = tagger_data->xmit_work_fn;
 	xmit_worker = priv->xmit_worker;
 
-	if (!xmit_work_fn || !xmit_worker)
+	if (!xmit_work_fn || !xmit_worker) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	xmit_work = kzalloc_obj(*xmit_work, GFP_ATOMIC);
-	if (!xmit_work)
+	if (!xmit_work) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	kthread_init_work(&xmit_work->work, xmit_work_fn);
-	/* Increase refcount so the kfree_skb in dsa_user_xmit
-	 * won't really free the packet.
-	 */
 	xmit_work->dp = dp;
-	xmit_work->skb = skb_get(skb);
+	xmit_work->skb = skb;
 
 	kthread_queue_work(xmit_worker, &xmit_work->work);
 
@@ -401,10 +402,7 @@ static struct sk_buff
 			kfree_skb(priv->stampable_skb);
 		}
 
-		/* Hold a reference to avoid dsa_switch_rcv
-		 * from freeing the skb.
-		 */
-		priv->stampable_skb = skb_get(skb);
+		priv->stampable_skb = skb;
 		spin_unlock(&priv->meta_lock);
 
 		/* Tell DSA we got nothing */
@@ -436,6 +434,7 @@ static struct sk_buff
 			dev_err_ratelimited(ds->dev,
 					    "Unexpected meta frame\n");
 			spin_unlock(&priv->meta_lock);
+			kfree_skb(skb);
 			return NULL;
 		}
 
@@ -443,6 +442,7 @@ static struct sk_buff
 			dev_err_ratelimited(ds->dev,
 					    "Meta frame on wrong port\n");
 			spin_unlock(&priv->meta_lock);
+			kfree_skb(skb);
 			return NULL;
 		}
 
@@ -501,18 +501,21 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
 	/* Normal data plane traffic and link-local frames are tagged with
 	 * a tag_8021q VLAN which we have to strip
 	 */
-	if (sja1105_skb_has_tag_8021q(skb))
+	if (sja1105_skb_has_tag_8021q(skb)) {
 		dsa_8021q_rcv(skb, &source_port, &switch_id, &vbid, &vid);
-	else if (source_port == -1 && switch_id == -1)
+	} else if (source_port == -1 && switch_id == -1) {
 		/* Packets with no source information have no chance of
 		 * getting accepted, drop them straight away.
 		 */
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	skb->dev = dsa_tag_8021q_find_user(netdev, source_port, switch_id,
 					   vid, vbid);
 	if (!skb->dev) {
 		netdev_warn(netdev, "Couldn't decode source port\n");
+		kfree_skb(skb);
 		return NULL;
 	}
 
@@ -539,12 +542,15 @@ static struct sk_buff *sja1110_rcv_meta(struct sk_buff *skb, u16 rx_header)
 	if (!ds) {
 		net_err_ratelimited("%s: cannot find switch id %d\n",
 				    conduit->name, switch_id);
+		kfree_skb(skb);
 		return NULL;
 	}
 
 	tagger_data = sja1105_tagger_data(ds);
-	if (!tagger_data->meta_tstamp_handler)
+	if (!tagger_data->meta_tstamp_handler) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	for (i = 0; i <= n_ts; i++) {
 		u8 ts_id, source_port, dir;
@@ -562,6 +568,7 @@ static struct sk_buff *sja1110_rcv_meta(struct sk_buff *skb, u16 rx_header)
 	}
 
 	/* Discard the meta frame, we've consumed the timestamps it contained */
+	kfree_skb(skb);
 	return NULL;
 }
 
@@ -572,8 +579,10 @@ static struct sk_buff *sja1110_rcv_inband_control_extension(struct sk_buff *skb,
 {
 	u16 rx_header;
 
-	if (unlikely(!pskb_may_pull(skb, SJA1110_HEADER_LEN)))
+	if (unlikely(!pskb_may_pull(skb, SJA1110_HEADER_LEN))) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	/* skb->data points to skb_mac_header(skb) + ETH_HLEN, which is exactly
 	 * what we need because the caller has checked the EtherType (which is
@@ -609,8 +618,10 @@ static struct sk_buff *sja1110_rcv_inband_control_extension(struct sk_buff *skb,
 		 * padding and trailer we need to account for the fact that
 		 * skb->data points to skb_mac_header(skb) + ETH_HLEN.
 		 */
-		if (pskb_trim_rcsum(skb, start_of_padding - ETH_HLEN))
+		if (pskb_trim_rcsum(skb, start_of_padding - ETH_HLEN)) {
+			kfree_skb(skb);
 			return NULL;
+		}
 	/* Trap-to-host frame, no timestamp trailer */
 	} else {
 		*source_port = SJA1110_RX_HEADER_SRC_PORT(rx_header);
@@ -653,6 +664,7 @@ static struct sk_buff *sja1110_rcv(struct sk_buff *skb,
 
 	if (!skb->dev) {
 		netdev_warn(netdev, "Couldn't decode source port\n");
+		kfree_skb(skb);
 		return NULL;
 	}
 
diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c
index 4dce24cfe6a7..49c802c10ca6 100644
--- a/net/dsa/tag_trailer.c
+++ b/net/dsa/tag_trailer.c
@@ -30,22 +30,30 @@ static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev)
 	u8 *trailer;
 	int source_port;
 
-	if (skb_linearize(skb))
+	if (skb_linearize(skb)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	trailer = skb_tail_pointer(skb) - 4;
 	if (trailer[0] != 0x80 || (trailer[1] & 0xf8) != 0x00 ||
-	    (trailer[2] & 0xef) != 0x00 || trailer[3] != 0x00)
+	    (trailer[2] & 0xef) != 0x00 || trailer[3] != 0x00) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	source_port = trailer[1] & 7;
 
 	skb->dev = dsa_conduit_find_user(dev, 0, source_port);
-	if (!skb->dev)
+	if (!skb->dev) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
-	if (pskb_trim_rcsum(skb, skb->len - 4))
+	if (pskb_trim_rcsum(skb, skb->len - 4)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	return skb;
 }
diff --git a/net/dsa/tag_vsc73xx_8021q.c b/net/dsa/tag_vsc73xx_8021q.c
index af121a9aff7f..f4736a1a7a0f 100644
--- a/net/dsa/tag_vsc73xx_8021q.c
+++ b/net/dsa/tag_vsc73xx_8021q.c
@@ -44,6 +44,7 @@ vsc73xx_rcv(struct sk_buff *skb, struct net_device *netdev)
 	if (!skb->dev) {
 		dev_warn_ratelimited(&netdev->dev,
 				     "Couldn't decode source port\n");
+		kfree_skb(skb);
 		return NULL;
 	}
 
diff --git a/net/dsa/tag_xrs700x.c b/net/dsa/tag_xrs700x.c
index a05219f702c6..bb268020ee86 100644
--- a/net/dsa/tag_xrs700x.c
+++ b/net/dsa/tag_xrs700x.c
@@ -30,15 +30,21 @@ static struct sk_buff *xrs700x_rcv(struct sk_buff *skb, struct net_device *dev)
 
 	source_port = ffs((int)trailer[0]) - 1;
 
-	if (source_port < 0)
+	if (source_port < 0) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	skb->dev = dsa_conduit_find_user(dev, 0, source_port);
-	if (!skb->dev)
+	if (!skb->dev) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
-	if (pskb_trim_rcsum(skb, skb->len - 1))
+	if (pskb_trim_rcsum(skb, skb->len - 1)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	/* Frame is forwarded by hardware, don't forward in software. */
 	dsa_default_offload_fwd_mark(skb);
diff --git a/net/dsa/tag_yt921x.c b/net/dsa/tag_yt921x.c
index f3ced99b1c85..294784ab6694 100644
--- a/net/dsa/tag_yt921x.c
+++ b/net/dsa/tag_yt921x.c
@@ -87,8 +87,10 @@ yt921x_tag_rcv(struct sk_buff *skb, struct net_device *netdev)
 	__be16 *tag;
 	u16 rx;
 
-	if (unlikely(!pskb_may_pull(skb, YT921X_TAG_LEN)))
+	if (unlikely(!pskb_may_pull(skb, YT921X_TAG_LEN))) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	tag = dsa_etype_header_pos_rx(skb);
 
@@ -96,6 +98,7 @@ yt921x_tag_rcv(struct sk_buff *skb, struct net_device *netdev)
 		dev_warn_ratelimited(&netdev->dev,
 				     "Unexpected EtherType 0x%04x\n",
 				     ntohs(tag[0]));
+		kfree_skb(skb);
 		return NULL;
 	}
 
@@ -104,6 +107,7 @@ yt921x_tag_rcv(struct sk_buff *skb, struct net_device *netdev)
 	if (unlikely((rx & YT921X_TAG_PORT_EN) == 0)) {
 		dev_warn_ratelimited(&netdev->dev,
 				     "Unexpected rx tag 0x%04x\n", rx);
+		kfree_skb(skb);
 		return NULL;
 	}
 
@@ -112,6 +116,7 @@ yt921x_tag_rcv(struct sk_buff *skb, struct net_device *netdev)
 	if (unlikely(!skb->dev)) {
 		dev_warn_ratelimited(&netdev->dev,
 				     "Couldn't decode source port %u\n", port);
+		kfree_skb(skb);
 		return NULL;
 	}
 
diff --git a/net/dsa/user.c b/net/dsa/user.c
index 8704c1a3a5b7..072fa76972cc 100644
--- a/net/dsa/user.c
+++ b/net/dsa/user.c
@@ -935,13 +935,12 @@ static netdev_tx_t dsa_user_xmit(struct sk_buff *skb, struct net_device *dev)
 		eth_skb_pad(skb);
 
 	/* Transmit function may have to reallocate the original SKB,
-	 * in which case it must have freed it. Only free it here on error.
+	 * in which case it must have freed it. Taggers will drop the
+	 * passed skb on error.
 	 */
 	nskb = p->xmit(skb, dev);
-	if (!nskb) {
-		kfree_skb(skb);
+	if (!nskb)
 		return NETDEV_TX_OK;
-	}
 
 	return dsa_enqueue_skb(nskb, dev);
 }

---
base-commit: f34c6b3a3c3d98f34918e1d2ea846a5acccac6d1
change-id: 20260616-dsa-fix-free-skb-bb028ce90802

Best regards,
--  
Linus Walleij <linusw@kernel.org>


^ permalink raw reply related

* [PATCH] [net] eth: mlx5: fix macsec dependency
From: Arnd Bergmann @ 2026-06-22 12:41 UTC (permalink / raw)
  To: Saeed Mahameed, Leon Romanovsky, Tariq Toukan, Mark Bloch,
	Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Sabrina Dubroca
  Cc: Arnd Bergmann, Daniel Zahka, Rahul Rameshbabu, Raed Salem, netdev,
	linux-rdma, linux-kernel

From: Arnd Bergmann <arnd@arndb.de>

Configurations with mlx5 built-in but macsec=m fail to link:

x86_64-linux-ld: drivers/infiniband/hw/mlx5/macsec.o: in function `mlx5r_add_gid_macsec_operations':
macsec.c:(.text+0x77d): undefined reference to `macsec_netdev_is_offloaded'
x86_64-linux-ld: drivers/infiniband/hw/mlx5/macsec.o: in function `mlx5r_del_gid_macsec_operations':
macsec.c:(.text+0xe81): undefined reference to `macsec_netdev_is_offloaded'

Fix the dependency so this configuration cannot happen.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
First seen on next-20260615, but probably an old bug.

I could not figure out what caused this error to appear; I have not seen
this combination in many years of randconfig builds that look like they
had the bug. My best guess is that there are so many other dependencies
that this combination is simply very unlikely.
---
 drivers/net/ethernet/mellanox/mlx5/core/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
index 9cf394c66939..ba944763a737 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
@@ -144,7 +144,7 @@ config MLX5_CORE_IPOIB
 config MLX5_MACSEC
 	bool "Connect-X support for MACSec offload"
 	depends on MLX5_CORE_EN
-	depends on MACSEC
+	depends on MACSEC=y || MACSEC=MLX5_CORE
 	default n
 	help
 	  Build support for MACsec cryptography-offload acceleration in the NIC.
-- 
2.39.5


^ permalink raw reply related

* Re: [PATCH bpf-next v8 6/7] selftests/bpf: add bpf_icmp_send kfunc tc tests
From: bot+bpf-ci @ 2026-06-22 12:41 UTC (permalink / raw)
  To: mahe.tardy, bpf
  Cc: andrii, ast, daniel, edumazet, john.fastabend, jordan, kuba,
	martin.lau, netdev, netfilter-devel, pabeni, yonghong.song,
	mahe.tardy, ast, andrii, daniel, martin.lau, eddyz87,
	yonghong.song, clm, ihor.solodrai
In-Reply-To: <20260622120515.137082-7-mahe.tardy@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 1622 bytes --]

> diff --git a/tools/testing/selftests/bpf/prog_tests/icmp_send_kfunc.c b/tools/testing/selftests/bpf/prog_tests/icmp_send_kfunc.c
> index a5ac1a6ea77a..66447681f72d 100644
> --- a/tools/testing/selftests/bpf/prog_tests/icmp_send_kfunc.c
> +++ b/tools/testing/selftests/bpf/prog_tests/icmp_send_kfunc.c
> @@ -178,3 +178,28 @@ void test_icmp_send_unreach_cgroup(void)
>  	if (cgroup_fd >= 0)
>  		close(cgroup_fd);
>  }
> +
> +void test_icmp_send_unreach_tc(void)
> +{
> +	LIBBPF_OPTS(bpf_tcx_opts, opts);
> +	struct icmp_send *skel;
> +	struct bpf_link *link = NULL;
> +
> +	skel = icmp_send__open_and_load();
> +	if (!ASSERT_OK_PTR(skel, "skel_open"))
> +		goto cleanup;
> +
> +	link = bpf_program__attach_tcx(skel->progs.tc_egress, 1, &opts);

This isn't a bug, but would a named constant read more clearly here than
the bare ifindex literal 1? Looking at prog_tests/tc_change_tail.c, it
defines LO_IFINDEX for this same purpose, and prog_tests/tc_links.c
resolves the loopback interface via if_nametoindex("lo").

> +	if (!ASSERT_OK_PTR(link, "prog_attach"))
> +		goto cleanup;
> +
> +	if (test__start_subtest("ipv4"))
> +		run_icmp_test(skel, AF_INET, "127.0.0.1", NR_ICMP_UNREACH);
> +
> +	if (test__start_subtest("ipv6"))
> +		run_icmp_test(skel, AF_INET6, "::1", ICMPV6_REJECT_ROUTE);
> +
> +cleanup:
> +	bpf_link__destroy(link);
> +	icmp_send__destroy(skel);
> +}

[ ... ]


---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/27951892371

^ permalink raw reply

* Re: [PATCH bpf-next v8 4/7] selftests/bpf: add bpf_icmp_send kfunc cgroup_skb tests
From: bot+bpf-ci @ 2026-06-22 12:41 UTC (permalink / raw)
  To: mahe.tardy, bpf
  Cc: andrii, ast, daniel, edumazet, john.fastabend, jordan, kuba,
	martin.lau, netdev, netfilter-devel, pabeni, yonghong.song,
	mahe.tardy, ast, andrii, daniel, martin.lau, eddyz87,
	yonghong.song, clm, ihor.solodrai
In-Reply-To: <20260622120515.137082-5-mahe.tardy@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 1280 bytes --]

> diff --git a/tools/testing/selftests/bpf/prog_tests/icmp_send_kfunc.c b/tools/testing/selftests/bpf/prog_tests/icmp_send_kfunc.c
> new file mode 100644
> index 000000000000..f4e5b883d4c8
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/prog_tests/icmp_send_kfunc.c

[ ... ]

> +static void read_icmp_errqueue(int sockfd, int expected_code)
> +{

[ ... ]

> +	for (; cm; cm = CMSG_NXTHDR(&msg, cm)) {
> +		if (cm->cmsg_level != IPPROTO_IP || cm->cmsg_type != IP_RECVERR)
> +			continue;

[ ... ]

> +	}
> +
> +	ASSERT_FAIL("no IP_RECVERR/IPV6_RECVERR control message found");

This isn't a bug, but the failure string names both IP_RECVERR and
IPV6_RECVERR:

	ASSERT_FAIL("no IP_RECVERR/IPV6_RECVERR control message found");

while the loop above only matches IPv4:

	if (cm->cmsg_level != IPPROTO_IP || cm->cmsg_type != IP_RECVERR)
		continue;

and the caller is AF_INET only (start_server(AF_INET, ...) with a
struct sockaddr_in).

Should the IPV6_RECVERR part of the string be dropped to match what the
code actually inspects?

> +}

---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/27951892371

^ permalink raw reply

* [PATCH v29 3/5] cxl/sfc: Initialize dpa without a mailbox
From: alejandro.lucero-palau @ 2026-06-22 12:40 UTC (permalink / raw)
  To: linux-cxl, netdev, dan.j.williams, edward.cree, davem, kuba,
	pabeni, edumazet, dave.jiang
  Cc: Alejandro Lucero, Dan Williams, Ben Cheatham, Jonathan Cameron,
	Edward Cree
In-Reply-To: <20260622124010.2192888-1-alejandro.lucero-palau@amd.com>

From: Alejandro Lucero <alucerop@amd.com>

Type3 relies on the mailbox CXL_MBOX_OP_IDENTIFY command to initialize
the memdev state params which end up being used for DPA initialization.

Allow a Type2 driver to initialize DPA simply by giving the size of its
volatile hardware partition.

Move related functions to memdev.

Add sfc driver as the client.

Signed-off-by: Alejandro Lucero <alucerop@amd.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Ben Cheatham <benjamin.cheatham@amd.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: Edward Cree <ecree.xilinx@gmail.com>
---
 drivers/cxl/core/core.h            |  2 +
 drivers/cxl/core/mbox.c            | 51 +----------------------
 drivers/cxl/core/memdev.c          | 67 ++++++++++++++++++++++++++++++
 drivers/net/ethernet/sfc/efx_cxl.c |  5 +++
 include/cxl/cxl.h                  |  2 +
 5 files changed, 77 insertions(+), 50 deletions(-)

diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
index 07555ae63859..f7cebb026552 100644
--- a/drivers/cxl/core/core.h
+++ b/drivers/cxl/core/core.h
@@ -101,6 +101,8 @@ void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr,
 struct dentry *cxl_debugfs_create_dir(const char *dir);
 int cxl_dpa_set_part(struct cxl_endpoint_decoder *cxled,
 		     enum cxl_partition_mode mode);
+struct cxl_memdev_state;
+int cxl_mem_get_partition_info(struct cxl_memdev_state *mds);
 int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, u64 size);
 int cxl_dpa_free(struct cxl_endpoint_decoder *cxled);
 resource_size_t cxl_dpa_size(struct cxl_endpoint_decoder *cxled);
diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 7c6c5b7450a5..97b1e61ad018 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -1152,7 +1152,7 @@ EXPORT_SYMBOL_NS_GPL(cxl_mem_get_event_records, "CXL");
  *
  * See CXL @8.2.9.5.2.1 Get Partition Info
  */
-static int cxl_mem_get_partition_info(struct cxl_memdev_state *mds)
+int cxl_mem_get_partition_info(struct cxl_memdev_state *mds)
 {
 	struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
 	struct cxl_mbox_get_partition_info pi;
@@ -1308,55 +1308,6 @@ int cxl_mem_sanitize(struct cxl_memdev *cxlmd, u16 cmd)
 	return -EBUSY;
 }
 
-static void add_part(struct cxl_dpa_info *info, u64 start, u64 size, enum cxl_partition_mode mode)
-{
-	int i = info->nr_partitions;
-
-	if (size == 0)
-		return;
-
-	info->part[i].range = (struct range) {
-		.start = start,
-		.end = start + size - 1,
-	};
-	info->part[i].mode = mode;
-	info->nr_partitions++;
-}
-
-int cxl_mem_dpa_fetch(struct cxl_memdev_state *mds, struct cxl_dpa_info *info)
-{
-	struct cxl_dev_state *cxlds = &mds->cxlds;
-	struct device *dev = cxlds->dev;
-	int rc;
-
-	if (!cxlds->media_ready) {
-		info->size = 0;
-		return 0;
-	}
-
-	info->size = mds->total_bytes;
-
-	if (mds->partition_align_bytes == 0) {
-		add_part(info, 0, mds->volatile_only_bytes, CXL_PARTMODE_RAM);
-		add_part(info, mds->volatile_only_bytes,
-			 mds->persistent_only_bytes, CXL_PARTMODE_PMEM);
-		return 0;
-	}
-
-	rc = cxl_mem_get_partition_info(mds);
-	if (rc) {
-		dev_err(dev, "Failed to query partition information\n");
-		return rc;
-	}
-
-	add_part(info, 0, mds->active_volatile_bytes, CXL_PARTMODE_RAM);
-	add_part(info, mds->active_volatile_bytes, mds->active_persistent_bytes,
-		 CXL_PARTMODE_PMEM);
-
-	return 0;
-}
-EXPORT_SYMBOL_NS_GPL(cxl_mem_dpa_fetch, "CXL");
-
 int cxl_get_dirty_count(struct cxl_memdev_state *mds, u32 *count)
 {
 	struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
index 33a3d2e7b13a..2e457b1ebc7d 100644
--- a/drivers/cxl/core/memdev.c
+++ b/drivers/cxl/core/memdev.c
@@ -594,6 +594,73 @@ bool is_cxl_memdev(const struct device *dev)
 }
 EXPORT_SYMBOL_NS_GPL(is_cxl_memdev, "CXL");
 
+static void add_part(struct cxl_dpa_info *info, u64 start, u64 size, enum cxl_partition_mode mode)
+{
+	int i = info->nr_partitions;
+
+	if (size == 0)
+		return;
+
+	info->part[i].range = (struct range) {
+		.start = start,
+		.end = start + size - 1,
+	};
+	info->part[i].mode = mode;
+	info->nr_partitions++;
+}
+
+int cxl_mem_dpa_fetch(struct cxl_memdev_state *mds, struct cxl_dpa_info *info)
+{
+	struct cxl_dev_state *cxlds = &mds->cxlds;
+	struct device *dev = cxlds->dev;
+	int rc;
+
+	if (!cxlds->media_ready) {
+		info->size = 0;
+		return 0;
+	}
+
+	info->size = mds->total_bytes;
+
+	if (mds->partition_align_bytes == 0) {
+		add_part(info, 0, mds->volatile_only_bytes, CXL_PARTMODE_RAM);
+		add_part(info, mds->volatile_only_bytes,
+			 mds->persistent_only_bytes, CXL_PARTMODE_PMEM);
+		return 0;
+	}
+
+	rc = cxl_mem_get_partition_info(mds);
+	if (rc) {
+		dev_err(dev, "Failed to query partition information\n");
+		return rc;
+	}
+
+	add_part(info, 0, mds->active_volatile_bytes, CXL_PARTMODE_RAM);
+	add_part(info, mds->active_volatile_bytes, mds->active_persistent_bytes,
+		 CXL_PARTMODE_PMEM);
+
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_mem_dpa_fetch, "CXL");
+
+
+/**
+ * cxl_set_capacity: initialize dpa by a driver without a mailbox.
+ *
+ * @cxlds: pointer to cxl_dev_state
+ * @capacity: device volatile memory size
+ */
+int cxl_set_capacity(struct cxl_dev_state *cxlds, u64 capacity)
+{
+	struct cxl_dpa_info range_info = {
+		.size = capacity,
+	};
+
+	add_part(&range_info, 0, capacity, CXL_PARTMODE_RAM);
+	return cxl_dpa_setup(cxlds, &range_info);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_set_capacity, "CXL");
+
 /**
  * set_exclusive_cxl_commands() - atomically disable user cxl commands
  * @mds: The device state to operate on
diff --git a/drivers/net/ethernet/sfc/efx_cxl.c b/drivers/net/ethernet/sfc/efx_cxl.c
index 704b0ebae937..18b535b3ea40 100644
--- a/drivers/net/ethernet/sfc/efx_cxl.c
+++ b/drivers/net/ethernet/sfc/efx_cxl.c
@@ -68,6 +68,11 @@ int efx_cxl_init(struct efx_probe_data *probe_data)
 	 */
 	cxl->cxlds.media_ready = true;
 
+	if (cxl_set_capacity(&cxl->cxlds, EFX_CTPIO_BUFFER_SIZE)) {
+		pci_err(pci_dev, "dpa capacity setup failed\n");
+		return -ENODEV;
+	}
+
 	probe_data->cxl = cxl;
 
 	return 0;
diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h
index 016c74fb747c..802b143de83d 100644
--- a/include/cxl/cxl.h
+++ b/include/cxl/cxl.h
@@ -226,4 +226,6 @@ struct cxl_dev_state *_devm_cxl_dev_state_create(struct device *dev,
 
 struct cxl_memdev *devm_cxl_probe_mem(struct cxl_dev_state *cxlds,
 				      struct range *range);
+
+int cxl_set_capacity(struct cxl_dev_state *cxlds, u64 capacity);
 #endif /* __CXL_CXL_H__ */
-- 
2.34.1


^ permalink raw reply related

* [PATCH v29 2/5] cxl/sfc: Map cxl regs
From: alejandro.lucero-palau @ 2026-06-22 12:40 UTC (permalink / raw)
  To: linux-cxl, netdev, dan.j.williams, edward.cree, davem, kuba,
	pabeni, edumazet, dave.jiang
  Cc: Alejandro Lucero, Dan Williams, Jonathan Cameron, Ben Cheatham,
	Edward Cree
In-Reply-To: <20260622124010.2192888-1-alejandro.lucero-palau@amd.com>

From: Alejandro Lucero <alucerop@amd.com>

Export cxl core functions so that a Type2 driver is able to discover and
map the device registers.

Use them in the sfc driver cxl initialization.

Signed-off-by: Alejandro Lucero <alucerop@amd.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Ben Cheatham <benjamin.cheatham@amd.com>
Acked-by: Edward Cree <ecree.xilinx@gmail.com>
---
 drivers/cxl/core/pci.c             |  1 +
 drivers/cxl/core/port.c            |  1 +
 drivers/cxl/core/regs.c            |  1 +
 drivers/cxl/cxlpci.h               | 12 ------------
 drivers/cxl/pci.c                  |  1 +
 drivers/net/ethernet/sfc/efx_cxl.c | 26 ++++++++++++++++++++++++++
 include/cxl/pci.h                  | 22 ++++++++++++++++++++++
 7 files changed, 52 insertions(+), 12 deletions(-)
 create mode 100644 include/cxl/pci.h

diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index d1f487b3d809..2bcd683aa286 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -6,6 +6,7 @@
 #include <linux/delay.h>
 #include <linux/pci.h>
 #include <linux/pci-doe.h>
+#include <cxl/pci.h>
 #include <linux/aer.h>
 #include <cxlpci.h>
 #include <cxlmem.h>
diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
index 1215ee4f4035..cb633e19151b 100644
--- a/drivers/cxl/core/port.c
+++ b/drivers/cxl/core/port.c
@@ -11,6 +11,7 @@
 #include <linux/idr.h>
 #include <linux/node.h>
 #include <cxl/einj.h>
+#include <cxl/pci.h>
 #include <cxlmem.h>
 #include <cxlpci.h>
 #include <cxl.h>
diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c
index 93710cf4f0a6..20c2d9fbcfe7 100644
--- a/drivers/cxl/core/regs.c
+++ b/drivers/cxl/core/regs.c
@@ -4,6 +4,7 @@
 #include <linux/device.h>
 #include <linux/slab.h>
 #include <linux/pci.h>
+#include <cxl/pci.h>
 #include <cxlmem.h>
 #include <cxlpci.h>
 #include <pmu.h>
diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
index b826eb53cf7b..110ec9c44f09 100644
--- a/drivers/cxl/cxlpci.h
+++ b/drivers/cxl/cxlpci.h
@@ -13,16 +13,6 @@
  */
 #define CXL_PCI_DEFAULT_MAX_VECTORS 16
 
-/* Register Block Identifier (RBI) */
-enum cxl_regloc_type {
-	CXL_REGLOC_RBI_EMPTY = 0,
-	CXL_REGLOC_RBI_COMPONENT,
-	CXL_REGLOC_RBI_VIRT,
-	CXL_REGLOC_RBI_MEMDEV,
-	CXL_REGLOC_RBI_PMU,
-	CXL_REGLOC_RBI_TYPES
-};
-
 /*
  * Table Access DOE, CDAT Read Entry Response
  *
@@ -112,6 +102,4 @@ static inline void devm_cxl_port_ras_setup(struct cxl_port *port)
 }
 #endif
 
-int cxl_pci_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type,
-		       struct cxl_register_map *map);
 #endif /* __CXL_PCI_H__ */
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 267c679b0b3c..bb892dbfdd6d 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -11,6 +11,7 @@
 #include <linux/pci.h>
 #include <linux/aer.h>
 #include <linux/io.h>
+#include <cxl/pci.h>
 #include <cxl/mailbox.h>
 #include "cxlmem.h"
 #include "cxlpci.h"
diff --git a/drivers/net/ethernet/sfc/efx_cxl.c b/drivers/net/ethernet/sfc/efx_cxl.c
index be252af972ab..704b0ebae937 100644
--- a/drivers/net/ethernet/sfc/efx_cxl.c
+++ b/drivers/net/ethernet/sfc/efx_cxl.c
@@ -7,6 +7,8 @@
 
 #include <linux/pci.h>
 
+#include <cxl/cxl.h>
+#include <cxl/pci.h>
 #include "net_driver.h"
 #include "efx_cxl.h"
 
@@ -18,6 +20,7 @@ int efx_cxl_init(struct efx_probe_data *probe_data)
 	struct pci_dev *pci_dev = efx->pci_dev;
 	struct efx_cxl *cxl;
 	u16 dvsec;
+	int rc;
 
 	/* Is the device configured with and using CXL? */
 	if (!pcie_is_cxl(pci_dev))
@@ -42,6 +45,29 @@ int efx_cxl_init(struct efx_probe_data *probe_data)
 	if (!cxl)
 		return -ENOMEM;
 
+	rc = cxl_pci_setup_regs(pci_dev, CXL_REGLOC_RBI_COMPONENT,
+				&cxl->cxlds.reg_map);
+	if (rc) {
+		pci_err(pci_dev, "No component registers\n");
+		return rc;
+	}
+
+	if (!cxl->cxlds.reg_map.component_map.hdm_decoder.valid) {
+		pci_err(pci_dev, "Expected HDM component register not found\n");
+		return -ENODEV;
+	}
+
+	if (!cxl->cxlds.reg_map.component_map.ras.valid) {
+		pci_err(pci_dev, "Expected RAS component register not found\n");
+		return -ENODEV;
+	}
+
+	/* Set media ready explicitly as there are neither mailbox for checking
+	 * this state nor the CXL register involved, both not mandatory for
+	 * type2.
+	 */
+	cxl->cxlds.media_ready = true;
+
 	probe_data->cxl = cxl;
 
 	return 0;
diff --git a/include/cxl/pci.h b/include/cxl/pci.h
new file mode 100644
index 000000000000..3e0000015871
--- /dev/null
+++ b/include/cxl/pci.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright(c) 2020 Intel Corporation. All rights reserved. */
+
+#ifndef __CXL_CXL_PCI_H__
+#define __CXL_CXL_PCI_H__
+
+/* Register Block Identifier (RBI) */
+enum cxl_regloc_type {
+	CXL_REGLOC_RBI_EMPTY = 0,
+	CXL_REGLOC_RBI_COMPONENT,
+	CXL_REGLOC_RBI_VIRT,
+	CXL_REGLOC_RBI_MEMDEV,
+	CXL_REGLOC_RBI_PMU,
+	CXL_REGLOC_RBI_TYPES
+};
+
+struct cxl_register_map;
+struct pci_dev;
+
+int cxl_pci_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type,
+		       struct cxl_register_map *map);
+#endif
-- 
2.34.1


^ permalink raw reply related

* [PATCH v29 4/5] sfc: obtain and map cxl range using devm_cxl_probe_mem
From: alejandro.lucero-palau @ 2026-06-22 12:40 UTC (permalink / raw)
  To: linux-cxl, netdev, dan.j.williams, edward.cree, davem, kuba,
	pabeni, edumazet, dave.jiang
  Cc: Alejandro Lucero, Edward Cree
In-Reply-To: <20260622124010.2192888-1-alejandro.lucero-palau@amd.com>

From: Alejandro Lucero <alucerop@amd.com>

Use the core API to safely obtain the CXL range linked to an HDM committed
by the BIOS. Map that range for use as the ctpio buffer.

A potential user space action, through sysfs unbinding or removal of the
core cxl modules, will trigger sfc driver device detachment. That case does
not race with this mapping, as the mapping is done during driver probe and
is therefore protected by the device lock against those user space actions.

Signed-off-by: Alejandro Lucero <alucerop@amd.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Acked-by: Edward Cree <ecree.xilinx@gmail.com>
---
 drivers/net/ethernet/sfc/efx.c     |  2 ++
 drivers/net/ethernet/sfc/efx_cxl.c | 23 +++++++++++++++++++++++
 drivers/net/ethernet/sfc/efx_cxl.h |  3 +++
 3 files changed, 28 insertions(+)

diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 61cbb6cfc360..3806cd3dd7f4 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -984,6 +984,7 @@ static void efx_pci_remove(struct pci_dev *pci_dev)
 	efx_fini_io(efx);
 
 	probe_data = container_of(efx, struct efx_probe_data, efx);
+	efx_cxl_exit(probe_data);
 
 	pci_dbg(efx->pci_dev, "shutdown successful\n");
 
@@ -1242,6 +1243,7 @@ static int efx_pci_probe(struct pci_dev *pci_dev,
 	return 0;
 
  fail3:
+	efx_cxl_exit(probe_data);
 	efx_fini_io(efx);
  fail2:
 	efx_fini_struct(efx);
diff --git a/drivers/net/ethernet/sfc/efx_cxl.c b/drivers/net/ethernet/sfc/efx_cxl.c
index 18b535b3ea40..3e7c950f83e9 100644
--- a/drivers/net/ethernet/sfc/efx_cxl.c
+++ b/drivers/net/ethernet/sfc/efx_cxl.c
@@ -18,6 +18,7 @@ int efx_cxl_init(struct efx_probe_data *probe_data)
 {
 	struct efx_nic *efx = &probe_data->efx;
 	struct pci_dev *pci_dev = efx->pci_dev;
+	struct range cxl_pio_range;
 	struct efx_cxl *cxl;
 	u16 dvsec;
 	int rc;
@@ -73,9 +74,31 @@ int efx_cxl_init(struct efx_probe_data *probe_data)
 		return -ENODEV;
 	}
 
+	cxl->cxlmd = devm_cxl_probe_mem(&cxl->cxlds, &cxl_pio_range);
+	if (IS_ERR(cxl->cxlmd)) {
+		pci_err(pci_dev, "CXL accel memdev creation failed\n");
+		return PTR_ERR(cxl->cxlmd);
+	}
+
+	cxl->ctpio_cxl = ioremap_wc(cxl_pio_range.start,
+				    range_len(&cxl_pio_range));
+	if (!cxl->ctpio_cxl) {
+		pci_err(pci_dev, "CXL ioremap region (%pra) failed\n",
+			&cxl_pio_range);
+		return -ENOMEM;
+	}
+
 	probe_data->cxl = cxl;
 
 	return 0;
 }
 
+void efx_cxl_exit(struct efx_probe_data *probe_data)
+{
+	if (!probe_data->cxl)
+		return;
+
+	iounmap(probe_data->cxl->ctpio_cxl);
+}
+
 MODULE_IMPORT_NS("CXL");
diff --git a/drivers/net/ethernet/sfc/efx_cxl.h b/drivers/net/ethernet/sfc/efx_cxl.h
index 04e46278464d..3e2705cb063f 100644
--- a/drivers/net/ethernet/sfc/efx_cxl.h
+++ b/drivers/net/ethernet/sfc/efx_cxl.h
@@ -20,10 +20,13 @@ struct efx_probe_data;
 struct efx_cxl {
 	struct cxl_dev_state cxlds;
 	struct cxl_memdev *cxlmd;
+	void __iomem *ctpio_cxl;
 };
 
 int efx_cxl_init(struct efx_probe_data *probe_data);
+void efx_cxl_exit(struct efx_probe_data *probe_data);
 #else
 static inline int efx_cxl_init(struct efx_probe_data *probe_data) { return 0; }
+static inline void efx_cxl_exit(struct efx_probe_data *probe_data) {}
 #endif
 #endif
-- 
2.34.1


^ permalink raw reply related

* [PATCH v29 5/5] sfc: support pio mapping based on cxl
From: alejandro.lucero-palau @ 2026-06-22 12:40 UTC (permalink / raw)
  To: linux-cxl, netdev, dan.j.williams, edward.cree, davem, kuba,
	pabeni, edumazet, dave.jiang
  Cc: Alejandro Lucero, Edward Cree
In-Reply-To: <20260622124010.2192888-1-alejandro.lucero-palau@amd.com>

From: Alejandro Lucero <alucerop@amd.com>

A PIO buffer is a region of device memory to which the driver can write a
packet for TX, with the device handling the transmit doorbell without
requiring a DMA to get the packet data, which helps reduce latency
in certain exchanges. With the CXL mem protocol this latency can be lowered
further.

When a device supports CXL and has been successfully initialised, use the
cxl region to map the memory range and use this mapping for PIO buffers.

Signed-off-by: Alejandro Lucero <alucerop@amd.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Acked-by: Edward Cree <ecree.xilinx@gmail.com>
---
 drivers/net/ethernet/sfc/ef10.c       | 41 ++++++++++++++++++++++-----
 drivers/net/ethernet/sfc/efx_cxl.c    |  1 +
 drivers/net/ethernet/sfc/net_driver.h |  2 ++
 drivers/net/ethernet/sfc/nic.h        |  3 ++
 4 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c
index 7e04f115bbaa..73bc064929f6 100644
--- a/drivers/net/ethernet/sfc/ef10.c
+++ b/drivers/net/ethernet/sfc/ef10.c
@@ -24,6 +24,7 @@
 #include <linux/wait.h>
 #include <linux/workqueue.h>
 #include <net/udp_tunnel.h>
+#include "efx_cxl.h"
 
 /* Hardware control for EF10 architecture including 'Huntington'. */
 
@@ -106,7 +107,7 @@ static int efx_ef10_get_vf_index(struct efx_nic *efx)
 
 static int efx_ef10_init_datapath_caps(struct efx_nic *efx)
 {
-	MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_CAPABILITIES_V4_OUT_LEN);
+	MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_CAPABILITIES_V7_OUT_LEN);
 	struct efx_ef10_nic_data *nic_data = efx->nic_data;
 	size_t outlen;
 	int rc;
@@ -177,6 +178,12 @@ static int efx_ef10_init_datapath_caps(struct efx_nic *efx)
 			  efx->num_mac_stats);
 	}
 
+	if (outlen < MC_CMD_GET_CAPABILITIES_V7_OUT_LEN)
+		nic_data->datapath_caps3 = 0;
+	else
+		nic_data->datapath_caps3 = MCDI_DWORD(outbuf,
+						      GET_CAPABILITIES_V7_OUT_FLAGS3);
+
 	return 0;
 }
 
@@ -1140,6 +1147,9 @@ static int efx_ef10_dimension_resources(struct efx_nic *efx)
 	unsigned int channel_vis, pio_write_vi_base, max_vis;
 	struct efx_ef10_nic_data *nic_data = efx->nic_data;
 	unsigned int uc_mem_map_size, wc_mem_map_size;
+#ifdef CONFIG_SFC_CXL
+	struct efx_probe_data *probe_data;
+#endif
 	void __iomem *membase;
 	int rc;
 
@@ -1263,8 +1273,23 @@ static int efx_ef10_dimension_resources(struct efx_nic *efx)
 	iounmap(efx->membase);
 	efx->membase = membase;
 
-	/* Set up the WC mapping if needed */
-	if (wc_mem_map_size) {
+	if (!wc_mem_map_size)
+		goto skip_pio;
+
+	/* Set up the WC mapping */
+
+#ifdef CONFIG_SFC_CXL
+	probe_data = container_of(efx, struct efx_probe_data, efx);
+	if ((nic_data->datapath_caps3 &
+	    (1 << MC_CMD_GET_CAPABILITIES_V7_OUT_CXL_CONFIG_ENABLE_LBN)) &&
+	    probe_data->cxl_pio_initialised) {
+		/* Using PIO through CXL mapping */
+		nic_data->pio_write_base = probe_data->cxl->ctpio_cxl;
+		nic_data->pio_write_vi_base = pio_write_vi_base;
+	} else
+#endif
+	{
+		/* Using legacy PIO BAR mapping */
 		nic_data->wc_membase = ioremap_wc(efx->membase_phys +
 						  uc_mem_map_size,
 						  wc_mem_map_size);
@@ -1279,12 +1304,14 @@ static int efx_ef10_dimension_resources(struct efx_nic *efx)
 			nic_data->wc_membase +
 			(pio_write_vi_base * efx->vi_stride + ER_DZ_TX_PIOBUF -
 			 uc_mem_map_size);
-
-		rc = efx_ef10_link_piobufs(efx);
-		if (rc)
-			efx_ef10_free_piobufs(efx);
 	}
 
+	rc = efx_ef10_link_piobufs(efx);
+	if (rc)
+		efx_ef10_free_piobufs(efx);
+
+skip_pio:
+
 	netif_dbg(efx, probe, efx->net_dev,
 		  "memory BAR at %pa (virtual %p+%x UC, %p+%x WC)\n",
 		  &efx->membase_phys, efx->membase, uc_mem_map_size,
diff --git a/drivers/net/ethernet/sfc/efx_cxl.c b/drivers/net/ethernet/sfc/efx_cxl.c
index 3e7c950f83e9..348d7404cd7a 100644
--- a/drivers/net/ethernet/sfc/efx_cxl.c
+++ b/drivers/net/ethernet/sfc/efx_cxl.c
@@ -88,6 +88,7 @@ int efx_cxl_init(struct efx_probe_data *probe_data)
 		return -ENOMEM;
 	}
 
+	probe_data->cxl_pio_initialised = true;
 	probe_data->cxl = cxl;
 
 	return 0;
diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index 563e6a6e85f1..3964b2c56609 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -1206,12 +1206,14 @@ struct efx_cxl;
  * @pci_dev: The PCI device
  * @efx: Efx NIC details
  * @cxl: details of related cxl objects
+ * @cxl_pio_initialised: cxl initialization outcome.
  */
 struct efx_probe_data {
 	struct pci_dev *pci_dev;
 	struct efx_nic efx;
 #ifdef CONFIG_SFC_CXL
 	struct efx_cxl *cxl;
+	bool cxl_pio_initialised;
 #endif
 };
 
diff --git a/drivers/net/ethernet/sfc/nic.h b/drivers/net/ethernet/sfc/nic.h
index ec3b2df43b68..7480f9995dfb 100644
--- a/drivers/net/ethernet/sfc/nic.h
+++ b/drivers/net/ethernet/sfc/nic.h
@@ -152,6 +152,8 @@ enum {
  *	%MC_CMD_GET_CAPABILITIES response)
  * @datapath_caps2: Further Capabilities of datapath firmware (FLAGS2 field of
  * %MC_CMD_GET_CAPABILITIES response)
+ * @datapath_caps3: Further Capabilities of datapath firmware (FLAGS3 field of
+ * %MC_CMD_GET_CAPABILITIES response)
  * @rx_dpcpu_fw_id: Firmware ID of the RxDPCPU
  * @tx_dpcpu_fw_id: Firmware ID of the TxDPCPU
  * @must_probe_vswitching: Flag: vswitching has yet to be setup after MC reboot
@@ -187,6 +189,7 @@ struct efx_ef10_nic_data {
 	bool must_check_datapath_caps;
 	u32 datapath_caps;
 	u32 datapath_caps2;
+	u32 datapath_caps3;
 	unsigned int rx_dpcpu_fw_id;
 	unsigned int tx_dpcpu_fw_id;
 	bool must_probe_vswitching;
-- 
2.34.1


^ permalink raw reply related

* [PATCH v29 1/5] sfc: add cxl support
From: alejandro.lucero-palau @ 2026-06-22 12:40 UTC (permalink / raw)
  To: linux-cxl, netdev, dan.j.williams, edward.cree, davem, kuba,
	pabeni, edumazet, dave.jiang
  Cc: Alejandro Lucero, Jonathan Cameron, Edward Cree, Alison Schofield,
	Dan Williams
In-Reply-To: <20260622124010.2192888-1-alejandro.lucero-palau@amd.com>

From: Alejandro Lucero <alucerop@amd.com>

Add CXL initialization based on new CXL API for accel drivers and make
it dependent on kernel CXL configuration.

Signed-off-by: Alejandro Lucero <alucerop@amd.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: Edward Cree <ecree.xilinx@gmail.com>
Reviewed-by: Alison Schofield <alison.schofield@intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/net/ethernet/sfc/Kconfig      |  9 +++++
 drivers/net/ethernet/sfc/Makefile     |  1 +
 drivers/net/ethernet/sfc/efx.c        | 16 ++++++++-
 drivers/net/ethernet/sfc/efx_cxl.c    | 50 +++++++++++++++++++++++++++
 drivers/net/ethernet/sfc/efx_cxl.h    | 29 ++++++++++++++++
 drivers/net/ethernet/sfc/net_driver.h |  8 +++++
 6 files changed, 112 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/sfc/efx_cxl.c
 create mode 100644 drivers/net/ethernet/sfc/efx_cxl.h

diff --git a/drivers/net/ethernet/sfc/Kconfig b/drivers/net/ethernet/sfc/Kconfig
index c4c43434f314..979f2801e2a8 100644
--- a/drivers/net/ethernet/sfc/Kconfig
+++ b/drivers/net/ethernet/sfc/Kconfig
@@ -66,6 +66,15 @@ config SFC_MCDI_LOGGING
 	  Driver-Interface) commands and responses, allowing debugging of
 	  driver/firmware interaction.  The tracing is actually enabled by
 	  a sysfs file 'mcdi_logging' under the PCI device.
+config SFC_CXL
+	bool "Solarflare SFC9100-family CXL support"
+	depends on SFC && CXL_BUS >= SFC
+	default SFC
+	help
+	  This enables SFC CXL support if the kernel is configuring CXL for
+	  using CTPIO with CXL.mem. The SFC device with CXL support and
+	  with a CXL-aware firmware can be used for minimizing latencies
+	  when sending through CTPIO.
 
 source "drivers/net/ethernet/sfc/falcon/Kconfig"
 source "drivers/net/ethernet/sfc/siena/Kconfig"
diff --git a/drivers/net/ethernet/sfc/Makefile b/drivers/net/ethernet/sfc/Makefile
index d99039ec468d..bb0f1891cde6 100644
--- a/drivers/net/ethernet/sfc/Makefile
+++ b/drivers/net/ethernet/sfc/Makefile
@@ -13,6 +13,7 @@ sfc-$(CONFIG_SFC_SRIOV)	+= sriov.o ef10_sriov.o ef100_sriov.o ef100_rep.o \
                            mae.o tc.o tc_bindings.o tc_counters.o \
                            tc_encap_actions.o tc_conntrack.o
 
+sfc-$(CONFIG_SFC_CXL)	+= efx_cxl.o
 obj-$(CONFIG_SFC)	+= sfc.o
 
 obj-$(CONFIG_SFC_FALCON) += falcon/
diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 8f136a11d396..61cbb6cfc360 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -34,6 +34,7 @@
 #include "selftest.h"
 #include "sriov.h"
 #include "efx_devlink.h"
+#include "efx_cxl.h"
 
 #include "mcdi_port_common.h"
 #include "mcdi_pcol.h"
@@ -981,12 +982,14 @@ static void efx_pci_remove(struct pci_dev *pci_dev)
 	efx_pci_remove_main(efx);
 
 	efx_fini_io(efx);
+
+	probe_data = container_of(efx, struct efx_probe_data, efx);
+
 	pci_dbg(efx->pci_dev, "shutdown successful\n");
 
 	efx_fini_devlink_and_unlock(efx);
 	efx_fini_struct(efx);
 	free_netdev(efx->net_dev);
-	probe_data = container_of(efx, struct efx_probe_data, efx);
 	kfree(probe_data);
 };
 
@@ -1190,6 +1193,17 @@ static int efx_pci_probe(struct pci_dev *pci_dev,
 	if (rc)
 		goto fail2;
 
+	/* A successful cxl initialization implies a CXL region created to be
+	 * used for PIO buffers. If there is no CXL support legacy PIO buffers
+	 * defined at specific PCI BAR regions will be used. If there is CXL
+	 * support and the cxl initialization fails, the driver probe fails.
+	 */
+	rc = efx_cxl_init(probe_data);
+	if (rc) {
+		pci_err(pci_dev, "CXL initialization failed with error %d\n", rc);
+		goto fail3;
+	}
+
 	rc = efx_pci_probe_post_io(efx);
 	if (rc) {
 		/* On failure, retry once immediately.
diff --git a/drivers/net/ethernet/sfc/efx_cxl.c b/drivers/net/ethernet/sfc/efx_cxl.c
new file mode 100644
index 000000000000..be252af972ab
--- /dev/null
+++ b/drivers/net/ethernet/sfc/efx_cxl.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/****************************************************************************
+ *
+ * Driver for AMD network controllers and boards
+ * Copyright (C) 2025, Advanced Micro Devices, Inc.
+ */
+
+#include <linux/pci.h>
+
+#include "net_driver.h"
+#include "efx_cxl.h"
+
+#define EFX_CTPIO_BUFFER_SIZE	SZ_256M
+
+int efx_cxl_init(struct efx_probe_data *probe_data)
+{
+	struct efx_nic *efx = &probe_data->efx;
+	struct pci_dev *pci_dev = efx->pci_dev;
+	struct efx_cxl *cxl;
+	u16 dvsec;
+
+	/* Is the device configured with and using CXL? */
+	if (!pcie_is_cxl(pci_dev))
+		return 0;
+
+	dvsec = pci_find_dvsec_capability(pci_dev, PCI_VENDOR_ID_CXL,
+					  PCI_DVSEC_CXL_DEVICE);
+	if (!dvsec) {
+		pci_info(pci_dev, "CXL_DVSEC_PCIE_DEVICE capability not found\n");
+		return 0;
+	}
+
+	pci_dbg(pci_dev, "CXL_DVSEC_PCIE_DEVICE capability found\n");
+
+	/* Create a cxl_dev_state embedded in the cxl struct using cxl core api
+	 * specifying no mbox available.
+	 */
+	cxl = devm_cxl_dev_state_create(&pci_dev->dev, CXL_DEVTYPE_DEVMEM,
+					pci_get_dsn(pci_dev), dvsec,
+					struct efx_cxl, cxlds, false);
+
+	if (!cxl)
+		return -ENOMEM;
+
+	probe_data->cxl = cxl;
+
+	return 0;
+}
+
+MODULE_IMPORT_NS("CXL");
diff --git a/drivers/net/ethernet/sfc/efx_cxl.h b/drivers/net/ethernet/sfc/efx_cxl.h
new file mode 100644
index 000000000000..04e46278464d
--- /dev/null
+++ b/drivers/net/ethernet/sfc/efx_cxl.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/****************************************************************************
+ * Driver for AMD network controllers and boards
+ * Copyright (C) 2025, Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ */
+
+#ifndef EFX_CXL_H
+#define EFX_CXL_H
+
+#ifdef CONFIG_SFC_CXL
+
+#include <cxl/cxl.h>
+
+struct efx_probe_data;
+
+struct efx_cxl {
+	struct cxl_dev_state cxlds;
+	struct cxl_memdev *cxlmd;
+};
+
+int efx_cxl_init(struct efx_probe_data *probe_data);
+#else
+static inline int efx_cxl_init(struct efx_probe_data *probe_data) { return 0; }
+#endif
+#endif
diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index b98c259f672d..563e6a6e85f1 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -1197,14 +1197,22 @@ struct efx_nic {
 	atomic_t n_rx_noskb_drops;
 };
 
+#ifdef CONFIG_SFC_CXL
+struct efx_cxl;
+#endif
+
 /**
  * struct efx_probe_data - State after hardware probe
  * @pci_dev: The PCI device
  * @efx: Efx NIC details
+ * @cxl: details of related cxl objects
  */
 struct efx_probe_data {
 	struct pci_dev *pci_dev;
 	struct efx_nic efx;
+#ifdef CONFIG_SFC_CXL
+	struct efx_cxl *cxl;
+#endif
 };
 
 static inline struct efx_nic *efx_netdev_priv(struct net_device *dev)
-- 
2.34.1


^ permalink raw reply related

* [PATCH v29 0/5] Type2 device basic support
From: alejandro.lucero-palau @ 2026-06-22 12:40 UTC (permalink / raw)
  To: linux-cxl, netdev, dan.j.williams, edward.cree, davem, kuba,
	pabeni, edumazet, dave.jiang
  Cc: Alejandro Lucero

From: Alejandro Lucero <alucerop@amd.com>

This series adds the last bits for allowing a CXL Type2 driver to obtain
a CXL region linked to the device HDM decoders committed by the BIOS,
with the driver being the sfc network driver.

Changes from v28:

 - patch 1: 
	fix doc (Ed Cree)
	fix error path (Sashiko)

 - patch 3:
	removing extra + char (sashiko)

 - patch 5:
	remove stray change (Ed Cree)

Changes from v27:

 - patch 1: make driver probe failing if error in efx_cxl_init (Dan)
 - patch 4: add unmapping if error after efx_cxl_init (Dave)
 - patch 4/5: move cxl_pio_initialised from patch 4 to patch 5 (Dave)

Tested in the cxl_for_7.3 branch.

Alejandro Lucero (5):
  sfc: add cxl support
  cxl/sfc: Map cxl regs
  cxl/sfc: Initialize dpa without a mailbox
  sfc: obtain and map cxl range using devm_cxl_probe_mem
  sfc: support pio mapping based on cxl

 drivers/cxl/core/core.h               |   2 +
 drivers/cxl/core/mbox.c               |  51 +------------
 drivers/cxl/core/memdev.c             |  67 ++++++++++++++++
 drivers/cxl/core/pci.c                |   1 +
 drivers/cxl/core/port.c               |   1 +
 drivers/cxl/core/regs.c               |   1 +
 drivers/cxl/cxlpci.h                  |  12 ---
 drivers/cxl/pci.c                     |   1 +
 drivers/net/ethernet/sfc/Kconfig      |   9 +++
 drivers/net/ethernet/sfc/Makefile     |   1 +
 drivers/net/ethernet/sfc/ef10.c       |  41 ++++++++--
 drivers/net/ethernet/sfc/efx.c        |  18 ++++-
 drivers/net/ethernet/sfc/efx_cxl.c    | 105 ++++++++++++++++++++++++++
 drivers/net/ethernet/sfc/efx_cxl.h    |  32 ++++++++
 drivers/net/ethernet/sfc/net_driver.h |  10 +++
 drivers/net/ethernet/sfc/nic.h        |   3 +
 include/cxl/cxl.h                     |   2 +
 include/cxl/pci.h                     |  22 ++++++
 18 files changed, 309 insertions(+), 70 deletions(-)
 create mode 100644 drivers/net/ethernet/sfc/efx_cxl.c
 create mode 100644 drivers/net/ethernet/sfc/efx_cxl.h
 create mode 100644 include/cxl/pci.h


base-commit: 9b1e70e8f9ec4b5c6ce7fa774a0023bb6894c686
-- 
2.34.1


^ permalink raw reply

* Re: [PATCH net-next v3] virtio-net: xsk: support tx wake up
From: Menglong Dong @ 2026-06-22 12:38 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: xuanzhuo, Menglong Dong, eperezma, mst, jasowang, andrew+netdev,
	davem, edumazet, pabeni, netdev, virtualization, linux-kernel
In-Reply-To: <20260621150610.0ad5d02e@kernel.org>

On 2026/6/22 06:06 Jakub Kicinski <kuba@kernel.org> wrote:
> On Tue, 16 Jun 2026 19:59:12 +0800 Menglong Dong wrote:
> > For now, XDP_RING_NEED_WAKEUP is not supported properly by the virtio-net
> > in the tx path for example: we set xsk_set_tx_need_wakeup() in
> > virtnet_xsk_xmit(), but we didn't call xsk_clear_tx_need_wakeup()
> > anywhere, which means the user will call send() for every packet.
> > 
> > We call xsk_set_tx_need_wakeup() after virtnet_xsk_xmit_batch() if sq->vq
> > is empty, as we can't be wakeup by the skb_xmit_done() in this case.
> > Otherwise, we will clear the wakeup flag.
> > 
> > Race condition is considered for tx path.
> 
> Seems to follow what mlx5 does so presumably this is fine but IDK if

Yeah, I followed the logic of mlx5. It's amazing that you found it :)

> there's anything virtio-specific that we need to be worried about.
> 
> Xuan Zhuo, please TAL?
> -- 
> mping: VIRTIO NET DRIVER
> 
> 





^ permalink raw reply

* Re: [PATCH net v2] amt: don't read the IP source address from a reallocated skb header
From: Michael Bommarito @ 2026-06-22 12:37 UTC (permalink / raw)
  To: Taehee Yoo
  Cc: Jakub Kicinski, David S . Miller, Paolo Abeni, Eric Dumazet,
	Andrew Lunn, netdev, linux-kernel
In-Reply-To: <CAMArcTWH4a_O+V8aJ6QvnLT1_vWxeC8yF8LuphKt_oFH6nBkbw@mail.gmail.com>

On Mon, Jun 22, 2026 at 4:58 AM Taehee Yoo <ap420073@gmail.com> wrote:
> > Let's fix them all with one patch?
>
> Agreed.
> Michael, could you please fix the remaining ones Sashiko flagged?

Sure, will do

Thanks,
Mike

^ permalink raw reply

* Re: [PATCH] net: ixp4xx_hss: fix duplicate HDLC netdev allocation
From: Linus Walleij @ 2026-06-22 12:36 UTC (permalink / raw)
  To: Haoxiang Li
  Cc: kaloz, andrew+netdev, davem, edumazet, kuba, pabeni,
	huangguangbin2, lipeng321, linux-arm-kernel, netdev, linux-kernel,
	stable
In-Reply-To: <20260622043015.643637-1-haoxiang_li2024@163.com>

On Mon, Jun 22, 2026 at 6:30 AM Haoxiang Li <haoxiang_li2024@163.com> wrote:

> ixp4xx_hss_probe() allocates two HDLC netdevs. The first one is stored
> in ndev, initialized, and registered with register_hdlc_device(). The
> second one is stored in port->netdev and later used by the remove path
> for unregister_hdlc_device() and free_netdev().
>
> This means that the registered netdev is not the same object that is
> unregistered and freed on remove. It also leaks the first allocation if
> the second alloc_hdlcdev() call fails, and the first allocation is not
> checked before ndev is used.
>
> Older code allocated the HDLC netdev only once and stored the same object
> in both the local variable and port->netdev. The buggy conversion split
> this into two alloc_hdlcdev() calls. A later rename changed the local
> variable name to ndev, but the underlying mismatch remained.
>
> Fix this by allocating the HDLC netdev only once and assigning the same
> object to port->netdev.
>
> Fixes: 99ebe65eb9c0 ("net: ixp4xx_hss: move out assignment in if condition")
> Cc: stable@vger.kernel.org
> Signed-off-by: Haoxiang Li <haoxiang_li2024@163.com>

Reviewed-by: Linus Walleij <linusw@kernel.org>

Yours,
Linus Walleij

^ permalink raw reply

* Re: [patch V2 18/25] timekeeping: Prepare for cross timestamps on arbitrary clock IDs
From: David Woodhouse @ 2026-06-22 12:34 UTC (permalink / raw)
  To: Thomas Gleixner, LKML
  Cc: Miroslav Lichvar, John Stultz, Stephen Boyd, Anna-Maria Behnsen,
	Frederic Weisbecker, thomas.weissschuh, Arthur Kiyanovski,
	Rodolfo Giometti, Vincent Donnefort, Marc Zyngier, Oliver Upton,
	kvmarm, Oliver Upton, Richard Cochran, netdev, Takashi Iwai,
	Miri Korenblit, Johannes Berg, Jacob Keller, Tony Nguyen,
	Saeed Mahameed, Peter Hilber, Michael S. Tsirkin, virtualization,
	linux-wireless, linux-sound, Vadim Fedorenko
In-Reply-To: <87se6eltod.ffs@fw13>

[-- Attachment #1: Type: text/plain, Size: 861 bytes --]

On Mon, 2026-06-22 at 13:07 +0200, Thomas Gleixner wrote:
> On Mon, Jun 22 2026 at 09:55, David Woodhouse wrote:
> > We ended up with ktime_get_snapshot_id() also supporting CLOCK_BOOTTIME
> > and CLOCK_MONOTONIC_RAW, but not get_device_system_crosststamp().
> > Should we make that consistent?
> 
> Maybe. The BOOTTIME support is only there for that ARM64 hyper trace muck,
> but has no other relevance.
> 
> MONORAW is there for the PTP EXTENDED IOCTL, but with PRECISE the
> snapshot already contains the raw value and you'd have to prevent the
> historical adjustment part for RAW. So I don't see the actual value, but
> I don't have a strong opinion either.

Yeah, I'm not sure I see the need for it; it's just the consistency
thing that slightly bothered me once I had them both in my sights doing
the snapshot_ntp_error() thing in both.

[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5069 bytes --]

^ permalink raw reply

* Re: [Kernel Bug] INFO: task hung in xt_find_table
From: Longxing Li @ 2026-06-22 12:33 UTC (permalink / raw)
  To: Jiayuan Chen
  Cc: Pablo Neira Ayuso, syzkaller, edumazet, kuba, pabeni, horms,
	netfilter-devel, coreteam, netdev, linux-kernel
In-Reply-To: <d26c8934-6d4c-4171-9e6f-f58a249dd9ff@linux.dev>

Hi Jiayuan,
Thanks for explaining the situation. I will double check this problem.

Best regards,
Longxing Li

Jiayuan Chen <jiayuan.chen@linux.dev> 于2026年6月10日周三 17:26写道：
>
>
> On 6/10/26 3:14 PM, Longxing Li wrote:
> > sorry for not containing report plain text in last email. the report
> > is as follows:
> >
> > INFO: task syz-executor.4:42949 blocked for more than 143 seconds.
> >        Not tainted 7.0.6 #1
> > "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > task:syz-executor.4  state:D stack:26456 pid:42949 tgid:42937
> > ppid:9759   task_flags:0x400140 flags:0x00080002
> > Call Trace:
> >   <TASK>
> >   context_switch kernel/sched/core.c:5298 [inline]
> >   __schedule+0x1006/0x5f00 kernel/sched/core.c:6911
> >   __schedule_loop kernel/sched/core.c:6993 [inline]
> >   schedule+0xe7/0x3a0 kernel/sched/core.c:7008
> >   schedule_preempt_disabled+0x13/0x30 kernel/sched/core.c:7065
> >   __mutex_lock_common kernel/locking/mutex.c:692 [inline]
> >   __mutex_lock+0xd9e/0x1df0 kernel/locking/mutex.c:776
> >   xt_find_table+0x59/0x1a0 net/netfilter/x_tables.c:1245
> >   ip6t_unregister_table_exit+0x22/0x50 net/ipv6/netfilter/ip6_tables.c:1808
> >   ops_exit_list net/core/net_namespace.c:199 [inline]
> >   ops_undo_list+0x2dd/0xa50 net/core/net_namespace.c:252
> >   setup_net+0x1f3/0x3a0 net/core/net_namespace.c:462
> >   copy_net_ns+0x351/0x7c0 net/core/net_namespace.c:579
> >   create_new_namespaces+0x3f6/0xac0 kernel/nsproxy.c:130
> >   copy_namespaces+0x45c/0x580 kernel/nsproxy.c:195
> >   copy_process+0x30cc/0x76d0 kernel/fork.c:2227
> >   kernel_clone+0xea/0x8f0 kernel/fork.c:2655
> >   __do_sys_clone+0xce/0x120 kernel/fork.c:2796
> >   do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
> >   do_syscall_64+0x11b/0xf80 arch/x86/entry/syscall_64.c:94
> >   entry_SYSCALL_64_after_hwframe+0x77/0x7f
> > RIP: 0033:0x471ecd
> > RSP: 002b:00007f51f163e008 EFLAGS: 00000202 ORIG_RAX: 0000000000000038
> > RAX: ffffffffffffffda RBX: 000000000059bf80 RCX: 0000000000471ecd
> > RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000040080020
> > RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000
> > R10: 0000000000000000 R11: 0000000000000202 R12: 000000000059bf8c
> > R13: 000000000000000b R14: 000000000059bf80 R15: 00007f51f161e000
> >   </TASK>
>
>
>
> This is not a deadlock — there's no lock cycle.
>
> The runner is simply under heavy pressure on all three axes: CPU (zswap
> compression) + memory (direct reclaim) + IO (swap).
>
> The hung task is just a victim. The actual holder is another task that
> took the mutex and then fell into direct reclaim.
>
> Likely stack of the holder:
> get_entries
>    xt_find_table_lock
>    copy_entries_to_user
>      alloc_counters
>         vzalloc  -> direct reclaim
>
> "INFO: task hung" reports of this kind are common on the official
> syzkaller dashboard https://syzkaller.appspot.com/upstream/
>
>

^ permalink raw reply

* Re: [PATCH net v2 2/2] net: airoha: fix netif_set_real_num_tx_queues for sparse QoS channels
From: Simon Horman @ 2026-06-22 12:31 UTC (permalink / raw)
  To: Lorenzo Bianconi
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Wayen Yan, linux-arm-kernel, linux-mediatek, netdev
In-Reply-To: <20260619-airoha-qos-fixes-v2-2-5c43485038f9@kernel.org>

On Fri, Jun 19, 2026 at 01:37:14PM +0200, Lorenzo Bianconi wrote:
> airoha_tc_htb_alloc_leaf_queue() assigns queue IDs based on the channel
> index (opt->qid = AIROHA_NUM_TX_RING + channel), but updates
> real_num_tx_queues with a simple increment (num_tx_queues + 1). When QoS
> channels are allocated sparsely (e.g., channels 0 and 3 without 1 and
> 2), the returned qid can exceed real_num_tx_queues, causing out-of-bounds
> accesses in the networking stack.
> For example, allocating channel 0 then channel 3 results in
> real_num_tx_queues = 34 but qid = 35, which is out of range [0, 34).
> Fix this by computing real_num_tx_queues based on the highest active
> channel index rather than using a simple counter, in both the allocation
> and deletion paths.
> 
> Fixes: ef1ca9271313b ("net: airoha: Add sched HTB offload support")
> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>

Thanks for the update since v1.

Reviewed-by: Simon Horman <horms@kernel.org>

FTR, there is an AI-generated review of this patch on sashiko.dev.
I do not think that should impede the progress of this patch but
you may want to consider it in the context of follow-up.

^ permalink raw reply

* Re: [PATCH net-next v3] virtio-net: xsk: support tx wake up
From: Menglong Dong @ 2026-06-22 12:28 UTC (permalink / raw)
  To: Menglong Dong, Xuan Zhuo
  Cc: mst, jasowang, andrew+netdev, davem, edumazet, kuba, pabeni,
	netdev, virtualization, linux-kernel, eperezma
In-Reply-To: <1782096043.3540094-1-xuanzhuo@linux.alibaba.com>

On 2026/6/22 10:40 Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> On Tue, 16 Jun 2026 19:59:12 +0800, Menglong Dong <menglong8.dong@gmail.com> wrote:
> > For now, XDP_RING_NEED_WAKEUP is not supported properly by the virtio-net
> > in the tx path for example: we set xsk_set_tx_need_wakeup() in
> > virtnet_xsk_xmit(), but we didn't call xsk_clear_tx_need_wakeup()
> > anywhere, which means the user will call send() for every packet.
> >
> > We call xsk_set_tx_need_wakeup() after virtnet_xsk_xmit_batch() if sq->vq
> > is empty, as we can't be wakeup by the skb_xmit_done() in this case.
> > Otherwise, we will clear the wakeup flag.
> >
> > Race condition is considered for tx path.
> >
> > Fixes: 89f86675cb03 ("virtio_net: xsk: tx: support xmit xsk buffer")
> 
> This is not a bug, so we do not need this.
> And you post this to net-next.

Okay, I'll remove this tag in the V4.

> 
> 
> > Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
> > ---
> > v3:
[...]
> > +
> > +	if (need_wakeup && vring_size == sq->vq->num_free)
> > +		xsk_set_tx_need_wakeup(pool);
> 
> You need to comment this.

Ack!

> 
> 
> > +
[...]
> > +
> >  	if (!is_xdp_raw_buffer_queue(vi, sq - vi->sq))
> >  		check_sq_full_and_disable(vi, vi->dev, sq);
> 
> 
> After fixed above comments, you can add:
> 
> Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>

OK! Thanks for the review :)

> 
> Thanks.
> 
> 
> >
> > @@ -1470,9 +1488,6 @@ static bool virtnet_xsk_xmit(struct send_queue *sq, struct xsk_buff_pool *pool,
> >  	u64_stats_add(&sq->stats.xdp_tx,  sent);
> >  	u64_stats_update_end(&sq->stats.syncp);
> >
> > -	if (xsk_uses_need_wakeup(pool))
> > -		xsk_set_tx_need_wakeup(pool);
> > -
> >  	return sent;
> >  }
> >
> > --
> > 2.54.0
> >
> 
> 





^ permalink raw reply

* Re: [PATCH net-next v3] virtio-net: xsk: support tx wake up
From: Menglong Dong @ 2026-06-22 12:27 UTC (permalink / raw)
  To: Menglong Dong, Michael S. Tsirkin
  Cc: xuanzhuo, eperezma, jasowang, andrew+netdev, davem, edumazet,
	kuba, pabeni, netdev, virtualization, linux-kernel
In-Reply-To: <20260621182119-mutt-send-email-mst@kernel.org>

On 2026/6/22 06:31 Michael S. Tsirkin <mst@redhat.com> wrote:
> On Tue, Jun 16, 2026 at 07:59:12PM +0800, Menglong Dong wrote:
[...]
> >  
> > +	vring_size = virtqueue_get_vring_size(sq->vq);
> > +	need_wakeup = xsk_uses_need_wakeup(pool);
> > +
> > +	if (need_wakeup && vring_size == sq->vq->num_free)
> > +		xsk_set_tx_need_wakeup(pool);
> > +
> 
> why are we doing this here?
> the check after virtnet_xsk_xmit_batch not enough?
> I vaguely think it's some kind of race we are closing?
> Pls add a comment to explain.

Hi, Michael. Thanks for your review.

Yeah, it's for a race condition between user space and kernel
space. I added a comment in V2, but it was too confusing, so
I removed it 😢. I'll make it clearer and add it back in V4. The
original comment was:

 * If the sq->vq is empty, and the tx ring is empty, and the user
 * submit an entry to the tx ring after virtnet_xsk_xmit_batch() and
 * before xsk_set_tx_need_wakeup(), we will lose the chance to wake
 * up the tx napi, so we have to set the need_wakeup flag here.

And the logic is like this:

Kernel: tx NAPI is woken up from skb_xmit_done() ->
Kernel: sq->vq and xsk->tx_ring are both empty ->
Kernel: call virtnet_xsk_xmit_batch()

    User: submit an entry to the xsk->tx_ring
    User: check the wakeup flag
    User: wakeup flag is not set, skip send()

Kernel: call xsk_set_tx_need_wakeup(), because sq->vq is empty

If we don't send more data, the data in the xsk->tx_ring will
never be sent.

> 
> >  	sent = virtnet_xsk_xmit_batch(sq, pool, budget, &kicks);
> >  
> > +	if (need_wakeup) {
> > +		if (vring_size == sq->vq->num_free)
> > +			/* we can't wake up by ourself, and it should be done
> > +			 * by the user.
> > +			 */
> > +			xsk_set_tx_need_wakeup(pool);
> > +		else
> > +			/* we can wake up from skb_xmit_done() */
> > +			xsk_clear_tx_need_wakeup(pool);
> 
> But what if we don't have get tx napi so no wakeup in skb_xmit_done?

Sorry that I'm not sure what "get tx napi" means here ;(

There are entries in sq->vq, so skb_xmit_done() will be called after
the entries in the ring are consumed by the HOST, right?
Then, the corresponding sq->napi will be scheduled, as we ensure
that tx napi is always enabled, which means napi->weight is not
zero, in this commit:
1df5116a41a8 ("virtio_net: xsk: prevent disable tx napi")

Right?

Thanks!
Menglong Dong

> 
> 
> > +	}
> > +
> >  	if (!is_xdp_raw_buffer_queue(vi, sq - vi->sq))
> >  		check_sq_full_and_disable(vi, vi->dev, sq);
> >  
> > @@ -1470,9 +1488,6 @@ static bool virtnet_xsk_xmit(struct send_queue *sq, struct xsk_buff_pool *pool,
> >  	u64_stats_add(&sq->stats.xdp_tx,  sent);
> >  	u64_stats_update_end(&sq->stats.syncp);
> >  
> > -	if (xsk_uses_need_wakeup(pool))
> > -		xsk_set_tx_need_wakeup(pool);
> > -
> >  	return sent;
> >  }
> >  
> > -- 
> > 2.54.0
> 
> 
> 

^ permalink raw reply

* Re: [REGRESSION 6.12.90 -> 6.12.94] vsock/virtio: large AF_VSOCK transfers reset under backpressure
From: Stefano Garzarella @ 2026-06-22 12:22 UTC (permalink / raw)
  To: Brien Oberstein; +Cc: netdev, regressions, stable
In-Reply-To: <618701dd023e$063de350$12b9a9f0$@gmail.com>

On Mon, Jun 22, 2026 at 07:55:30AM -0400, Brien Oberstein wrote:
>Hi Stefano,
>
>Thanks, that matches what I'm seeing: large transfers reset mid-stream
>instead of the sender being throttled (reliable above ~1.5 MB, fine below
>~90 KB).
>
>The bind for me: it's not just this mail bridge -- I use AF_VSOCK for a few
>host/guest services, some of which open their own sockets, so the per-socket
>buffer workaround can't cover them all. That leaves pinning 6.12.90 (losing
>the DoS fix and further kernel updates) as the only blanket option.

Okay, but in that case did it work?

>
>A few quick questions:
>
>1. Is a -stable backport of the merging fix likely, and roughly when?

We don't have a fix yet.

>2. Could a smaller interim land in -stable sooner (e.g. more default
>   headroom) without reopening the DoS?

What we've merged so far is the best we can do for now, but anyone who 
wants to help improve the situation is welcome to submit patches.

>3. Will the fix guarantee backpressure for any packet size, or just widen
>   the margin?

It should fix STREAM sockets for any packet size.
SEQPACKET/DGRAM is a bit different since we need to keep boundaries, so 
it will come later if needed.

>
>Happy to test any patch

Thanks, I'll ask you to test.

>I have a solid reproducer and can turn it around
>in a day. I'll also file this as a tracked regression so it's not lost.

Unfortunately, it's always been partially broken, using more memory than 
specified, so I don't know if this is actually a full regression, but I 
understand.

Thanks,
Stefano


^ permalink raw reply

* Re: [PATCH net v2 7/7] ipv6: reset position for force_forwarding sysctl restart
From: Fernando Fernandez Mancera @ 2026-06-22 12:19 UTC (permalink / raw)
  To: Ido Schimmel
  Cc: netdev, nicolas.dichtel, stephen, brian.haley, horms, pabeni,
	kuba, edumazet, davem, dsahern
In-Reply-To: <20260622114223.GA233619@shredder>

On 6/22/26 1:42 PM, Ido Schimmel wrote:
> On Sat, Jun 20, 2026 at 06:18:50PM +0200, Fernando Fernandez Mancera wrote:
>> When handling proxy_ndp, if rtnl_net_trylock() fails, the operation is
> 
> s/proxy_ndp/force_forwarding/
> 
>> retried but the position pointer was already advanced meaning that the
>> restarted sysctl will read from an incorrect offset.
>>
>> Fix this by restoring the original position pointer before restarting
>> the syscall.
>>
>> In addition, remove the redundant position pointer restoration at the
>> end of the function.
>>
>> Fixes: f24987ef6959 ("ipv6: add `force_forwarding` sysctl to enable per-interface forwarding")
>> Signed-off-by: Fernando Fernandez Mancera <fmancera@suse.de>
>> ---
>>   net/ipv6/addrconf.c | 6 +++---
>>   1 file changed, 3 insertions(+), 3 deletions(-)
>>
>> diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
>> index cbe681de3818..8c0741e9dfcc 100644
>> --- a/net/ipv6/addrconf.c
>> +++ b/net/ipv6/addrconf.c
>> @@ -6825,8 +6825,10 @@ static int addrconf_sysctl_force_forwarding(const struct ctl_table *ctl, int wri
>>   	ret = proc_douintvec_minmax(&tmp_ctl, write, buffer, lenp, ppos);
>>   
>>   	if (write && old_val != new_val) {
>> -		if (!rtnl_net_trylock(net))
>> +		if (!rtnl_net_trylock(net)) {
>> +			*ppos = pos;
>>   			return restart_syscall();
>> +		}
> 
> Are you sure that this is needed?
> 
> AFAICT, the position pointer is only advanced if the return value is
> positive. From new_sync_write():
> 
> kiocb.ki_pos = (ppos ? *ppos : 0);
> [...]
> ret = filp->f_op->write_iter(&kiocb, &iter);
> [...]
> if (ret > 0 && ppos)
>          *ppos = kiocb.ki_pos;
> 
> And restart_syscall() returns '-ERESTARTNOINTR'.
> 

Hm, I think you are right. I was not aware of this check, thanks for
pointing it out. That means we can get rid of the position pointer reset
in the rest of the code; there are plenty of sysctls following this
pattern. I will prepare a batch for net-next.

I am sending a v3 dropping this patch.

Thank you Ido!

>>   
>>   		WRITE_ONCE(*valp, new_val);
>>   
>> @@ -6851,8 +6853,6 @@ static int addrconf_sysctl_force_forwarding(const struct ctl_table *ctl, int wri
>>   		rtnl_net_unlock(net);
>>   	}
>>   
>> -	if (ret)
>> -		*ppos = pos;
>>   	return ret;
>>   }
>>   
>> -- 
>> 2.54.0
>>


^ permalink raw reply

* [syzbot] [wireless?] KASAN: slab-use-after-free Read in ath9k_hif_request_firmware (2)
From: syzbot @ 2026-06-22 12:15 UTC (permalink / raw)
  To: linux-kernel, linux-wireless, netdev, syzkaller-bugs, toke

Hello,

syzbot found the following issue on:

HEAD commit:    1a3746ccbb0a Merge tag 'strncpy-removal-v7.2-rc1' of git:/..
git tree:       upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=153b07f2580000
kernel config:  https://syzkaller.appspot.com/x/.config?x=26c7945305cfa3b1
dashboard link: https://syzkaller.appspot.com/bug?extid=cb7ed9d85261445a0201
compiler:       gcc (Debian 14.2.0-19) 14.2.0, GNU ld (GNU Binutils for Debian) 2.44

Unfortunately, I don't have any reproducer for this issue yet.

Downloadable assets:
disk image: https://storage.googleapis.com/syzbot-assets/634e430ffbca/disk-1a3746cc.raw.xz
vmlinux: https://storage.googleapis.com/syzbot-assets/b11553afbbe2/vmlinux-1a3746cc.xz
kernel image: https://storage.googleapis.com/syzbot-assets/1fa9342aa2a9/bzImage-1a3746cc.xz

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+cb7ed9d85261445a0201@syzkaller.appspotmail.com

==================================================================
BUG: KASAN: slab-use-after-free in ath9k_hif_request_firmware+0x416/0x450 drivers/net/wireless/ath/ath9k/hif_usb.c:1219
Read of size 8 at addr ffff888053c45000 by task kworker/1:8/11284

CPU: 1 UID: 0 PID: 11284 Comm: kworker/1:8 Tainted: G             L      syzkaller #0 PREEMPT(full) 
Tainted: [L]=SOFTLOCKUP
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/09/2026
Workqueue: events request_firmware_work_func
Call Trace:
 <TASK>
 __dump_stack lib/dump_stack.c:94 [inline]
 dump_stack_lvl+0x100/0x190 lib/dump_stack.c:120
 print_address_description mm/kasan/report.c:378 [inline]
 print_report+0x13d/0x4b0 mm/kasan/report.c:482
 kasan_report+0xdf/0x1c0 mm/kasan/report.c:595
 ath9k_hif_request_firmware+0x416/0x450 drivers/net/wireless/ath/ath9k/hif_usb.c:1219
 ath9k_hif_usb_firmware_cb+0x3f9/0x530 drivers/net/wireless/ath/ath9k/hif_usb.c:1237
 request_firmware_work_func+0x13f/0x440 drivers/base/firmware_loader/main.c:1164
 process_one_work+0xa23/0x1940 kernel/workqueue.c:3322
 process_scheduled_works kernel/workqueue.c:3405 [inline]
 worker_thread+0x5ef/0xe50 kernel/workqueue.c:3486
 kthread+0x370/0x450 kernel/kthread.c:436
 ret_from_fork+0x72b/0xd50 arch/x86/kernel/process.c:158
 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245
 </TASK>

Allocated by task 11281:
 kasan_save_stack+0x30/0x50 mm/kasan/common.c:57
 kasan_save_track+0x14/0x30 mm/kasan/common.c:78
 poison_kmalloc_redzone mm/kasan/common.c:398 [inline]
 __kasan_kmalloc+0xaa/0xb0 mm/kasan/common.c:415
 _kmalloc_noprof include/linux/slab.h:969 [inline]
 _kzalloc_noprof include/linux/slab.h:1286 [inline]
 ath9k_hif_usb_probe+0x30e/0x830 drivers/net/wireless/ath/ath9k/hif_usb.c:1369
 usb_probe_interface+0x303/0x8f0 drivers/usb/core/driver.c:396
 call_driver_probe drivers/base/dd.c:628 [inline]
 really_probe+0x241/0xa60 drivers/base/dd.c:706
 __driver_probe_device+0x20e/0x450 drivers/base/dd.c:868
 driver_probe_device+0x4a/0x140 drivers/base/dd.c:898
 __device_attach_driver+0x1df/0x320 drivers/base/dd.c:1026
 bus_for_each_drv+0x159/0x1e0 drivers/base/bus.c:500
 __device_attach+0x1e4/0x4d0 drivers/base/dd.c:1098
 device_initial_probe+0xaf/0xd0 drivers/base/dd.c:1153
 bus_probe_device+0x64/0x160 drivers/base/bus.c:620
 device_add+0x121d/0x1970 drivers/base/core.c:3772
 usb_set_configuration+0xd97/0x1c60 drivers/usb/core/message.c:2268
 usb_generic_driver_probe+0xa1/0xe0 drivers/usb/core/generic.c:250
 usb_probe_device+0xef/0x400 drivers/usb/core/driver.c:291
 call_driver_probe drivers/base/dd.c:628 [inline]
 really_probe+0x241/0xa60 drivers/base/dd.c:706
 __driver_probe_device+0x20e/0x450 drivers/base/dd.c:868
 driver_probe_device+0x4a/0x140 drivers/base/dd.c:898
 __device_attach_driver+0x1df/0x320 drivers/base/dd.c:1026
 bus_for_each_drv+0x159/0x1e0 drivers/base/bus.c:500
 __device_attach+0x1e4/0x4d0 drivers/base/dd.c:1098
 device_initial_probe+0xaf/0xd0 drivers/base/dd.c:1153
 bus_probe_device+0x64/0x160 drivers/base/bus.c:620
 device_add+0x121d/0x1970 drivers/base/core.c:3772
 usb_new_device.cold+0x685/0x115c drivers/usb/core/hub.c:2695
 hub_port_connect drivers/usb/core/hub.c:5567 [inline]
 hub_port_connect_change drivers/usb/core/hub.c:5707 [inline]
 port_event drivers/usb/core/hub.c:5871 [inline]
 hub_event+0x314d/0x4af0 drivers/usb/core/hub.c:5953
 process_one_work+0xa23/0x1940 kernel/workqueue.c:3322
 process_scheduled_works kernel/workqueue.c:3405 [inline]
 worker_thread+0x5ef/0xe50 kernel/workqueue.c:3486
 kthread+0x370/0x450 kernel/kthread.c:436
 ret_from_fork+0x72b/0xd50 arch/x86/kernel/process.c:158
 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245

Freed by task 5704:
 kasan_save_stack+0x30/0x50 mm/kasan/common.c:57
 kasan_save_track+0x14/0x30 mm/kasan/common.c:78
 kasan_save_free_info+0x3b/0x70 mm/kasan/generic.c:584
 poison_slab_object mm/kasan/common.c:253 [inline]
 __kasan_slab_free+0x5f/0x80 mm/kasan/common.c:285
 kasan_slab_free include/linux/kasan.h:235 [inline]
 slab_free_hook mm/slub.c:2700 [inline]
 slab_free mm/slub.c:6310 [inline]
 kfree+0x22b/0x6c0 mm/slub.c:6625
 ath9k_hif_usb_disconnect+0x207/0x3c0 drivers/net/wireless/ath/ath9k/hif_usb.c:1439
 usb_unbind_interface+0x1dd/0x9e0 drivers/usb/core/driver.c:458
 device_remove drivers/base/dd.c:618 [inline]
 device_remove+0x12a/0x180 drivers/base/dd.c:610
 __device_release_driver drivers/base/dd.c:1349 [inline]
 device_release_driver_internal+0x44e/0x620 drivers/base/dd.c:1372
 bus_remove_device+0x2bc/0x560 drivers/base/bus.c:664
 device_del+0x376/0x9b0 drivers/base/core.c:3961
 usb_disable_device+0x367/0x810 drivers/usb/core/message.c:1478
 usb_disconnect+0x2e2/0x9a0 drivers/usb/core/hub.c:2345
 hub_port_connect drivers/usb/core/hub.c:5407 [inline]
 hub_port_connect_change drivers/usb/core/hub.c:5707 [inline]
 port_event drivers/usb/core/hub.c:5871 [inline]
 hub_event+0x1d0c/0x4af0 drivers/usb/core/hub.c:5953
 process_one_work+0xa23/0x1940 kernel/workqueue.c:3322
 process_scheduled_works kernel/workqueue.c:3405 [inline]
 worker_thread+0x5ef/0xe50 kernel/workqueue.c:3486
 kthread+0x370/0x450 kernel/kthread.c:436
 ret_from_fork+0x72b/0xd50 arch/x86/kernel/process.c:158
 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245

The buggy address belongs to the object at ffff888053c45000
 which belongs to the cache kmalloc-2k of size 2048
The buggy address is located 0 bytes inside of
 freed 2048-byte region [ffff888053c45000, ffff888053c45800)

The buggy address belongs to the physical page:
page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x53c40
head: order:3 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0
flags: 0xfff00000000040(head|node=0|zone=1|lastcpupid=0x7ff)
page_type: f5(slab)
raw: 00fff00000000040 ffff88813fe40000 dead000000000100 dead000000000122
raw: 0000000000000000 0000000800080008 00000000f5000000 0000000000000000
head: 00fff00000000040 ffff88813fe40000 dead000000000100 dead000000000122
head: 0000000000000000 0000000800080008 00000000f5000000 0000000000000000
head: 00fff00000000003 fffffffffffffe01 00000000ffffffff 00000000ffffffff
head: ffffffffffffffff 0000000000000000 00000000ffffffff 0000000000000008
page dumped because: kasan: bad access detected
page_owner tracks the page as allocated
page last allocated via order 3, migratetype Unmovable, gfp_mask 0xd20c0(__GFP_IO|__GFP_FS|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP|__GFP_NOMEMALLOC), pid 4965, tgid 4965 (klogd), ts 316220316300, free_ts 316211784975
 set_page_owner include/linux/page_owner.h:32 [inline]
 post_alloc_hook+0xfd/0x120 mm/page_alloc.c:1859
 prep_new_page mm/page_alloc.c:1867 [inline]
 get_page_from_freelist+0xf48/0x3530 mm/page_alloc.c:3946
 __alloc_frozen_pages_noprof+0x299/0x2dc0 mm/page_alloc.c:5304
 alloc_slab_page mm/slub.c:3289 [inline]
 allocate_slab mm/slub.c:3404 [inline]
 new_slab+0xa2/0x670 mm/slub.c:3447
 refill_objects+0xe3/0x430 mm/slub.c:7241
 refill_sheaf mm/slub.c:2827 [inline]
 __pcs_replace_empty_main+0x375/0x660 mm/slub.c:4692
 alloc_from_pcs mm/slub.c:4790 [inline]
 slab_alloc_node mm/slub.c:4924 [inline]
 __kmalloc_cache_noprof+0x48d/0x6e0 mm/slub.c:5446
 _kmalloc_noprof include/linux/slab.h:969 [inline]
 syslog_print+0xf8/0x620 kernel/printk/printk.c:1585
 do_syslog+0x5bd/0x6d0 kernel/printk/printk.c:1763
 __do_sys_syslog kernel/printk/printk.c:1855 [inline]
 __se_sys_syslog kernel/printk/printk.c:1853 [inline]
 __x64_sys_syslog+0x74/0xb0 kernel/printk/printk.c:1853
 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
 do_syscall_64+0x115/0x870 arch/x86/entry/syscall_64.c:94
 entry_SYSCALL_64_after_hwframe+0x77/0x7f
page last free pid 11284 tgid 11284 stack trace:
 reset_page_owner include/linux/page_owner.h:25 [inline]
 __free_pages_prepare mm/page_alloc.c:1406 [inline]
 free_pages_prepare+0x586/0xd80 mm/page_alloc.c:1451
 __free_contig_range_common+0x14f/0x250 mm/page_alloc.c:6895
 __free_contig_range mm/page_alloc.c:6940 [inline]
 free_pages_bulk+0x12a/0x200 mm/page_alloc.c:5257
 vm_area_free_pages+0xad/0x2b0 mm/vmalloc.c:3439
 vfree mm/vmalloc.c:3488 [inline]
 vfree+0x107/0x750 mm/vmalloc.c:3462
 delayed_vfree_work+0x56/0x80 mm/vmalloc.c:3392
 process_one_work+0xa23/0x1940 kernel/workqueue.c:3322
 process_scheduled_works kernel/workqueue.c:3405 [inline]
 worker_thread+0x5ef/0xe50 kernel/workqueue.c:3486
 kthread+0x370/0x450 kernel/kthread.c:436
 ret_from_fork+0x72b/0xd50 arch/x86/kernel/process.c:158
 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245

Memory state around the buggy address:
 ffff888053c44f00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
 ffff888053c44f80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>ffff888053c45000: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
                   ^
 ffff888053c45080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 ffff888053c45100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
==================================================================


---
This report is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this issue. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.

If the report is already addressed, let syzbot know by replying with:
#syz fix: exact-commit-title

If you want to overwrite report's subsystems, reply with:
#syz set subsystems: new-subsystem
(See the list of subsystem names on the web dashboard)

If the report is a duplicate of another one, reply with:
#syz dup: exact-subject-of-another-report

If you want to undo deduplication, reply with:
#syz undup

^ permalink raw reply

* [syzbot] [wireless?] divide error in mac80211_hwsim_write_tsf
From: syzbot @ 2026-06-22 12:15 UTC (permalink / raw)
  To: johannes, linux-kernel, linux-wireless, netdev, syzkaller-bugs

Hello,

syzbot found the following issue on:

HEAD commit:    83f1454877cc Merge tag 'ext4_for_linus-7.2-rc1' of git://g..
git tree:       upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=17956aae580000
kernel config:  https://syzkaller.appspot.com/x/.config?x=8deb4438448ed47a
dashboard link: https://syzkaller.appspot.com/bug?extid=21629c14aa749636db9d
compiler:       Debian clang version 22.1.6 (++20260514074242+fc4aad7b5db3-1~exp1~20260514074407.73), Debian LLD 22.1.6

Unfortunately, I don't have any reproducer for this issue yet.

Downloadable assets:
disk image (non-bootable): https://storage.googleapis.com/syzbot-assets/d900f083ada3/non_bootable_disk-83f14548.raw.xz
vmlinux: https://storage.googleapis.com/syzbot-assets/06b66919e887/vmlinux-83f14548.xz
kernel image: https://storage.googleapis.com/syzbot-assets/3dedd791b7cd/bzImage-83f14548.xz

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+21629c14aa749636db9d@syzkaller.appspotmail.com

Oops: divide error: 0000 [#1] SMP KASAN NOPTI
CPU: 0 UID: 0 PID: 5321 Comm: syz.0.0 Not tainted syzkaller #0 PREEMPT(full) 
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
RIP: 0010:mac80211_hwsim_write_tsf+0x3a3/0x590 drivers/net/wireless/virtual/mac80211_hwsim_main.c:1628
Code: 81 c4 e8 49 00 00 4c 89 e0 48 c1 e8 03 42 80 3c 30 00 74 08 4c 89 e7 e8 1b bb 22 fb 48 8b 34 24 41 03 34 24 66 b8 20 03 31 d2 <66> f7 f5 0f b7 d8 4d 8d 65 0a 49 83 c5 0d 4c 89 e0 48 c1 e8 03 42
RSP: 0018:ffffc900037aedf0 EFLAGS: 00010246
RAX: 1ffff110080a0320 RBX: 000000000000001c RCX: 0000000000100000
RDX: 0000000000000000 RSI: 0000000005e6b00c RDI: 0000000000000230
RBP: 0000000000000000 R08: 0000000000000003 R09: 0000000000000004
R10: dffffc0000000000 R11: fffff520006f5dac R12: ffff888040547c08
R13: ffff88803d7fadda R14: dffffc0000000000 R15: 0000000000000020
FS:  00007f2f6aff66c0(0000) GS:ffff88808c852000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000200000002280 CR3: 0000000013282000 CR4: 0000000000352ef0
Call Trace:
 <TASK>
 mac80211_hwsim_tx_frame_no_nl+0x16b/0x1760 drivers/net/wireless/virtual/mac80211_hwsim_main.c:1902
 mac80211_hwsim_tx+0x1784/0x2500 drivers/net/wireless/virtual/mac80211_hwsim_main.c:2261
 drv_tx net/mac80211/driver-ops.h:38 [inline]
 ieee80211_tx_frags+0x3df/0x890 net/mac80211/tx.c:1746
 __ieee80211_tx+0x267/0x580 net/mac80211/tx.c:1801
 ieee80211_tx+0x312/0x4b0 net/mac80211/tx.c:1984
 ieee80211_monitor_start_xmit+0xb33/0x1280 net/mac80211/tx.c:2479
 __netdev_start_xmit include/linux/netdevice.h:5387 [inline]
 netdev_start_xmit include/linux/netdevice.h:5396 [inline]
 xmit_one net/core/dev.c:3889 [inline]
 dev_hard_start_xmit+0x2cd/0x830 net/core/dev.c:3905
 __dev_queue_xmit+0x1435/0x37f0 net/core/dev.c:4872
 packet_snd net/packet/af_packet.c:3082 [inline]
 packet_sendmsg+0x3d95/0x5040 net/packet/af_packet.c:3114
 sock_sendmsg_nosec net/socket.c:775 [inline]
 __sock_sendmsg net/socket.c:790 [inline]
 __sys_sendto+0x626/0x6c0 net/socket.c:2252
 __do_sys_sendto net/socket.c:2259 [inline]
 __se_sys_sendto net/socket.c:2255 [inline]
 __x64_sys_sendto+0xde/0x100 net/socket.c:2255
 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
 do_syscall_64+0x174/0x580 arch/x86/entry/syscall_64.c:94
 entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7f2f6a19ce59
Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 e8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007f2f6aff5fe8 EFLAGS: 00000246 ORIG_RAX: 000000000000002c
RAX: ffffffffffffffda RBX: 00007f2f6a415fa0 RCX: 00007f2f6a19ce59
RDX: 0000000000000026 RSI: 0000200000000640 RDI: 0000000000000007
RBP: 00007f2f6a232e6f R08: 0000200000000380 R09: 0000000000000014
R10: 0000000004000000 R11: 0000000000000246 R12: 0000000000000000
R13: 00007f2f6a416038 R14: 00007f2f6a415fa0 R15: 00007ffff9cddab8
 </TASK>
Modules linked in:
---[ end trace 0000000000000000 ]---
RIP: 0010:mac80211_hwsim_write_tsf+0x3a3/0x590 drivers/net/wireless/virtual/mac80211_hwsim_main.c:1628
Code: 81 c4 e8 49 00 00 4c 89 e0 48 c1 e8 03 42 80 3c 30 00 74 08 4c 89 e7 e8 1b bb 22 fb 48 8b 34 24 41 03 34 24 66 b8 20 03 31 d2 <66> f7 f5 0f b7 d8 4d 8d 65 0a 49 83 c5 0d 4c 89 e0 48 c1 e8 03 42
RSP: 0018:ffffc900037aedf0 EFLAGS: 00010246
RAX: 1ffff110080a0320 RBX: 000000000000001c RCX: 0000000000100000
RDX: 0000000000000000 RSI: 0000000005e6b00c RDI: 0000000000000230
RBP: 0000000000000000 R08: 0000000000000003 R09: 0000000000000004
R10: dffffc0000000000 R11: fffff520006f5dac R12: ffff888040547c08
R13: ffff88803d7fadda R14: dffffc0000000000 R15: 0000000000000020
FS:  00007f2f6aff66c0(0000) GS:ffff88808c852000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000200000002280 CR3: 0000000013282000 CR4: 0000000000352ef0
----------------
Code disassembly (best guess):
   0:	81 c4 e8 49 00 00    	add    $0x49e8,%esp
   6:	4c 89 e0             	mov    %r12,%rax
   9:	48 c1 e8 03          	shr    $0x3,%rax
   d:	42 80 3c 30 00       	cmpb   $0x0,(%rax,%r14,1)
  12:	74 08                	je     0x1c
  14:	4c 89 e7             	mov    %r12,%rdi
  17:	e8 1b bb 22 fb       	call   0xfb22bb37
  1c:	48 8b 34 24          	mov    (%rsp),%rsi
  20:	41 03 34 24          	add    (%r12),%esi
  24:	66 b8 20 03          	mov    $0x320,%ax
  28:	31 d2                	xor    %edx,%edx
* 2a:	66 f7 f5             	div    %bp <-- trapping instruction
  2d:	0f b7 d8             	movzwl %ax,%ebx
  30:	4d 8d 65 0a          	lea    0xa(%r13),%r12
  34:	49 83 c5 0d          	add    $0xd,%r13
  38:	4c 89 e0             	mov    %r12,%rax
  3b:	48 c1 e8 03          	shr    $0x3,%rax
  3f:	42                   	rex.X


---
This report is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this issue. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.

If the report is already addressed, let syzbot know by replying with:
#syz fix: exact-commit-title

If you want to overwrite report's subsystems, reply with:
#syz set subsystems: new-subsystem
(See the list of subsystem names on the web dashboard)

If the report is a duplicate of another one, reply with:
#syz dup: exact-subject-of-another-report

If you want to undo deduplication, reply with:
#syz undup

^ permalink raw reply

* Re: [PATCH net v2] net/smc: fix out-of-bounds read when sk_user_data holds a sk_psock
From: Jiayuan Chen @ 2026-06-22 12:11 UTC (permalink / raw)
  To: Sechang Lim, D . Wythe, Dust Li, Sidraya Jayagond, Wenjia Zhang,
	David S . Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
  Cc: Mahanta Jambigi, Tony Lu, Wen Gu, Simon Horman, Ursula Braun,
	Karsten Graul, Guvenc Gulce, linux-rdma, linux-s390, netdev,
	linux-kernel, bpf
In-Reply-To: <20260619150342.3626224-1-rhkrqnwk98@gmail.com>


On 6/19/26 11:03 PM, Sechang Lim wrote:
> SMC stores its smc_sock in the clcsock's sk_user_data tagged
> SK_USER_DATA_NOCOPY and reads it back with smc_clcsock_user_data(), which
> only strips that flag. sockmap stores a sk_psock in the same field tagged
> SK_USER_DATA_NOCOPY | SK_USER_DATA_PSOCK. Nothing keeps both off one
> socket, and SMC then casts the sk_psock to an smc_sock.

How about SK_USER_DATA_BPF?



^ permalink raw reply

* [PATCH bpf-next v8 7/7] selftests/bpf: add bpf_icmp_send recursion test
From: Mahe Tardy @ 2026-06-22 12:05 UTC (permalink / raw)
  To: bpf
  Cc: andrii, ast, daniel, edumazet, john.fastabend, jordan, kuba,
	martin.lau, netdev, netfilter-devel, pabeni, yonghong.song,
	Mahe Tardy
In-Reply-To: <20260622120515.137082-1-mahe.tardy@gmail.com>

This test is similar to test_icmp_send_unreach_cgroup but checks that,
in case of recursion, meaning that the BPF program calling the kfunc was
re-triggered by the icmp_send done by the kfunc, the kfunc will stop
early and return -EBUSY.

The test attaches to the root cgroup to ensure the ICMP packet generated
by the kfunc re-triggers the BPF program. Since it's attached only for
this recursion test, it should not disrupt the whole network.

Signed-off-by: Mahe Tardy <mahe.tardy@gmail.com>
---
 .../bpf/prog_tests/icmp_send_kfunc.c          | 45 +++++++++++++++
 tools/testing/selftests/bpf/progs/icmp_send.c | 56 +++++++++++++++++++
 2 files changed, 101 insertions(+)

diff --git a/tools/testing/selftests/bpf/prog_tests/icmp_send_kfunc.c b/tools/testing/selftests/bpf/prog_tests/icmp_send_kfunc.c
index 66447681f72d..fd4b8fa78a01 100644
--- a/tools/testing/selftests/bpf/prog_tests/icmp_send_kfunc.c
+++ b/tools/testing/selftests/bpf/prog_tests/icmp_send_kfunc.c
@@ -1,8 +1,10 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <test_progs.h>
 #include <network_helpers.h>
+#include <cgroup_helpers.h>
 #include <linux/errqueue.h>
 #include <poll.h>
+#include <unistd.h>
 #include "icmp_send.skel.h"

 #define TIMEOUT_MS 1000
@@ -10,6 +12,7 @@
 #define ICMP_DEST_UNREACH 3
 #define ICMPV6_DEST_UNREACH 1

+#define ICMP_HOST_UNREACH 1
 #define ICMP_FRAG_NEEDED 4
 #define NR_ICMP_UNREACH 15
 #define ICMPV6_REJECT_ROUTE 6
@@ -203,3 +206,45 @@ void test_icmp_send_unreach_tc(void)
 	bpf_link__destroy(link);
 	icmp_send__destroy(skel);
 }
+
+void test_icmp_send_unreach_recursion(void)
+{
+	struct icmp_send *skel;
+	int cgroup_fd = -1;
+
+	skel = icmp_send__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "skel_open"))
+		goto cleanup;
+
+	if (setup_cgroup_environment()) {
+		fprintf(stderr, "Failed to setup cgroup environment\n");
+		goto cleanup;
+	}
+
+	cgroup_fd = get_root_cgroup();
+	if (!ASSERT_OK_FD(cgroup_fd, "get_root_cgroup"))
+		goto cleanup;
+
+	skel->data->target_pid = getpid();
+	skel->links.recursion =
+		bpf_program__attach_cgroup(skel->progs.recursion, cgroup_fd);
+	if (!ASSERT_OK_PTR(skel->links.recursion, "prog_attach_cgroup"))
+		goto cleanup;
+
+	trigger_prog_read_icmp_errqueue(skel, ICMP_HOST_UNREACH, AF_INET,
+					"127.0.0.1");
+
+	/*
+	 * Because there's recursion involved, the outer (first) call stores
+	 * its result at index 1 since it returns second, and the nested
+	 * (second) call stores its result at index 0 since it returns first.
+	 */
+	ASSERT_EQ(skel->data->rec_kfunc_rets[0], -EBUSY, "kfunc_rets[0]");
+	ASSERT_EQ(skel->data->rec_kfunc_rets[1], 0, "kfunc_rets[1]");
+
+cleanup:
+	cleanup_cgroup_environment();
+	icmp_send__destroy(skel);
+	if (cgroup_fd >= 0)
+		close(cgroup_fd);
+}
diff --git a/tools/testing/selftests/bpf/progs/icmp_send.c b/tools/testing/selftests/bpf/progs/icmp_send.c
index 5fa5467bdb70..fd9c7684797b 100644
--- a/tools/testing/selftests/bpf/progs/icmp_send.c
+++ b/tools/testing/selftests/bpf/progs/icmp_send.c
@@ -13,6 +13,10 @@ __u16 server_port = 0;
 int unreach_type = 0;
 int unreach_code = 0;
 int kfunc_ret = -1;
+int target_pid = -1;
+
+unsigned int rec_count = 0;
+int rec_kfunc_rets[] = { -1, -1 };

 SEC("cgroup_skb/egress")
 int egress(struct __sk_buff *skb)
@@ -125,4 +129,56 @@ int tc_egress(struct __sk_buff *skb)
 	return TCX_DROP;
 }

+SEC("cgroup_skb/egress")
+int recursion(struct __sk_buff *skb)
+{
+	void *data = (void *)(long)skb->data;
+	void *data_end = (void *)(long)skb->data_end;
+	struct icmphdr *icmph;
+	struct tcphdr *tcph;
+	struct iphdr *iph;
+	int ret;
+
+	if ((bpf_get_current_pid_tgid() >> 32) != target_pid)
+		return SK_PASS;
+
+	iph = data;
+	if ((void *)(iph + 1) > data_end || iph->version != 4)
+		return SK_PASS;
+
+	if (iph->daddr != bpf_htonl(SERVER_IP))
+		return SK_PASS;
+
+	if (iph->protocol == IPPROTO_TCP) {
+		tcph = (void *)iph + iph->ihl * 4;
+		if ((void *)(tcph + 1) > data_end ||
+		    tcph->dest != bpf_htons(server_port))
+			return SK_PASS;
+	} else if (iph->protocol == IPPROTO_ICMP) {
+		icmph = (void *)iph + iph->ihl * 4;
+		if ((void *)(icmph + 1) > data_end ||
+		    icmph->type != unreach_type ||
+		    icmph->code != unreach_code)
+			return SK_PASS;
+	} else {
+		return SK_PASS;
+	}
+
+	/*
+	 * This call will provoke a recursion: the ICMP packet generated by the
+	 * kfunc will re-trigger this program since we are in the root cgroup in
+	 * which the kernel ICMP socket belongs. However when re-entering the
+	 * kfunc, it should return EBUSY.
+	 */
+	ret = bpf_icmp_send(skb, unreach_type, unreach_code);
+	rec_kfunc_rets[rec_count & 1] = ret;
+	__sync_fetch_and_add(&rec_count, 1);
+
+	/* Let the first ICMP error message pass */
+	if (iph->protocol == IPPROTO_ICMP)
+		return SK_PASS;
+
+	return SK_DROP;
+}
+
 char LICENSE[] SEC("license") = "Dual BSD/GPL";
--
2.34.1


^ permalink raw reply related

* [PATCH bpf-next v8 6/7] selftests/bpf: add bpf_icmp_send kfunc tc tests
From: Mahe Tardy @ 2026-06-22 12:05 UTC (permalink / raw)
  To: bpf
  Cc: andrii, ast, daniel, edumazet, john.fastabend, jordan, kuba,
	martin.lau, netdev, netfilter-devel, pabeni, yonghong.song,
	Mahe Tardy
In-Reply-To: <20260622120515.137082-1-mahe.tardy@gmail.com>

This test is similar to the one with cgroup_skb programs but uses tc
egress instead.

Signed-off-by: Mahe Tardy <mahe.tardy@gmail.com>
---
 .../bpf/prog_tests/icmp_send_kfunc.c          | 25 ++++++++
 tools/testing/selftests/bpf/progs/icmp_send.c | 60 +++++++++++++++++++
 2 files changed, 85 insertions(+)

diff --git a/tools/testing/selftests/bpf/prog_tests/icmp_send_kfunc.c b/tools/testing/selftests/bpf/prog_tests/icmp_send_kfunc.c
index a5ac1a6ea77a..66447681f72d 100644
--- a/tools/testing/selftests/bpf/prog_tests/icmp_send_kfunc.c
+++ b/tools/testing/selftests/bpf/prog_tests/icmp_send_kfunc.c
@@ -178,3 +178,28 @@ void test_icmp_send_unreach_cgroup(void)
 	if (cgroup_fd >= 0)
 		close(cgroup_fd);
 }
+
+void test_icmp_send_unreach_tc(void)
+{
+	LIBBPF_OPTS(bpf_tcx_opts, opts);
+	struct icmp_send *skel;
+	struct bpf_link *link = NULL;
+
+	skel = icmp_send__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "skel_open"))
+		goto cleanup;
+
+	link = bpf_program__attach_tcx(skel->progs.tc_egress, 1, &opts);
+	if (!ASSERT_OK_PTR(link, "prog_attach"))
+		goto cleanup;
+
+	if (test__start_subtest("ipv4"))
+		run_icmp_test(skel, AF_INET, "127.0.0.1", NR_ICMP_UNREACH);
+
+	if (test__start_subtest("ipv6"))
+		run_icmp_test(skel, AF_INET6, "::1", ICMPV6_REJECT_ROUTE);
+
+cleanup:
+	bpf_link__destroy(link);
+	icmp_send__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/icmp_send.c b/tools/testing/selftests/bpf/progs/icmp_send.c
index 6e1ba539eeb0..5fa5467bdb70 100644
--- a/tools/testing/selftests/bpf/progs/icmp_send.c
+++ b/tools/testing/selftests/bpf/progs/icmp_send.c
@@ -2,6 +2,7 @@
 #include "vmlinux.h"
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_endian.h>
+#include "bpf_tracing_net.h"

 /* 127.0.0.1 in host byte order */
 #define SERVER_IP 0x7F000001
@@ -65,4 +66,63 @@ int egress(struct __sk_buff *skb)
 	return SK_DROP;
 }

+SEC("tc/egress")
+int tc_egress(struct __sk_buff *skb)
+{
+	void *data = (void *)(long)skb->data;
+	void *data_end = (void *)(long)skb->data_end;
+	struct ethhdr *eth;
+	struct iphdr *iph;
+	struct ipv6hdr *ip6h;
+	struct tcphdr *tcph;
+
+	eth = data;
+	if ((void *)(eth + 1) > data_end)
+		return TCX_PASS;
+
+	if (eth->h_proto == bpf_htons(ETH_P_IP)) {
+		iph = (void *)(eth + 1);
+		if ((void *)(iph + 1) > data_end)
+			return TCX_PASS;
+
+		if (iph->protocol != IPPROTO_TCP ||
+		    iph->daddr != bpf_htonl(SERVER_IP))
+			return TCX_PASS;
+
+		tcph = (void *)iph + iph->ihl * 4;
+		if ((void *)(tcph + 1) > data_end)
+			return TCX_PASS;
+
+		if (tcph->dest != bpf_htons(server_port))
+			return TCX_PASS;
+
+	} else if (eth->h_proto == bpf_htons(ETH_P_IPV6)) {
+		ip6h = (void *)(eth + 1);
+		if ((void *)(ip6h + 1) > data_end)
+			return TCX_PASS;
+
+		if (ip6h->nexthdr != IPPROTO_TCP)
+			return TCX_PASS;
+
+		if (ip6h->daddr.in6_u.u6_addr32[0] != 0 ||
+		    ip6h->daddr.in6_u.u6_addr32[1] != 0 ||
+		    ip6h->daddr.in6_u.u6_addr32[2] != 0 ||
+		    ip6h->daddr.in6_u.u6_addr32[3] != bpf_htonl(SERVER_IP6_LO))
+			return TCX_PASS;
+
+		tcph = (void *)(ip6h + 1);
+		if ((void *)(tcph + 1) > data_end)
+			return TCX_PASS;
+
+		if (tcph->dest != bpf_htons(server_port))
+			return TCX_PASS;
+	} else {
+		return TCX_PASS;
+	}
+
+	kfunc_ret = bpf_icmp_send(skb, unreach_type, unreach_code);
+
+	return TCX_DROP;
+}
+
 char LICENSE[] SEC("license") = "Dual BSD/GPL";
--
2.34.1


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox